feat(job_scout): expand to 19 companies + new adapters, filters, dedup
- New SmartRecruiters adapter (EU energy/commodity firms)
- Add data-infra US tech (Confluent/GitLab/ClickHouse/Grafana) and commodity/energy traders (MET Group/Vitol/Louis Dreyfus)
- Headless stealth (navigator.webdriver mask + chrome fingerprint) — unblocks Google; also enables Meta and Cisco scraping
- Tight title prefilter + title-only scoring + cross-region dedup so high-volume all-remote boards don't flood the report
- Remove Canonical (below-market pay, poor culture) and IBM Research (research-scale pay below bar; weak data-eng fit) per reputation review

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -82,7 +82,8 @@
|
||||
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=google)",
|
||||
"Bash(job_scout/.venv/Scripts/python.exe -c ' *)",
|
||||
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=meta)",
|
||||
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)"
|
||||
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)",
|
||||
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=confluent)"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -136,6 +136,7 @@ _Update this section when starting/finishing a JD._
|
||||
| Infineon AI Engineer | Critique DONE Pass 2 (78.5/100) | Submit or Tier 2 polish |
|
||||
| Apple Data Engineer (ISE, Zurich) | Critique DONE Pass 1 (78.5/100) | /edit-resume for Tier 1 fixes or submit |
|
||||
| Kraken AI Infrastructure | Critique DONE Pass 2 (84.5/100) — converged near max | Submit, or apply Tier 2 polish (agent orchestration / guardrails in skills) |
|
||||
| Google FDE GenAI (Zurich) | PAUSED — GenAI evidence gap too large; redirecting to data-eng/MLOps roles | Likely abandon |
|
||||
|
||||
---
|
||||
|
||||
|
||||
+80
-18
@@ -37,7 +37,7 @@ CH_LOCATION_KEYWORDS = [
|
||||
"lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
|
||||
]
|
||||
|
||||
REMOTE_KEYWORDS = ["remote"]
|
||||
REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]
|
||||
|
||||
US_ONLY_PATTERNS = [
|
||||
"remote - us", "remote, us", "remote-us", "us remote", "us-remote",
|
||||
@@ -49,7 +49,7 @@ EU_HINT_KEYWORDS = [
|
||||
"germany", "france", "spain", "portugal", "ireland", "netherlands",
|
||||
"sweden", "norway", "finland", "denmark", "poland", "czech",
|
||||
"romania", "italy", "austria", "belgium", "uk", "united kingdom",
|
||||
"europe", "emea", "global",
|
||||
"europe", "emea", "global", "worldwide",
|
||||
] + CH_LOCATION_KEYWORDS
|
||||
|
||||
POSITIVE_KEYWORDS = {
|
||||
@@ -77,6 +77,19 @@ NEGATIVE_KEYWORDS = {
|
||||
"intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
|
||||
}
|
||||
|
||||
# Allow-list of role phrases used to prefilter job titles on high-volume boards
# (all-remote tech orgs, plus commodity traders whose postings are mostly non-tech).
# A posting is kept only when its title contains at least one of these phrases;
# the check is a case-insensitive substring match on the title alone. The phrases
# are intentionally narrow so that titles such as "Sales Engineer",
# "Staff Accountant" or "Data Privacy Counsel" do not slip through.
ENG_TITLE_FILTER = [
    # Data engineering / data platform
    "data engineer",
    "data engineering",
    "data platform",
    "platform engineer",
    "data infrastructure",
    "data architect",
    "analytics engineer",
    # Machine learning
    "mlops",
    "ml engineer",
    "ml platform",
    "machine learning engineer",
    # Reliability / backend
    "site reliability",
    "sre",
    "backend engineer",
    "back-end engineer",
    # Infrastructure / general software
    "devops engineer",
    "cloud engineer",
    "software engineer",
    "infrastructure engineer",
    # Streaming / big data / quant
    "kafka",
    "streaming",
    "big data",
    "quantitative developer",
    "quant developer",
]
|
||||
|
||||
# id, display, adapter, adapter_args
|
||||
COMPANIES = [
|
||||
("nvidia", "NVIDIA", "workday", {
|
||||
@@ -103,6 +116,15 @@ COMPANIES = [
|
||||
("sygnum", "Sygnum", "wp_ajax", {
|
||||
"url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
|
||||
}),
|
||||
# --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
|
||||
("confluent", "Confluent", "ashby", {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
|
||||
("gitlab", "GitLab", "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
|
||||
("clickhouse","ClickHouse","greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}),
|
||||
("grafana", "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
|
||||
# --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
|
||||
("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
|
||||
("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
|
||||
("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
|
||||
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
||||
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
||||
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
||||
@@ -174,19 +196,6 @@ COMPANIES = [
|
||||
"scroll_count": 5,
|
||||
"use_inner_text_as_blob": True,
|
||||
}),
|
||||
("ibm", "IBM Research", "playwright", {
|
||||
# IBM Research Zurich careers page is mostly a static intro with few openings.
|
||||
# Use IBM's main careers search filtered to Switzerland instead.
|
||||
"url": "https://www.ibm.com/careers/search?q=&field_keyword_05[0]=Switzerland",
|
||||
"wait_for": "a[href*='/careers/'], a[href*='ibm.com/employment']",
|
||||
"card": "li:has(a[href*='/careers/']), a[href*='/careers/']:has(h3)",
|
||||
"title_sel": "h3, h4",
|
||||
"link_sel": "a[href*='/careers/']",
|
||||
"link_attr": "href",
|
||||
"url_prefix": "https://www.ibm.com",
|
||||
"default_location": "Switzerland",
|
||||
"scroll_count": 4,
|
||||
}),
|
||||
]
|
||||
|
||||
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
|
||||
@@ -318,6 +327,39 @@ def fetch_pcsx(args):
|
||||
return jobs
|
||||
|
||||
|
||||
def fetch_smartrecruiters(args):
    """Fetch public postings for one company from the SmartRecruiters postings API.

    Pages through ``/v1/companies/{company}/postings`` and maps every posting onto
    a job dict with the keys ``id``, ``title``, ``location``, ``url``, ``posted``
    and ``description``.

    Args:
        args: Adapter arguments. ``company`` (required) is the SmartRecruiters
            company identifier used in the API and job URLs. ``page_size``
            (optional, default 100) is the ``limit`` sent with each request and
            ``max_results`` (optional, default 300) stops paging once at least
            that many postings have been collected. Other keys are ignored.

    Returns:
        A list of job dicts. ``location`` is the posting's full location (or
        city) followed by "Remote"/"Hybrid" when the posting sets those flags;
        ``description`` is the department and function labels joined by a space.
        A posting without an id gets ``id == ""`` rather than the string "None".

    Raises:
        KeyError: if ``args`` has no ``company`` entry.
    """
    company = args["company"]
    page_size = args.get("page_size", 100)
    max_results = args.get("max_results", 300)
    api_base = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"
    public_base = f"https://jobs.smartrecruiters.com/{company}"

    def label_of(value):
        # department/function are read as {"label": ...} objects; any other
        # shape (missing, null, plain string) contributes nothing.
        return (value.get("label") or "") if isinstance(value, dict) else ""

    jobs, offset = [], 0
    while True:
        data = http_get_json(f"{api_base}?limit={page_size}&offset={offset}")
        content = data.get("content") or []
        for p in content:
            loc = p.get("location") or {}
            parts = [loc.get("fullLocation") or loc.get("city") or ""]
            if loc.get("remote"):
                parts.append("Remote")
            if loc.get("hybrid"):
                parts.append("Hybrid")

            # str(None) would yield "None": a truthy id that produces a ".../None"
            # link and blocks any `id or url` fallback in the caller. Use an empty
            # id and the URL without a posting segment instead.
            raw_id = p.get("id")
            job_id = "" if raw_id is None else str(raw_id)

            jobs.append({
                "id": job_id,
                "title": p.get("name") or "",
                "location": " ".join(x for x in parts if x),
                "url": f"{public_base}/{job_id}" if job_id else public_base,
                "posted": p.get("releasedDate", ""),
                "description": " ".join(
                    filter(None, [label_of(p.get("department")), label_of(p.get("function"))])
                ),
            })

        offset += len(content)
        if not content or offset >= max_results:
            break
        total = data.get("totalFound")
        if total is not None:
            if offset >= total:
                break
        elif len(content) < page_size:
            # No total reported: a short page is the only end-of-results signal.
            # (Defaulting the total to 0 would stop after the first page.)
            break
    return jobs
|
||||
|
||||
|
||||
def fetch_wp_ajax(args):
|
||||
"""WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
|
||||
url = args["url"]
|
||||
@@ -502,6 +544,7 @@ ADAPTERS = {
|
||||
"greenhouse": fetch_greenhouse,
|
||||
"pcsx": fetch_pcsx,
|
||||
"wp_ajax": fetch_wp_ajax,
|
||||
"smartrecruiters": fetch_smartrecruiters,
|
||||
"playwright": fetch_playwright,
|
||||
}
|
||||
|
||||
@@ -520,8 +563,14 @@ def location_matches(loc_text):
|
||||
return in_ch, is_remote
|
||||
|
||||
|
||||
def score_job(job):
|
||||
blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
|
||||
def score_job(job, title_only=False):
|
||||
# Title-filtered high-volume boards score on title only — the title filter already
|
||||
# gated relevance, and scoring the full JD body over-inflates (every "python"/"data"
|
||||
# mention adds points), flooding the medium bucket.
|
||||
if title_only:
|
||||
blob = (job.get("title") or "").lower()
|
||||
else:
|
||||
blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
|
||||
score, pos, neg = 0, [], []
|
||||
for kw, w in POSITIVE_KEYWORDS.items():
|
||||
if kw in blob:
|
||||
@@ -626,14 +675,27 @@ def main():
|
||||
errors.append((display, f"unexpected: {e!r}"))
|
||||
continue
|
||||
|
||||
# Optional per-company title prefilter for high-volume boards
|
||||
title_filter = args.get("_title_filter")
|
||||
if title_filter:
|
||||
jobs = [j for j in jobs
|
||||
if any(k in (j.get("title") or "").lower() for k in title_filter)]
|
||||
|
||||
company_seen = seen.setdefault(cid, {})
|
||||
title_seen = set()
|
||||
for j in jobs:
|
||||
jid = str(j.get("id") or j.get("url"))
|
||||
in_ch, is_remote = location_matches(j.get("location", ""))
|
||||
if not (in_ch or is_remote):
|
||||
continue
|
||||
# Collapse the same role posted once per remote country (title differs only
|
||||
# by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
|
||||
norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
|
||||
if norm_title in title_seen:
|
||||
continue
|
||||
title_seen.add(norm_title)
|
||||
is_new = jid not in company_seen
|
||||
score, pos, neg = score_job(j)
|
||||
score, pos, neg = score_job(j, title_only=bool(title_filter))
|
||||
all_results.append({
|
||||
"company": display, "company_id": cid,
|
||||
"title": j["title"], "location": j["location"],
|
||||
|
||||
Reference in New Issue
Block a user