From 3b07c4b900b8b9a833d3b138f75ffc2e71c758fd Mon Sep 17 00:00:00 2001 From: Dennis Thiessen Date: Fri, 22 May 2026 12:38:35 +0200 Subject: [PATCH] feat(job_scout): expand to 19 companies + new adapters, filters, dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New SmartRecruiters adapter (EU energy/commodity firms) - Add data-infra US tech (Confluent/GitLab/ClickHouse/Grafana) and commodity/energy traders (MET Group/Vitol/Louis Dreyfus) - Headless stealth (navigator.webdriver mask + chrome fingerprint) — unblocks Google; also enabled Meta and Cisco scraping - Tight title prefilter + title-only scoring + cross-region dedup so high-volume all-remote boards don't flood the report - Remove Canonical (below-market pay, poor culture) and IBM Research (research-scale pay below bar; weak data-eng fit) per reputation review Co-Authored-By: Claude Opus 4.7 --- .claude/settings.local.json | 3 +- CLAUDE.md | 1 + job_scout/scout.py | 98 ++++++++++++++++++++++++++++++------- 3 files changed, 83 insertions(+), 19 deletions(-) diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 2df8e8a..171c44d 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -82,7 +82,8 @@ "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=google)", "Bash(job_scout/.venv/Scripts/python.exe -c ' *)", "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=meta)", - "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)" + "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)", + "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=confluent)" ] } } diff --git a/CLAUDE.md b/CLAUDE.md index cd0c6b5..f491e67 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -136,6 +136,7 @@ _Update this section when starting/finishing a JD._ | Infineon AI Engineer | Critique DONE Pass 2 (78.5/100) | Submit or Tier 2 polish | | Apple Data Engineer (ISE, Zurich) | Critique DONE Pass 1 (78.5/100) | /edit-resume for Tier 1 fixes or submit | | Kraken AI Infrastructure | Critique DONE Pass 2 (84.5/100) — converged near max | Submit, or apply Tier 2 polish (agent orchestration / guardrails in skills) | +| Google FDE GenAI (Zurich) | PAUSED — GenAI evidence gap too large; redirecting to data-eng/MLOps roles | Likely abandon | --- diff --git a/job_scout/scout.py b/job_scout/scout.py index 8a953ac..9d0fd21 100644 --- a/job_scout/scout.py +++ b/job_scout/scout.py @@ -37,7 +37,7 @@ CH_LOCATION_KEYWORDS = [ "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse", ] -REMOTE_KEYWORDS = ["remote"] +REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"] US_ONLY_PATTERNS = [ "remote - us", "remote, us", "remote-us", "us remote", "us-remote", @@ -49,7 +49,7 @@ EU_HINT_KEYWORDS = [ "germany", "france", "spain", "portugal", "ireland", "netherlands", "sweden", "norway", "finland", "denmark", "poland", "czech", "romania", "italy", "austria", "belgium", "uk", "united kingdom", - "europe", "emea", "global", + "europe", "emea", "global", "worldwide", ] + CH_LOCATION_KEYWORDS POSITIVE_KEYWORDS = { @@ -77,6 +77,19 @@ NEGATIVE_KEYWORDS = { "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3, } +# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that +# post mostly non-tech roles). Only keep titles containing one of these specific role +# phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel" +# don't leak in. Matched as case-insensitive substrings against the title only. +ENG_TITLE_FILTER = [ + "data engineer", "data engineering", "data platform", "platform engineer", + "data infrastructure", "data architect", "analytics engineer", + "mlops", "ml engineer", "ml platform", "machine learning engineer", + "site reliability", "sre", "backend engineer", "back-end engineer", + "devops engineer", "cloud engineer", "software engineer", "infrastructure engineer", + "kafka", "streaming", "big data", "quantitative developer", "quant developer", +] + # id, display, adapter, adapter_args COMPANIES = [ ("nvidia", "NVIDIA", "workday", { @@ -103,6 +116,15 @@ COMPANIES = [ ("sygnum", "Sygnum", "wp_ajax", { "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c", }), + # --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) --- + ("confluent", "Confluent", "ashby", {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}), + ("gitlab", "GitLab", "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}), + ("clickhouse","ClickHouse","greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}), + ("grafana", "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}), + # --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) --- + ("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}), + ("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}), + ("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}), # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites. # Google actively bot-detects; the STEALTH_JS init script (applied to every context) # is what makes its job list render. Cards are
  • with a "Learn more about " @@ -174,19 +196,6 @@ COMPANIES = [ "scroll_count": 5, "use_inner_text_as_blob": True, }), - ("ibm", "IBM Research", "playwright", { - # IBM Research Zurich careers page is mostly a static intro with few openings. - # Use IBM's main careers search filtered to Switzerland instead. - "url": "https://www.ibm.com/careers/search?q=&field_keyword_05[0]=Switzerland", - "wait_for": "a[href*='/careers/'], a[href*='ibm.com/employment']", - "card": "li:has(a[href*='/careers/']), a[href*='/careers/']:has(h3)", - "title_sel": "h3, h4", - "link_sel": "a[href*='/careers/']", - "link_attr": "href", - "url_prefix": "https://www.ibm.com", - "default_location": "Switzerland", - "scroll_count": 4, - }), ] # Companies where adapter probing did not yield a reliable scrape. Reasons noted. @@ -318,6 +327,39 @@ def fetch_pcsx(args): return jobs +def fetch_smartrecruiters(args): + """SmartRecruiters public postings API. Used by many EU energy/commodity firms.""" + company = args["company"] + base = f"https://api.smartrecruiters.com/v1/companies/{company}/postings" + jobs, offset = [], 0 + while True: + data = http_get_json(f"{base}?limit=100&offset={offset}") + content = data.get("content", []) or [] + for p in content: + loc = p.get("location") or {} + parts = [loc.get("fullLocation") or loc.get("city") or ""] + if loc.get("remote"): + parts.append("Remote") + if loc.get("hybrid"): + parts.append("Hybrid") + loc_str = " ".join(x for x in parts if x) + dept = (p.get("department") or {}).get("label", "") if isinstance(p.get("department"), dict) else "" + func = (p.get("function") or {}).get("label", "") if isinstance(p.get("function"), dict) else "" + jobs.append({ + "id": str(p.get("id")), + "title": p.get("name", ""), + "location": loc_str, + "url": f"https://jobs.smartrecruiters.com/{company}/{p.get('id')}", + "posted": p.get("releasedDate", ""), + "description": " ".join(filter(None, [dept, func])), + }) + total = data.get("totalFound", 0) + offset += len(content) + if not content or offset >= total or offset >= 300: + break + return jobs + + def fetch_wp_ajax(args): """WordPress admin-ajax style endpoint. Sygnum uses this pattern.""" url = args["url"] @@ -502,6 +544,7 @@ ADAPTERS = { "greenhouse": fetch_greenhouse, "pcsx": fetch_pcsx, "wp_ajax": fetch_wp_ajax, + "smartrecruiters": fetch_smartrecruiters, "playwright": fetch_playwright, } @@ -520,8 +563,14 @@ def location_matches(loc_text): return in_ch, is_remote -def score_job(job): - blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower() +def score_job(job, title_only=False): + # Title-filtered high-volume boards score on title only — the title filter already + # gated relevance, and scoring the full JD body over-inflates (every "python"/"data" + # mention adds points), flooding the medium bucket. + if title_only: + blob = (job.get("title") or "").lower() + else: + blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower() score, pos, neg = 0, [], [] for kw, w in POSITIVE_KEYWORDS.items(): if kw in blob: @@ -626,14 +675,27 @@ def main(): errors.append((display, f"unexpected: {e!r}")) continue + # Optional per-company title prefilter for high-volume boards + title_filter = args.get("_title_filter") + if title_filter: + jobs = [j for j in jobs + if any(k in (j.get("title") or "").lower() for k in title_filter)] + company_seen = seen.setdefault(cid, {}) + title_seen = set() for j in jobs: jid = str(j.get("id") or j.get("url")) in_ch, is_remote = location_matches(j.get("location", "")) if not (in_ch or is_remote): continue + # Collapse the same role posted once per remote country (title differs only + # by a "| Country | Remote" suffix) — dedupe on the title before the first "|". + norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower() + if norm_title in title_seen: + continue + title_seen.add(norm_title) is_new = jid not in company_seen - score, pos, neg = score_job(j) + score, pos, neg = score_job(j, title_only=bool(title_filter)) all_results.append({ "company": display, "company_id": cid, "title": j["title"], "location": j["location"],