From 3b07c4b900b8b9a833d3b138f75ffc2e71c758fd Mon Sep 17 00:00:00 2001
From: Dennis Thiessen <dennis@thiessen.io>
Date: Fri, 22 May 2026 12:38:35 +0200
Subject: [PATCH] feat(job_scout): expand to 19 companies + new adapters,
 filters, dedup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- New SmartRecruiters adapter (EU energy/commodity firms)
- Add data-infra US tech (Confluent/GitLab/ClickHouse/Grafana) and
  commodity/energy traders (MET Group/Vitol/Louis Dreyfus)
- Headless stealth (navigator.webdriver mask + chrome fingerprint) — unblocks
  Google; also enabled Meta and Cisco scraping
- Tight title prefilter + title-only scoring + cross-region dedup so
  high-volume all-remote boards don't flood the report
- Remove Canonical (below-market pay, poor culture) and IBM Research
  (research-scale pay below bar; weak data-eng fit) per reputation review

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .claude/settings.local.json |  3 +-
 CLAUDE.md                   |  1 +
 job_scout/scout.py          | 98 ++++++++++++++++++++++++++++++-------
 3 files changed, 83 insertions(+), 19 deletions(-)

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 2df8e8a..171c44d 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -82,7 +82,8 @@
       "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=google)",
       "Bash(job_scout/.venv/Scripts/python.exe -c ' *)",
       "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=meta)",
-      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)"
+      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)",
+      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=confluent)"
     ]
   }
 }
diff --git a/CLAUDE.md b/CLAUDE.md
index cd0c6b5..f491e67 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -136,6 +136,7 @@ _Update this section when starting/finishing a JD._
 | Infineon AI Engineer | Critique DONE Pass 2 (78.5/100) | Submit or Tier 2 polish |
 | Apple Data Engineer (ISE, Zurich) | Critique DONE Pass 1 (78.5/100) | /edit-resume for Tier 1 fixes or submit |
 | Kraken AI Infrastructure | Critique DONE Pass 2 (84.5/100) — converged near max | Submit, or apply Tier 2 polish (agent orchestration / guardrails in skills) |
+| Google FDE GenAI (Zurich) | PAUSED — GenAI evidence gap too large; redirecting to data-eng/MLOps roles | Likely abandon |
 
 ---
 
diff --git a/job_scout/scout.py b/job_scout/scout.py
index 8a953ac..9d0fd21 100644
--- a/job_scout/scout.py
+++ b/job_scout/scout.py
@@ -37,7 +37,7 @@ CH_LOCATION_KEYWORDS = [
     "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
 ]
 
-REMOTE_KEYWORDS = ["remote"]
+REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]
 
 US_ONLY_PATTERNS = [
     "remote - us", "remote, us", "remote-us", "us remote", "us-remote",
@@ -49,7 +49,7 @@ EU_HINT_KEYWORDS = [
     "germany", "france", "spain", "portugal", "ireland", "netherlands",
     "sweden", "norway", "finland", "denmark", "poland", "czech",
     "romania", "italy", "austria", "belgium", "uk", "united kingdom",
-    "europe", "emea", "global",
+    "europe", "emea", "global", "worldwide",
 ] + CH_LOCATION_KEYWORDS
 
 POSITIVE_KEYWORDS = {
@@ -77,6 +77,19 @@ NEGATIVE_KEYWORDS = {
     "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
 }
 
+# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
+# post mostly non-tech roles): keep only titles containing one of these role phrases, so
+# "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel" don't leak in. Matched as
+# case-insensitive substrings of the title only, so short entries ("sre") match mid-word.
+ENG_TITLE_FILTER = [
+    "data engineer", "data engineering", "data platform", "platform engineer",
+    "data infrastructure", "data architect", "analytics engineer",
+    "mlops", "ml engineer", "ml platform", "machine learning engineer",
+    "site reliability", "sre", "backend engineer", "back-end engineer",
+    "devops engineer", "cloud engineer", "software engineer", "infrastructure engineer",
+    "kafka", "streaming", "big data", "quantitative developer", "quant developer",
+]
+
 # id, display, adapter, adapter_args
 COMPANIES = [
     ("nvidia",    "NVIDIA",    "workday",    {
@@ -103,6 +116,15 @@ COMPANIES = [
     ("sygnum",    "Sygnum",    "wp_ajax", {
         "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
     }),
+    # --- Data-infra US tech (the candidate's exact stack; mostly all-remote — title-filtered to eng/data) ---
+    ("confluent", "Confluent", "ashby",      {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
+    ("gitlab",    "GitLab",    "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
+    ("clickhouse","ClickHouse","greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}),
+    ("grafana",   "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
+    # --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
+    ("metgroup",  "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
+    ("vitol",     "Vitol",     "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
+    ("ldc",       "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
     # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
     # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
     # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
@@ -174,19 +196,6 @@ COMPANIES = [
         "scroll_count": 5,
         "use_inner_text_as_blob": True,
     }),
-    ("ibm",       "IBM Research", "playwright", {
-        # IBM Research Zurich careers page is mostly a static intro with few openings.
-        # Use IBM's main careers search filtered to Switzerland instead.
-        "url": "https://www.ibm.com/careers/search?q=&field_keyword_05[0]=Switzerland",
-        "wait_for": "a[href*='/careers/'], a[href*='ibm.com/employment']",
-        "card": "li:has(a[href*='/careers/']), a[href*='/careers/']:has(h3)",
-        "title_sel": "h3, h4",
-        "link_sel": "a[href*='/careers/']",
-        "link_attr": "href",
-        "url_prefix": "https://www.ibm.com",
-        "default_location": "Switzerland",
-        "scroll_count": 4,
-    }),
 ]
 
 # Companies where adapter probing did not yield a reliable scrape. Reasons noted.
@@ -318,6 +327,39 @@ def fetch_pcsx(args):
     return jobs
 
 
+def fetch_smartrecruiters(args):
+    """Page through the SmartRecruiters postings endpoint for args["company"]; return job dicts."""
+    slug = args["company"]
+    endpoint = f"https://api.smartrecruiters.com/v1/companies/{slug}/postings"
+    collected, cursor = [], 0
+    while True:
+        page = http_get_json(f"{endpoint}?limit=100&offset={cursor}")
+        postings = page.get("content", []) or []
+        for posting in postings:
+            place = posting.get("location") or {}
+            # Location text: fullLocation (else city), then Remote/Hybrid when flagged.
+            bits = [place.get("fullLocation") or place.get("city") or ""]
+            flags = (("remote", "Remote"), ("hybrid", "Hybrid"))
+            bits.extend(tag for key, tag in flags if place.get(key))
+            # Only dict-shaped department/function values contribute a label.
+            groups = (posting.get("department"), posting.get("function"))
+            labels = [g.get("label", "") if isinstance(g, dict) else "" for g in groups]
+            collected.append({
+                "id": str(posting.get("id")),
+                "title": posting.get("name", ""),
+                "location": " ".join(b for b in bits if b),
+                "url": f"https://jobs.smartrecruiters.com/{slug}/{posting.get('id')}",
+                "posted": posting.get("releasedDate", ""),
+                "description": " ".join(filter(None, labels)),
+            })
+        total = page.get("totalFound", 0)
+        cursor += len(postings)
+        # Stop on an empty page, once totalFound is reached, or when the offset hits 300.
+        if not postings or cursor >= total or cursor >= 300:
+            break
+    return collected
+
+
 def fetch_wp_ajax(args):
     """WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
     url = args["url"]
@@ -502,6 +544,7 @@ ADAPTERS = {
     "greenhouse": fetch_greenhouse,
     "pcsx": fetch_pcsx,
     "wp_ajax": fetch_wp_ajax,
+    "smartrecruiters": fetch_smartrecruiters,
     "playwright": fetch_playwright,
 }
 
@@ -520,8 +563,14 @@ def location_matches(loc_text):
     return in_ch, is_remote
 
 
-def score_job(job):
-    blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
+def score_job(job, title_only=False):
+    # Title-filtered high-volume boards score on title only — the title filter already
+    # gated relevance, and scoring the full JD body over-inflates (every "python"/"data"
+    # mention adds points), flooding the medium bucket.
+    if title_only:
+        blob = (job.get("title") or "").lower()
+    else:
+        blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
     score, pos, neg = 0, [], []
     for kw, w in POSITIVE_KEYWORDS.items():
         if kw in blob:
@@ -626,14 +675,27 @@ def main():
             errors.append((display, f"unexpected: {e!r}"))
             continue
 
+        # Optional per-company title prefilter for high-volume boards
+        title_filter = args.get("_title_filter")
+        if title_filter:
+            jobs = [j for j in jobs
+                    if any(k in (j.get("title") or "").lower() for k in title_filter)]
+
         company_seen = seen.setdefault(cid, {})
+        title_seen = set()
         for j in jobs:
             jid = str(j.get("id") or j.get("url"))
             in_ch, is_remote = location_matches(j.get("location", ""))
             if not (in_ch or is_remote):
                 continue
+            # Collapse the same role posted once per remote country (title differs only
+            # by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
+            norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
+            if norm_title in title_seen:
+                continue
+            title_seen.add(norm_title)
             is_new = jid not in company_seen
-            score, pos, neg = score_job(j)
+            score, pos, neg = score_job(j, title_only=bool(title_filter))
             all_results.append({
                 "company": display, "company_id": cid,
                 "title": j["title"], "location": j["location"],