feat: add job_scout — automated CH job-board scraper for target companies

Pulls fresh postings from the quarterly target-company list, filters by Swiss location / EU-remote eligibility, scores fit against profile keywords, tracks seen jobs, and writes a markdown report.

Adapters (13 companies automated):
- Workday: NVIDIA, Novartis
- Ashby: Kraken, OpenAI
- Greenhouse: Anthropic
- PCSX (Eightfold): Microsoft
- WordPress AJAX: Sygnum
- Playwright (headless + stealth): Google, Apple, Meta, Roche, Cisco, IBM

4 companies (Sonova, Coinbase, AMINA, Bitcoin Suisse) resist automation (server-side bot blocks / JS widgets) and surface as a manual-check checklist.

venv, reports/, and state/ are gitignored; only scout.py + requirements.txt are tracked.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-22 10:12:27 +02:00
parent 1fde4c6b34
commit eeec77b1e3
4 changed files with 741 additions and 1 deletions
@@ -38,7 +38,51 @@
      "Bash(python resume_builder/helpers/char_count.py -f resume output/Kraken_AI_Infrastructure/e2e_kraken_ai_infra_resume.tex)",
      "Bash(\"/c/Users/Dennis/AppData/Local/Programs/MiKTeX/miktex/bin/x64/pdflatex.exe\" -interaction=nonstopmode -output-directory=output/Kraken_AI_Infrastructure output/Kraken_AI_Infrastructure/e2e_kraken_ai_infra_resume.tex)",
      "WebFetch(domain:blog.kraken.com)",
-      "WebFetch(domain:github.com)"
+      "WebFetch(domain:github.com)",
+      "WebFetch(domain:jobs.ashbyhq.com)",
+      "WebFetch(domain:api.ashbyhq.com)",
+      "Bash(curl -s \"https://jobs.nvidia.com/api/apply/v2/jobs?domain=nvidia.com&start=0&num=50&location=Schweiz&pid=893391032265&sort_by=distance&filter_include_remote=1\" -o /c/Workspace/claude-resume-kit/nvidia_jobs.json)",
+      "WebFetch(domain:jobs.nvidia.com)",
+      "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" \"https://boards-api.greenhouse.io/v1/boards/coinbase/jobs\")",
+      "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" \"https://boards-api.greenhouse.io/v1/boards/coinbase.com/jobs\")",
+      "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" \"https://api.ashbyhq.com/posting-api/job-board/coinbase\")",
+      "Bash(curl -s \"https://www.coinbase.com/careers\" -I)",
+      "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" \"https://coinbase.wd1.myworkdayjobs.com/wday/cxs/coinbase/External/jobs\" -X POST -H \"Content-Type: application/json\" -d '{\"appliedFacets\":{},\"limit\":5,\"offset\":0,\"searchText\":\"\"}')",
+      "Bash(curl -s \"https://novartis.wd3.myworkdayjobs.com/wday/cxs/novartis/Novartis_Careers/jobs\" -X POST -H \"Content-Type: application/json\" -d '{\"appliedFacets\":{},\"limit\":5,\"offset\":0,\"searchText\":\"Switzerland\"}')",
+      "Bash(curl -sL \"https://jobs.apple.com/api/role/search?lang=en-us\" -X POST -H \"Content-Type: application/json\" -H \"User-Agent: Mozilla/5.0\" -d '{\"query\":\"\",\"filters\":{\"locations\":[\"postLocation-CHE\"]},\"page\":1}')",
+      "Bash(curl -s \"https://gcsservices.careers.microsoft.com/search/api/v1/search?lc=Switzerland&l=en_us&pg=1&pgSz=5&o=Relevance&flt=true\" -H \"User-Agent: Mozilla/5.0\" -H \"Accept: application/json\")",
+      "Bash(curl -s \"https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland\" -H \"User-Agent: Mozilla/5.0\" -o /tmp/google_jobs.html -w \"%{http_code} size:%{size_download}\\\\n\")",
+      "Bash(curl -s https://jobs.apple.com/api/role/search?lang=en-us -X POST -H 'Content-Type: application/json' -H 'User-Agent: Mozilla/5.0' -H 'Accept: application/json' -H 'Referer: https://jobs.apple.com/en-us/search?location=switzerland-CHE' -d '{\"query\":\"\",\"filters\":{\"postingpostingProgram\":[],\"locations\":[\"postLocation-CHE\"]},\"page\":1,\"sort\":\"newest\"}' -o /tmp/apple_jobs.json -w '%{http_code}\\\\n')",
+      "Read(//tmp/**)",
+      "Bash(curl -s 'https://gcsservices.careers.microsoft.com/search/api/v1/search?lc=Switzerland&l=en_us&pg=1&pgSz=5&o=Relevance&flt=true' -H 'User-Agent: Mozilla/5.0' -o /tmp/ms_jobs.json -w '%{http_code}\\\\n')",
+      "Bash(curl -sL 'https://careers.ibm.com/api/jobs?country_code=CH&page=1&hits=5' -H 'User-Agent: Mozilla/5.0' -o /tmp/ibm_jobs.json -w '%{http_code}\\\\n')",
+      "Bash(curl -sL 'https://jobs.cisco.com/jobs/SearchJobs?folderRecordsPerPage=5&listFilterMode=1&21178217=Switzerland' -H 'User-Agent: Mozilla/5.0' -o /tmp/cisco.html -w '%{http_code} size:%{size_download}\\\\n')",
+      "Bash(curl -sI \"https://jobs.apple.com/api/role/search?lang=en-us\" -X POST -H \"User-Agent: Mozilla/5.0\" -H \"Content-Type: application/json\")",
+      "Bash(curl -sL \"https://jobs.apple.com/api/v1/search?lang=en-us\" -X POST -H \"User-Agent: Mozilla/5.0\" -H \"Content-Type: application/json\" -d '{\"query\":\"\",\"filters\":{\"locations\":[\"postLocation-CHE\"]},\"page\":1}' -w \"%{http_code}\\\\n\")",
+      "Bash(curl -s \"https://careers.microsoft.com/v2/global/en/search.html?lc=Switzerland\" -H \"User-Agent: Mozilla/5.0\" -o /tmp/ms.html -w \"%{http_code} size:%{size_download}\\\\n\")",
+      "Bash(curl -v \"https://gcsservices.careers.microsoft.com/search/api/v1/search?lc=Switzerland&l=en_us&pg=1&pgSz=5\")",
+      "Bash(curl -sL \"https://www.sygnum.com/careers/\" -H \"User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36\" -o /tmp/sygnum.html -w \"%{http_code} size:%{size_download}\\\\n\")",
+      "Bash(curl -s https://jobs.apple.com/api/v1/search?lang=en-us -X POST -H 'Content-Type: application/json' -H 'User-Agent: Mozilla/5.0' -d '{\"query\":\"\",\"filters\":{\"locations\":[\"postLocation-CHE-zurich-metro\"]},\"page\":1}')",
+      "Bash(curl -s https://jobs.apple.com/api/v1/search?lang=en-us -X POST -H 'Content-Type: application/json' -H 'User-Agent: Mozilla/5.0' -d '{\"query\":\"Switzerland\",\"page\":1}')",
+      "Bash(curl -sL \"https://careers.sonova.com\" -o /tmp/sonova.html -w \"%{http_code} size:%{size_download}\\\\n\")",
+      "Bash(curl -sL \"https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland\" -o /tmp/roche.html -w \"%{http_code} size:%{size_download}\\\\n\")",
+      "Bash(curl -s \"https://careers.google.com/api/v3/jobs/?location=Switzerland&page=1&page_size=5\" -H \"User-Agent: Mozilla/5.0\" -w \"%{http_code}\\\\n\")",
+      "Bash(curl -sIL \"https://careers.sonova.com\")",
+      "Bash(curl -s \"https://sygnum.jobs.personio.com/\" -H \"User-Agent: Mozilla/5.0\" -o /tmp/sygnum_p.html -w \"%{http_code} size:%{size_download}\\\\n\")",
+      "Bash(curl -s \"https://api.smartrecruiters.com/v1/companies/Sygnum/postings\" -w \"%{http_code}\\\\n\")",
+      "Bash(curl -s \"https://ibmglobal.avature.net/careers/JobDetail?jobId=&country=Switzerland\" -L -o /tmp/ibm2.html -w \"%{http_code}\\\\n\")",
+      "Bash(curl -s \"https://roche.wd3.myworkdayjobs.com/wday/cxs/roche/roche/jobs\" -X POST -H \"Content-Type: application/json\" -d '{\"appliedFacets\":{},\"limit\":5,\"offset\":0,\"searchText\":\"Switzerland\"}' -w \"%{http_code}\\\\n\")",
+      "Bash(curl -sIL \"https://sygnum.jobs.personio.com/\" -H \"User-Agent: Mozilla/5.0\")",
+      "Bash(curl -s \"https://sygnum.jobs.personio.com/xml\" -H \"User-Agent: Mozilla/5.0\" -o /tmp/sygnum_xml.xml -w \"%{http_code} size:%{size_download}\\\\n\")",
+      "Bash(curl -s \"https://bitcoinsuisse.jobs.personio.com/xml\" -H \"User-Agent: Mozilla/5.0\" -w \"%{http_code} size:%{size_download}\\\\n\" -o /tmp/bts.xml)",
+      "Bash(job_scout/.venv/Scripts/python.exe -m pip install --upgrade pip)",
+      "Bash(job_scout/.venv/Scripts/python.exe -m pip install -r job_scout/requirements.txt)",
+      "Bash(job_scout/.venv/Scripts/python.exe -m playwright install chromium)",
+      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py)",
+      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=google)",
+      "Bash(job_scout/.venv/Scripts/python.exe -c ' *)",
+      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=meta)",
+      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)"
    ]
  }
 }
@@ -20,6 +20,15 @@ Thumbs.db
 __pycache__/
 *.pyc

+# job_scout runtime artifacts (keep scout.py + requirements.txt only)
+job_scout/.venv/
+job_scout/reports/
+job_scout/state/
+
+# One-off job-board data pulls (debug artifacts)
+*_jd.json
+*_jobs*.json
+
 # Editor
 *.swp
 *.swo
@@ -0,0 +1 @@
+playwright>=1.40,<2
@@ -0,0 +1,686 @@
+"""Job scout for Dennis's quarterly target companies.
+
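+Pulls the latest openings for every company in COMPANIES, either through a public ATS API
+(Workday, Ashby, Greenhouse, Eightfold PCSX, WordPress AJAX) or, for JS-rendered careers
+sites, through a headless Playwright browser. Filters by Swiss location or remote
+eligibility, scores fit against profile keywords, tracks which job IDs we've already seen,
+and writes a markdown report.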
+
+Usage:
+    py scout.py                 # Pull all configured companies (strong + medium only)
+    py scout.py --only=nvidia   # Pull a single company by id
+    py scout.py --new-only      # Report only jobs not seen before
+    py scout.py --include-weak  # Include weak/noise bucket (default hidden)
+
+State : state/seen_jobs.json
+Output: reports/YYYY-MM-DD.md
+
+To add a company: append to COMPANIES with one of the existing adapter types (see ADAPTERS).
+Companies whose careers sites could not be scraped reliably (Sonova, Coinbase, AMINA Bank,
+Bitcoin Suisse) are listed in MANUAL_CHECK and surface as a checklist in the report.
+"""
+
+import json
+import re
+import sys
+import urllib.error
+import urllib.parse
+import urllib.request
+from datetime import datetime, timezone
+from pathlib import Path
+
+# Directory containing this script; state and report paths are resolved relative to it.
+ROOT = Path(__file__).parent
+# JSON file read by load_seen() and written by save_seen().
+STATE_FILE = ROOT / "state" / "seen_jobs.json"
+# Directory for the generated markdown reports (reports/YYYY-MM-DD.md).
+REPORTS_DIR = ROOT / "reports"
+# Default User-Agent header that http_get_json() sends when the caller supplies none.
+USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)"
+
+# Lowercase terms that mark a location text as Swiss; location_matches() checks them
+# against the lowercased location string.
+CH_LOCATION_KEYWORDS = [
+    "switzerland", "zurich", "zürich", "basel", "bern", "geneva", "genf",
+    "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
+]
+
+# Terms that mark a location text as a remote posting.
+REMOTE_KEYWORDS = ["remote"]
+
+# Phrases that identify a remote posting as US-only. location_matches() ignores them
+# when the same text also contains a Swiss keyword.
+US_ONLY_PATTERNS = [
+    "remote - us", "remote, us", "remote-us", "us remote", "us-remote",
+    "remote-friendly us", "remote (us)", "united states - remote",
+    "remote, united states",
+]
+
+# A remote posting only counts as eligible when its location text also contains one of
+# these European/global hints. The Swiss keywords are included.
+EU_HINT_KEYWORDS = [
+    "germany", "france", "spain", "portugal", "ireland", "netherlands",
+    "sweden", "norway", "finland", "denmark", "poland", "czech",
+    "romania", "italy", "austria", "belgium", "uk", "united kingdom",
+    "europe", "emea", "global",
+] + CH_LOCATION_KEYWORDS
+
+# keyword -> weight. score_job() adds the weight of every keyword it finds in the
+# lowercased title + description of a job.
+POSITIVE_KEYWORDS = {
+    "genai": 3, "generative ai": 3, "llm": 3, "large language model": 3,
+    "applied ai": 3, "applied ml": 3, "ai engineer": 3, "ml engineer": 3,
+    "mlops": 3, "ai platform": 3, "ml platform": 3,
+    "python": 2, "java": 2, "data engineer": 2, "data engineering": 2,
+    "solutions architect": 2, "platform engineer": 2,
+    "ai infrastructure": 2, "inference": 2, "rag": 2, "agentic": 2,
+    "kubernetes": 1, "docker": 1, "etl": 1, "pipeline": 1,
+    "crypto": 2, "blockchain": 2, "web3": 2, "solidity": 3,
+    "senior": 1, "staff": 1, "lead": 1, "principal": 1,
+}
+
+# keyword -> negative weight, applied by score_job() the same way as POSITIVE_KEYWORDS.
+# " junior " is padded with spaces, presumably so it does not match inside longer words.
+NEGATIVE_KEYWORDS = {
+    "cuda": -3, "kernel driver": -3, "gpu programming": -3,
+    "compiler engineer": -3, "pytorch internals": -3, "jax internals": -3,
+    "rdma": -2, "infiniband": -2, "nccl": -3, "hpc cluster": -2,
+    "frontend": -3, "front-end": -3, "react native": -3,
+    "ios engineer": -3, "android engineer": -3, "mobile engineer": -3,
+    "ui engineer": -2, "ux engineer": -2,
+    "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
+    "physical design": -3, "silicon": -2,
+    "expert c++": -2, "5+ years c++": -2, "deep c++": -2,
+    "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
+}
+
+# id, display, adapter, adapter_args
+#
+# `adapter` is a key of ADAPTERS; `adapter_args` is the dict handed to that fetch function.
+# Keys read by the "playwright" adapter (see fetch_playwright):
+#   url                     page to open
+#   wait_for                selector awaited before scraping
+#   card                    selector matching one element per job
+#   title_attr              "text" = first line of the card text, otherwise an attribute name
+#   title_sel               sub-selector for the title element (used when title_attr is unset)
+#   title_sel_attr          attribute of the title element to read instead of its text
+#   title_strip_prefix      prefix removed from the extracted title
+#   link_sel / link_attr    sub-selector and attribute for the job link (the card itself
+#                           is used when link_sel is unset; link_attr defaults to "href")
+#   url_prefix              prepended to relative links
+#   default_location        location used when nothing better is found
+#   location_sel            optional sub-selector for the location text
+#   scroll_count            number of scroll steps (default 3)
+#   max_cards               cap on processed cards (default 150)
+#   use_inner_text_as_blob  use the card text as description and fallback location
+#   cookie_accept           selectors of cookie-banner buttons to click if visible
+COMPANIES = [
+    ("nvidia",    "NVIDIA",    "workday",    {
+        "host": "nvidia.wd5.myworkdayjobs.com",
+        "tenant": "nvidia",
+        "site": "NVIDIAExternalCareerSite",
+        "search_text": "Switzerland",
+    }),
+    ("kraken",    "Kraken",    "ashby",      {"slug": "kraken.com"}),
+    ("openai",    "OpenAI",    "ashby",      {"slug": "openai"}),
+    ("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
+    ("novartis",  "Novartis",  "workday", {
+        "host": "novartis.wd3.myworkdayjobs.com",
+        "tenant": "novartis",
+        "site": "Novartis_Careers",
+        "search_text": "Switzerland",
+    }),
+    # PCSX (Eightfold) — Microsoft has a public position search endpoint
+    ("microsoft", "Microsoft", "pcsx", {
+        "domain": "microsoft.com",
+        "location": "Switzerland",
+    }),
+    # Sygnum — WordPress AJAX endpoint returns clean JSON
+    # NOTE(review): the _wpnonce value is hard-coded in the URL; WordPress nonces normally
+    # expire, so this endpoint may stop answering until the value is refreshed — confirm.
+    ("sygnum",    "Sygnum",    "wp_ajax", {
+        "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
+    }),
+    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
+    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
+    # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
+    # aria-label link; location lives in the card text (captured via blob mode).
+    ("google",    "Google",    "playwright", {
+        "url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
+        "wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
+        "card": "li:has(a[aria-label*='Learn more about'])",
+        "title_sel": "a[aria-label*='Learn more about']",
+        "title_sel_attr": "aria-label",
+        "title_strip_prefix": "Learn more about ",
+        "link_sel": "a[href*='jobs/results/']",
+        "link_attr": "href",
+        "url_prefix": "https://www.google.com/about/careers/applications/",
+        "default_location": "",
+        "scroll_count": 5,
+        "use_inner_text_as_blob": True,
+        "cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
+    }),
+    ("apple",     "Apple",     "playwright", {
+        "url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
+        "wait_for": "a[href*='/en-us/details/']",
+        "card": "a[href*='/en-us/details/']",
+        "title_attr": "text",
+        "link_attr": "href",
+        "url_prefix": "https://jobs.apple.com",
+        "default_location": "Switzerland",
+    }),
+    # Meta job links are /profile/job_details/<id>; title + location are in the link text.
+    ("meta",      "Meta",      "playwright", {
+        "url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
+        "wait_for": "a[href*='/profile/job_details/']",
+        "card": "a[href*='/profile/job_details/']",
+        "title_attr": "text",
+        "link_attr": "href",
+        "url_prefix": "https://www.metacareers.com",
+        "default_location": "Zurich, Switzerland",
+        "scroll_count": 5,
+        "use_inner_text_as_blob": True,
+    }),
+    # PhenomPeople pattern (Roche) uses li.jobs-list-item.
+    # Card inner text is structured like: "<title> | Location | <city, country> | Category | ..."
+    # We extract title from first line, full text becomes the "description" so our location
+    # filter still sees Switzerland mentions.
+    ("roche",     "Roche",     "playwright", {
+        "url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
+        "wait_for": "li.jobs-list-item, a.au-target",
+        "card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
+        "title_attr": "text",
+        "link_sel": "a[href]",
+        "link_attr": "href",
+        "url_prefix": "https://careers.roche.com",
+        "default_location": "",
+        "cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
+        "scroll_count": 6,
+        "use_inner_text_as_blob": True,
+    }),
+    # Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
+    ("cisco",     "Cisco",     "playwright", {
+        "url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
+        "wait_for": "a[href*='/job/'], div[role='listitem']",
+        "card": "div[role='listitem']:has(a[href*='/job/'])",
+        "title_sel": "a[href*='/job/']",
+        "link_sel": "a[href*='/job/']",
+        "link_attr": "href",
+        "url_prefix": "https://careers.cisco.com",
+        "default_location": "Switzerland",
+        "cookie_accept": ["#onetrust-accept-btn-handler"],
+        "scroll_count": 5,
+        "use_inner_text_as_blob": True,
+    }),
+    ("ibm",       "IBM Research", "playwright", {
+        # IBM Research Zurich careers page is mostly a static intro with few openings.
+        # Use IBM's main careers search filtered to Switzerland instead.
+        "url": "https://www.ibm.com/careers/search?q=&field_keyword_05[0]=Switzerland",
+        "wait_for": "a[href*='/careers/'], a[href*='ibm.com/employment']",
+        "card": "li:has(a[href*='/careers/']), a[href*='/careers/']:has(h3)",
+        "title_sel": "h3, h4",
+        "link_sel": "a[href*='/careers/']",
+        "link_attr": "href",
+        "url_prefix": "https://www.ibm.com",
+        "default_location": "Switzerland",
+        "scroll_count": 4,
+    }),
+]
+
+# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
+# These surface as a clickable checklist in the report so they're not forgotten.
+# Each entry is (display name, reason the scrape failed, URL to open by hand);
+# write_report() renders one unchecked markdown checkbox per entry.
+MANUAL_CHECK = [
+    ("Sonova",         "PhenomPeople serves empty shell to automation (body never renders); widgets API rejects requests",
+     "https://careers.sonova.com/us/en/search-results?keywords=Switzerland"),
+    ("Coinbase",       "/careers/positions 302-redirects to landing; no job links or ATS API exposed even with stealth",
+     "https://www.coinbase.com/careers"),
+    ("AMINA Bank",     "jobs are at /careers/ (#positions) via JS widget; only ~4 apply links, no scrapable list",
+     "https://aminagroup.com/careers/#positions"),
+    ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
+     "https://bitcoinsuisse.com/careers#open-positions"),
+]
+
+
+def http_get_json(url, headers=None, data=None, method="GET"):
+    """Send an HTTP request and return the response body parsed as JSON.
+
+    Args:
+        url: Absolute URL to request.
+        headers: Optional extra request headers. The mapping is copied, so the caller's
+            dict is never modified. User-Agent and Accept get defaults when missing.
+        data: Optional request body. A dict is serialized to UTF-8 JSON and gets a JSON
+            Content-Type default; any other value is passed to urllib unchanged.
+        method: HTTP method name.
+
+    Returns:
+        The decoded JSON document.
+
+    Raises:
+        urllib.error.URLError: On connection failures and non-2xx responses (HTTPError).
+        ValueError: If the body is not valid UTF-8 JSON.
+    """
+    # Copy first: setdefault() below would otherwise write into the caller's dict.
+    request_headers = dict(headers or {})
+    request_headers.setdefault("User-Agent", USER_AGENT)
+    request_headers.setdefault("Accept", "application/json")
+    if isinstance(data, dict):
+        data = json.dumps(data).encode("utf-8")
+        request_headers.setdefault("Content-Type", "application/json")
+    req = urllib.request.Request(url, data=data, headers=request_headers, method=method)
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        return json.loads(resp.read().decode("utf-8"))
+
+
+def fetch_workday(args):
+    """Fetch every posting matching a search text from a Workday careers site.
+
+    Args:
+        args: Adapter config with ``host``, ``site`` and ``tenant`` (required) and
+            ``search_text`` (optional, defaults to an empty search).
+
+    Returns:
+        A list of job dicts with keys ``id``, ``title``, ``location``, ``url``,
+        ``posted`` and ``description`` (always empty for this adapter).
+
+    Raises:
+        Whatever ``http_get_json`` raises for network or decoding failures.
+    """
+    host, site, tenant = args["host"], args["site"], args["tenant"]
+    search_text = args.get("search_text", "")
+    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
+    jobs, offset, total = [], 0, 0
+    while True:
+        data = http_get_json(url, method="POST", data={
+            "appliedFacets": {}, "limit": 20, "offset": offset,
+            "searchText": search_text,
+        })
+        postings = data.get("jobPostings") or []
+        for p in postings:
+            ext = p.get("externalPath") or ""
+            bullets = p.get("bulletFields") or []
+            jobs.append({
+                # First bullet field when present, otherwise the posting path.
+                "id": bullets[0] if bullets else ext,
+                "title": p.get("title", ""),
+                # The path is appended so the location filter also sees it.
+                "location": f"{p.get('locationsText') or ''} {ext}",
+                "url": f"https://{host}{ext}",
+                "posted": p.get("postedOn", ""),
+                "description": "",
+            })
+        # Keep the largest total seen so far instead of trusting every page.
+        # NOTE(review): later pages reportedly come back with total=0 on some tenants,
+        # which would otherwise end pagination after the second page — confirm.
+        total = max(total, data.get("total") or 0)
+        offset += len(postings)
+        if not postings or offset >= total:
+            break
+    return jobs
+
+
+def fetch_ashby(args):
+    """Fetch all postings from an Ashby job board.
+
+    ``args["slug"]`` is the board name used in the posting-API URL. Returns a list of
+    job dicts; the location field joins the primary and all secondary locations with
+    " | ", and the description is the plain-text description cut to 2500 characters.
+    """
+    board_slug = args["slug"]
+    payload = http_get_json(
+        f"https://api.ashbyhq.com/posting-api/job-board/{board_slug}?includeCompensation=true"
+    )
+
+    def _location_blob(posting):
+        # Secondary locations may be dicts with a "location" key or plain values.
+        extras = posting.get("secondaryLocations", []) or []
+        names = [e.get("location", "") if isinstance(e, dict) else str(e) for e in extras]
+        return " | ".join([posting.get("location", "") or "", *names])
+
+    return [
+        {
+            "id": posting.get("id"),
+            "title": posting.get("title", ""),
+            "location": _location_blob(posting),
+            "url": posting.get("jobUrl"),
+            "posted": posting.get("publishedAt", ""),
+            "description": (posting.get("descriptionPlain") or "")[:2500],
+            "department": posting.get("department", ""),
+        }
+        for posting in payload.get("jobs", [])
+    ]
+
+
+def fetch_greenhouse(args):
+    """Fetch all postings from a Greenhouse job board.
+
+    Args:
+        args: Adapter config; ``args["board"]`` is the Greenhouse board token.
+
+    Returns:
+        A list of job dicts with keys ``id``, ``title``, ``location``, ``url``,
+        ``posted`` and ``description`` (plain text, at most 2500 characters).
+
+    Raises:
+        Whatever ``http_get_json`` raises for network or decoding failures.
+    """
+    # Function-scope import keeps this adapter self-contained.
+    import html
+
+    board = args["board"]
+    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
+    data = http_get_json(url)
+    jobs = []
+    for j in data.get("jobs", []):
+        loc = (j.get("location") or {}).get("name", "")
+        offices = j.get("offices") or []
+        office_names = " | ".join(o.get("name") or "" for o in offices if isinstance(o, dict))
+        loc_blob = " ".join(x for x in [loc, office_names] if x)
+        # Greenhouse delivers ``content`` as entity-escaped HTML ("&lt;p&gt;..."), so it
+        # must be unescaped before the tag regex can see any tags. The second unescape
+        # resolves entities that were part of the text itself (e.g. "&amp;amp;").
+        desc = html.unescape(j.get("content", "") or "")
+        desc = re.sub(r"<[^>]+>", " ", desc)
+        desc = html.unescape(desc)
+        desc = re.sub(r"\s+", " ", desc).strip()
+        jobs.append({
+            "id": str(j.get("id")),
+            "title": j.get("title", ""),
+            "location": loc_blob,
+            "url": j.get("absolute_url"),
+            "posted": j.get("updated_at", ""),
+            "description": desc[:2500],
+        })
+    return jobs
+
+
+def fetch_pcsx(args):
+    """Query the Eightfold PCSX search API hosted at apply.careers.microsoft.com.
+
+    Reads ``args["domain"]`` and the optional ``args["location"]``. Requests pages of 50
+    positions until a short page arrives or the start offset reaches 500, and returns
+    the collected job dicts (description cut to 2000 characters).
+    """
+    domain = args["domain"]
+    location = args.get("location", "")
+    base = "https://apply.careers.microsoft.com/api/pcsx/search"
+    collected = []
+    start = 0
+    # The loop condition doubles as the hard cap on how deep we paginate.
+    while start < 500:
+        quoted = urllib.parse.quote(location)
+        page_url = f"{base}?domain={domain}&query=&location={quoted}&start={start}&num=50"
+        referer = f"https://apply.careers.microsoft.com/careers?location={urllib.parse.quote(location)}"
+        payload = http_get_json(page_url, headers={"Referer": referer})
+        positions = (payload.get("data") or {}).get("positions", []) or []
+        collected.extend(
+            {
+                "id": str(pos.get("id")),
+                "title": pos.get("name", ""),
+                "location": " | ".join(pos.get("locations") or []),
+                "url": f"https://jobs.careers.microsoft.com/global/en/job/{pos.get('displayJobId') or pos.get('id')}",
+                "posted": pos.get("postedTs", ""),
+                "description": (pos.get("description") or "")[:2000],
+            }
+            for pos in positions
+        )
+        # A page shorter than the requested 50 (including an empty one) is the last.
+        if len(positions) < 50:
+            break
+        start += len(positions)
+    return collected
+
+
+def fetch_wp_ajax(args):
+    """Fetch postings from a WordPress admin-ajax style endpoint (used for Sygnum).
+
+    Args:
+        args: Adapter config; ``args["url"]`` is the full endpoint URL.
+
+    Returns:
+        A list of job dicts. An empty list is returned when the endpoint does not
+        answer with a JSON array. The id is "<title>|<location>" cut to 120 characters,
+        because no id field is read from the payload.
+
+    Raises:
+        Whatever ``http_get_json`` raises for network or decoding failures.
+    """
+    url = args["url"]
+    data = http_get_json(url)
+    if not isinstance(data, list):
+        return []
+    jobs = []
+    for j in data:
+        # Skip anything that is not an object so one bad entry cannot abort the run.
+        if not isinstance(j, dict):
+            continue
+        # ``or ""`` also covers keys that are present with a JSON null value.
+        title = j.get("title") or ""
+        location = j.get("location") or ""
+        jobs.append({
+            "id": f"{title}|{location}"[:120],
+            "title": title,
+            "location": " ".join(filter(None, [location, j.get("work_type") or ""])),
+            "url": j.get("application_url") or url,
+            "posted": "",
+            "description": " ".join(filter(None, [j.get("department") or "", j.get("role_type") or ""])),
+        })
+    return jobs
+
+
+# Injected before page scripts run, to mask the most common headless-detection signals.
+# Required for Google; harmless for the other sites.
+# The script hides navigator.webdriver, defines window.chrome, fakes navigator.plugins
+# and navigator.languages, and wraps navigator.permissions.query for 'notifications'.
+STEALTH_JS = """
+Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
+window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
+Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
+Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
+const _q = navigator.permissions && navigator.permissions.query;
+if (_q) {
+  navigator.permissions.query = (p) => p && p.name === 'notifications'
+    ? Promise.resolve({state: Notification.permission}) : _q(p);
+}
+"""
+
+# Process-wide holder for the Playwright driver ("pw") and the launched browser;
+# filled by _get_browser() and released by _close_browser().
+_playwright_singleton = {"pw": None, "browser": None}
+
+def _get_browser():
+    """Return the shared headless Chromium browser, launching it on first use.
+
+    Lazy-init of a single shared browser saves ~3s per company.
+
+    Returns:
+        The Playwright browser stored in ``_playwright_singleton``.
+
+    Raises:
+        RuntimeError: If the ``playwright`` package is not installed.
+        Exception: Whatever Playwright raises when the browser cannot be launched.
+    """
+    if _playwright_singleton["browser"] is not None:
+        return _playwright_singleton["browser"]
+    try:
+        from playwright.sync_api import sync_playwright
+    except ImportError as e:
+        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
+    pw = sync_playwright().start()
+    try:
+        browser = pw.chromium.launch(
+            headless=True,
+            args=["--disable-blink-features=AutomationControlled"],
+        )
+    except Exception:
+        # The driver is not in the singleton yet, so _close_browser() could never
+        # stop it; shut it down here before reporting the launch failure.
+        pw.stop()
+        raise
+    _playwright_singleton["pw"] = pw
+    _playwright_singleton["browser"] = browser
+    return browser
+
+
+def _absolutize(href, prefix):
+    """Join a possibly-relative href with the configured prefix.
+
+    Args:
+        href: Link as found in the page; may be absolute, protocol-relative, relative,
+            empty or None.
+        prefix: URL that relative links are appended to; may be empty.
+
+    Returns:
+        ``href`` unchanged when it is empty/None, already an http(s) URL, or when no
+        prefix is configured; otherwise the absolute URL.
+    """
+    if not href:
+        return href
+    if href.lower().startswith(("http://", "https://")):
+        return href
+    if not prefix:
+        return href
+    if href.startswith("//"):
+        # Protocol-relative link: it already names its host, so only borrow the scheme.
+        scheme = urllib.parse.urlsplit(prefix).scheme or "https"
+        return f"{scheme}:{href}"
+    cleaned = href
+    # Drop leading "./" segments only. str.lstrip("./") would strip any run of dots
+    # and slashes and so mangle names such as ".well-known/jobs".
+    while cleaned.startswith("./"):
+        cleaned = cleaned[2:]
+    cleaned = cleaned.lstrip("/")
+    return prefix.rstrip("/") + "/" + cleaned
+
+
+def _close_browser():
+    """Close the shared browser and stop the Playwright driver, if they were started.
+
+    The singleton is reset first, so a later ``_get_browser()`` launches a fresh
+    browser instead of handing out the closed one. Shutdown is best-effort: errors
+    raised while closing are ignored on purpose.
+    """
+    browser = _playwright_singleton["browser"]
+    pw = _playwright_singleton["pw"]
+    _playwright_singleton["browser"] = None
+    _playwright_singleton["pw"] = None
+    # Browser first, then the driver that owns it.
+    for resource, shutdown in ((browser, "close"), (pw, "stop")):
+        if not resource:
+            continue
+        try:
+            getattr(resource, shutdown)()
+        except Exception:
+            # Nothing useful can be done if teardown fails.
+            pass
+
+
+def fetch_playwright(args):
+    """Generic headless-browser scraper. See COMPANIES entries for selector args.
+
+    Opens ``args["url"]`` in a fresh browser context of the shared browser, optionally
+    clicks cookie-banner buttons, waits for ``args["wait_for"]``, scrolls to trigger
+    lazy loading, then turns each element matching ``args["card"]`` into a job dict.
+
+    Returns:
+        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
+        ``description``), de-duplicated by ``id``. Cards that raise while being read,
+        or that yield no title, are skipped.
+
+    Raises:
+        RuntimeError: From ``_get_browser()`` when playwright is not installed.
+        Exception: Errors from ``page.goto`` and from locating the cards propagate;
+            the context is closed in either case.
+    """
+    browser = _get_browser()
+    # One context per call, closed in the finally block below.
+    ctx = browser.new_context(
+        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        locale="en-US",
+        viewport={"width": 1366, "height": 768},
+    )
+    ctx.add_init_script(STEALTH_JS)
+    page = ctx.new_page()
+    jobs = []
+    try:
+        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")
+        # Optional cookie banner acceptance
+        for sel in args.get("cookie_accept", []) or []:
+            try:
+                btn = page.locator(sel).first
+                if btn.is_visible(timeout=2000):
+                    btn.click()
+                    page.wait_for_timeout(500)
+            except Exception:
+                # A missing or unclickable banner must not stop the scrape.
+                pass
+        # Wait for job content to render
+        wait_for = args.get("wait_for")
+        if wait_for:
+            try:
+                page.wait_for_selector(wait_for, timeout=15000)
+            except Exception:
+                # Selector never showed up: pause briefly and scrape whatever is there.
+                page.wait_for_timeout(4000)
+        # Scroll a few times to trigger any lazy-loaded results
+        for _ in range(args.get("scroll_count", 3)):
+            try:
+                page.mouse.wheel(0, 4000)
+                page.wait_for_timeout(700)
+            except Exception:
+                break
+
+        cards = page.locator(args["card"])
+        # Cap the number of cards processed (default 150).
+        n = min(cards.count(), args.get("max_cards", 150))
+        for i in range(n):
+            card = cards.nth(i)
+            try:
+                # Title source, in priority order: title_attr == "text" (first line of the
+                # card text), title_attr (attribute on the card), title_sel (sub-element).
+                title = ""
+                if args.get("title_attr") == "text":
+                    title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
+                elif args.get("title_attr"):
+                    title = (card.get_attribute(args["title_attr"]) or "").strip()
+                elif args.get("title_sel"):
+                    t = card.locator(args["title_sel"]).first
+                    if t.count():
+                        # Read either an attribute (e.g. aria-label) or the inner text
+                        if args.get("title_sel_attr"):
+                            title = (t.get_attribute(args["title_sel_attr"]) or "").strip()
+                        else:
+                            title = (t.inner_text() or "").strip()
+                if args.get("title_strip_prefix") and title.startswith(args["title_strip_prefix"]):
+                    title = title[len(args["title_strip_prefix"]):].strip()
+                # Last resort: first line of the card text.
+                if not title:
+                    title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
+
+                location = args.get("default_location", "")
+                if args.get("location_sel"):
+                    lsel = card.locator(args["location_sel"]).first
+                    if lsel.count():
+                        location = (lsel.inner_text() or location).strip()
+
+                # Without link_sel the card itself is expected to carry the link attribute.
+                link_el = card if not args.get("link_sel") else card.locator(args["link_sel"]).first
+                href = (link_el.get_attribute(args.get("link_attr", "href")) or "") if link_el.count() else ""
+                href = _absolutize(href, args.get("url_prefix", ""))
+
+                if not title:
+                    continue
+                description = ""
+                if args.get("use_inner_text_as_blob"):
+                    # Use the full card text as both location source and description
+                    full = (card.inner_text() or "")
+                    description = full[:2000]
+                    if not location:
+                        location = full[:300]
+                jobs.append({
+                    # NOTE(review): the fallback id is position-based, so it is only
+                    # stable between runs while the page keeps the same card order.
+                    "id": href or f"{args['url']}#{i}",
+                    "title": title,
+                    "location": location,
+                    "url": href or args["url"],
+                    "posted": "",
+                    "description": description,
+                })
+            except Exception:
+                # A card that cannot be read is dropped; the remaining cards still count.
+                continue
+    finally:
+        ctx.close()
+
+    # Deduplicate within a single company by id
+    seen, deduped = set(), []
+    for j in jobs:
+        if j["id"] in seen:
+            continue
+        seen.add(j["id"])
+        deduped.append(j)
+    return deduped
+
+
+# Adapter name (third element of each COMPANIES tuple) -> fetch function. Every fetch
+# function takes the company's adapter_args dict and returns a list of job dicts.
+ADAPTERS = {
+    "workday": fetch_workday,
+    "ashby": fetch_ashby,
+    "greenhouse": fetch_greenhouse,
+    "pcsx": fetch_pcsx,
+    "wp_ajax": fetch_wp_ajax,
+    "playwright": fetch_playwright,
+}
+
+
+def location_matches(loc_text):
+    """Classify a location string as Swiss and/or as an eligible remote posting.
+
+    Returns:
+        ``(in_ch, is_remote)``. ``in_ch`` is True when a Swiss keyword occurs in the
+        text. ``is_remote`` is True when the text mentions remote work, is not a
+        US-only remote listing, and carries at least one EU/global hint. Empty input
+        yields ``(False, False)``.
+    """
+    if not loc_text:
+        return False, False
+    text = loc_text.lower()
+
+    def mentions(terms):
+        return any(term in text for term in terms)
+
+    in_ch = mentions(CH_LOCATION_KEYWORDS)
+    # A US-only pattern is disregarded once a Swiss keyword is present as well.
+    us_only = mentions(US_ONLY_PATTERNS) and not in_ch
+    is_remote = mentions(REMOTE_KEYWORDS) and not us_only and mentions(EU_HINT_KEYWORDS)
+    return in_ch, is_remote
+
+
+def _keyword_in(keyword, blob):
+    """Return True when *keyword* occurs in *blob* as a whole word or phrase.
+
+    Lookarounds are used instead of ``\\b`` so keywords ending in punctuation
+    ("expert c++") still match. An optional trailing "s" lets plurals such as "llms"
+    and "pipelines" count.
+    """
+    term = keyword.strip()
+    if not term:
+        return False
+    pattern = r"(?<!\w)" + re.escape(term) + r"s?(?!\w)"
+    return re.search(pattern, blob) is not None
+
+
+def score_job(job):
+    """Score a job against POSITIVE_KEYWORDS and NEGATIVE_KEYWORDS.
+
+    Keywords are matched case-insensitively as whole words or phrases in the job's
+    title and description. Plain substring matching would let "intern" hit
+    "internal"/"international", "asic" hit "basic" and "rag" hit "storage", which
+    distorts the score. The trade-off is that compounds such as "cryptocurrency" no
+    longer count as "crypto".
+
+    Args:
+        job: Job dict; ``title`` and ``description`` are read, and missing or None
+            values are treated as empty.
+
+    Returns:
+        ``(score, pos, neg)``: the summed weights, the matched positive keywords and
+        the matched negative keywords, each in table order.
+    """
+    blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
+    score, pos, neg = 0, [], []
+    for table, hits in ((POSITIVE_KEYWORDS, pos), (NEGATIVE_KEYWORDS, neg)):
+        for kw, w in table.items():
+            if _keyword_in(kw, blob):
+                score += w
+                hits.append(kw)
+    return score, pos, neg
+
+
+def load_seen():
+    """Return the parsed contents of STATE_FILE, or an empty dict when it is absent."""
+    if not STATE_FILE.exists():
+        return {}
+    raw = STATE_FILE.read_text(encoding="utf-8")
+    return json.loads(raw)
+
+
+def save_seen(seen):
+    """Write *seen* to STATE_FILE as indented UTF-8 JSON, creating the directory first."""
+    state_dir = STATE_FILE.parent
+    state_dir.mkdir(parents=True, exist_ok=True)
+    serialized = json.dumps(seen, indent=2, ensure_ascii=False)
+    STATE_FILE.write_text(serialized, encoding="utf-8")
+
+
+def write_report(path, results, errors, new_only, include_weak):
+    """Render the scouting results as a markdown report and write it to *path*.
+
+    Args:
+        path: ``pathlib.Path`` of the report file. Its parent directory is created
+            when missing, mirroring what save_seen() does for the state file.
+        results: List of result dicts. Keys read here: ``is_new``, ``score``,
+            ``company``, ``title``, ``location``, ``in_ch``, ``remote``, ``url``,
+            ``pos``, ``neg`` and optionally ``posted``.
+        errors: Iterable of ``(company, error)`` pairs listed under "Errors".
+        new_only: Only changes the heading (adds " (new only)"); this function does
+            not filter *results* itself.
+        include_weak: When true, roles scoring below 2 get their own section;
+            otherwise only a one-line count of hidden roles is written.
+    """
+    today = datetime.now().strftime("%Y-%m-%d")
+    n_new = sum(1 for r in results if r["is_new"])
+    lines = [
+        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
+        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
+        f"Total matches from automated companies: **{len(results)}** ({n_new} new since last run)\n",
+    ]
+    if errors:
+        lines.append("## Errors\n")
+        for company, err in errors:
+            lines.append(f"- **{company}**: {err}")
+        lines.append("")
+
+    # Score buckets: strong >= 6, medium 2..5, weak < 2.
+    strong = [r for r in results if r["score"] >= 6]
+    medium = [r for r in results if 2 <= r["score"] < 6]
+    weak   = [r for r in results if r["score"] < 2]
+
+    if not include_weak and weak:
+        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")
+
+    buckets = [("Strong fit (score >= 6)", strong),
+               ("Medium fit (score 2-5)", medium)]
+    if include_weak:
+        buckets.append(("Weak / noise (score < 2)", weak))
+
+    for bucket_name, bucket in buckets:
+        if not bucket:
+            continue
+        lines.append(f"\n## {bucket_name} - {len(bucket)} role(s)\n")
+        for r in bucket:
+            new_tag = " [NEW]" if r["is_new"] else ""
+            # A Swiss location takes precedence over remote eligibility in the tag.
+            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
+            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}")
+            lines.append(f"- Location: {r['location']} *({loc_tag})*")
+            if r.get("posted"):
+                lines.append(f"- Posted: {r['posted']}")
+            lines.append(f"- URL: {r['url']}")
+            if r["pos"]:
+                lines.append(f"- Positive: {', '.join(r['pos'])}")
+            if r["neg"]:
+                lines.append(f"- Negative: {', '.join(r['neg'])}")
+            lines.append("")
+
+    lines.append("\n## Manual check (companies without scrapable APIs)\n")
+    lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
+    lines.append("Open each link, scan for new postings since your last quarterly review:\n")
+    for name, note, url in MANUAL_CHECK:
+        lines.append(f"- [ ] **{name}** — {note}: <{url}>")
+    lines.append("")
+
+    # reports/ is gitignored, so it does not exist on a fresh checkout.
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text("\n".join(lines), encoding="utf-8")
+
+
+def main():
+    only, new_only, include_weak = None, False, False
+    for arg in sys.argv[1:]:
+        if arg == "--new-only":
+            new_only = True
+        elif arg == "--include-weak":
+            include_weak = True
+        elif arg.startswith("--only="):
+            only = arg.split("=", 1)[1]
+
+    seen = load_seen()
+    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
+    all_results, errors = [], []
+
+    for cid, display, adapter, args in COMPANIES:
+        if only and cid != only:
+            continue
+        print(f"Fetching {display}...", file=sys.stderr)
+        try:
+            jobs = ADAPTERS[adapter](args)
+        except (urllib.error.URLError, urllib.error.HTTPError, ValueError) as e:
+            errors.append((display, repr(e)))
+            continue
+        except Exception as e:
+            errors.append((display, f"unexpected: {e!r}"))
+            continue
+
+        company_seen = seen.setdefault(cid, {})
+        for j in jobs:
+            jid = str(j.get("id") or j.get("url"))
+            in_ch, is_remote = location_matches(j.get("location", ""))
+            if not (in_ch or is_remote):
+                continue
+            is_new = jid not in company_seen
+            score, pos, neg = score_job(j)
+            all_results.append({
+                "company": display, "company_id": cid,
+                "title": j["title"], "location": j["location"],
+                "url": j["url"], "posted": j.get("posted", ""),
+                "score": score, "pos": pos, "neg": neg,
+                "in_ch": in_ch, "remote": is_remote, "is_new": is_new,
+            })
+            company_seen[jid] = {"title": j["title"], "first_seen": today}
+
+    save_seen(seen)
+    _close_browser()
+
+    if new_only:
+        all_results = [r for r in all_results if r["is_new"]]
+
+    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))
+
+    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
+    report_path = REPORTS_DIR / f"{today}.md"
+    write_report(report_path, all_results, errors, new_only, include_weak)
+
+    n_new = sum(1 for r in all_results if r["is_new"])
+    print(f"\nReport written: {report_path}", file=sys.stderr)
+    print(f"Total matches: {len(all_results)} ({n_new} new)", file=sys.stderr)
+    if errors:
+        print(f"Errors: {len(errors)} - see report", file=sys.stderr)
+
+
+# === Adapter probe results (2026-05-21, historical) ===========================
+# Notes from the initial probe of each careers site without a real browser,
+# kept to explain why a simple JSON adapter was not enough for these sites.
+# Some of them have since been automated through the pcsx, wp_ajax and
+# playwright adapters registered in ADAPTERS; COMPANIES and MANUAL_CHECK above
+# are the authoritative lists of what is automated and what is still manual.
+#
+#   Google         careers.google.com         404 on documented API; auth-gated
+#   Microsoft      gcsservices.careers.ms.com TLS handshake hangs from non-MS clients
+#   Apple          jobs.apple.com/api/v1      endpoint exists, location filter codes opaque
+#   Meta           metacareers.com            GraphQL with auth token
+#   Roche          careers.roche.com          PhenomPeople/Eightfold, JS-rendered
+#   IBM Research   research.ibm.com           static page, no API
+#   Cisco          jobs.cisco.com             JS-rendered SPA
+#   Sonova         careers.sonova.com         PhenomPeople SaaS, no public JSON
+#   Sygnum         sygnum.com/careers         Cloudflare-protected
+#   AMINA          aminagroup.com/career      static, low volume
+#   Bitcoin Suisse bitcoinsuisse.com/careers  static, low volume
+#   Coinbase       coinbase.com/careers       Cloudflare-protected
+# ==============================================================================
+
+
+# Script entry point: run the scout only when executed directly, not on import.
+if __name__ == "__main__":
+    main()