From eeec77b1e30539a12b3d85f0bd1cf8e51c3e53e6 Mon Sep 17 00:00:00 2001 From: Dennis Thiessen Date: Fri, 22 May 2026 10:12:27 +0200 Subject: [PATCH] =?UTF-8?q?feat:=20add=20job=5Fscout=20=E2=80=94=20automat?= =?UTF-8?q?ed=20CH=20job-board=20scraper=20for=20target=20companies?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pulls fresh postings from the quarterly target-company list, filters by Swiss location / EU-remote eligibility, scores fit against profile keywords, tracks seen jobs, and writes a markdown report. Adapters (13 companies automated): - Workday: NVIDIA, Novartis - Ashby: Kraken, OpenAI - Greenhouse: Anthropic - PCSX (Eightfold): Microsoft - WordPress AJAX: Sygnum - Playwright (headless + stealth): Google, Apple, Meta, Roche, Cisco, IBM 4 companies (Sonova, Coinbase, AMINA, Bitcoin Suisse) resist automation (server-side bot blocks / JS widgets) and surface as a manual-check checklist. venv, reports/, and state/ are gitignored; only scout.py + requirements.txt are tracked. 
Co-Authored-By: Claude Opus 4.7 --- .claude/settings.local.json | 46 ++- .gitignore | 9 + job_scout/requirements.txt | 1 + job_scout/scout.py | 686 ++++++++++++++++++++++++++++++++++++ 4 files changed, 741 insertions(+), 1 deletion(-) create mode 100644 job_scout/requirements.txt create mode 100644 job_scout/scout.py diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 19c0e8c..2df8e8a 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -38,7 +38,51 @@ "Bash(python resume_builder/helpers/char_count.py -f resume output/Kraken_AI_Infrastructure/e2e_kraken_ai_infra_resume.tex)", "Bash(\"/c/Users/Dennis/AppData/Local/Programs/MiKTeX/miktex/bin/x64/pdflatex.exe\" -interaction=nonstopmode -output-directory=output/Kraken_AI_Infrastructure output/Kraken_AI_Infrastructure/e2e_kraken_ai_infra_resume.tex)", "WebFetch(domain:blog.kraken.com)", - "WebFetch(domain:github.com)" + "WebFetch(domain:github.com)", + "WebFetch(domain:jobs.ashbyhq.com)", + "WebFetch(domain:api.ashbyhq.com)", + "Bash(curl -s \"https://jobs.nvidia.com/api/apply/v2/jobs?domain=nvidia.com&start=0&num=50&location=Schweiz&pid=893391032265&sort_by=distance&filter_include_remote=1\" -o /c/Workspace/claude-resume-kit/nvidia_jobs.json)", + "WebFetch(domain:jobs.nvidia.com)", + "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" \"https://boards-api.greenhouse.io/v1/boards/coinbase/jobs\")", + "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" \"https://boards-api.greenhouse.io/v1/boards/coinbase.com/jobs\")", + "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" \"https://api.ashbyhq.com/posting-api/job-board/coinbase\")", + "Bash(curl -s \"https://www.coinbase.com/careers\" -I)", + "Bash(curl -s -o /dev/null -w \"%{http_code}\\\\n\" \"https://coinbase.wd1.myworkdayjobs.com/wday/cxs/coinbase/External/jobs\" -X POST -H \"Content-Type: application/json\" -d '{\"appliedFacets\":{},\"limit\":5,\"offset\":0,\"searchText\":\"\"}')", + "Bash(curl -s 
\"https://novartis.wd3.myworkdayjobs.com/wday/cxs/novartis/Novartis_Careers/jobs\" -X POST -H \"Content-Type: application/json\" -d '{\"appliedFacets\":{},\"limit\":5,\"offset\":0,\"searchText\":\"Switzerland\"}')", + "Bash(curl -sL \"https://jobs.apple.com/api/role/search?lang=en-us\" -X POST -H \"Content-Type: application/json\" -H \"User-Agent: Mozilla/5.0\" -d '{\"query\":\"\",\"filters\":{\"locations\":[\"postLocation-CHE\"]},\"page\":1}')", + "Bash(curl -s \"https://gcsservices.careers.microsoft.com/search/api/v1/search?lc=Switzerland&l=en_us&pg=1&pgSz=5&o=Relevance&flt=true\" -H \"User-Agent: Mozilla/5.0\" -H \"Accept: application/json\")", + "Bash(curl -s \"https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland\" -H \"User-Agent: Mozilla/5.0\" -o /tmp/google_jobs.html -w \"%{http_code} size:%{size_download}\\\\n\")", + "Bash(curl -s https://jobs.apple.com/api/role/search?lang=en-us -X POST -H 'Content-Type: application/json' -H 'User-Agent: Mozilla/5.0' -H 'Accept: application/json' -H 'Referer: https://jobs.apple.com/en-us/search?location=switzerland-CHE' -d '{\"query\":\"\",\"filters\":{\"postingpostingProgram\":[],\"locations\":[\"postLocation-CHE\"]},\"page\":1,\"sort\":\"newest\"}' -o /tmp/apple_jobs.json -w '%{http_code}\\\\n')", + "Read(//tmp/**)", + "Bash(curl -s 'https://gcsservices.careers.microsoft.com/search/api/v1/search?lc=Switzerland&l=en_us&pg=1&pgSz=5&o=Relevance&flt=true' -H 'User-Agent: Mozilla/5.0' -o /tmp/ms_jobs.json -w '%{http_code}\\\\n')", + "Bash(curl -sL 'https://careers.ibm.com/api/jobs?country_code=CH&page=1&hits=5' -H 'User-Agent: Mozilla/5.0' -o /tmp/ibm_jobs.json -w '%{http_code}\\\\n')", + "Bash(curl -sL 'https://jobs.cisco.com/jobs/SearchJobs?folderRecordsPerPage=5&listFilterMode=1&21178217=Switzerland' -H 'User-Agent: Mozilla/5.0' -o /tmp/cisco.html -w '%{http_code} size:%{size_download}\\\\n')", + "Bash(curl -sI \"https://jobs.apple.com/api/role/search?lang=en-us\" -X POST -H \"User-Agent: 
Mozilla/5.0\" -H \"Content-Type: application/json\")", + "Bash(curl -sL \"https://jobs.apple.com/api/v1/search?lang=en-us\" -X POST -H \"User-Agent: Mozilla/5.0\" -H \"Content-Type: application/json\" -d '{\"query\":\"\",\"filters\":{\"locations\":[\"postLocation-CHE\"]},\"page\":1}' -w \"%{http_code}\\\\n\")", + "Bash(curl -s \"https://careers.microsoft.com/v2/global/en/search.html?lc=Switzerland\" -H \"User-Agent: Mozilla/5.0\" -o /tmp/ms.html -w \"%{http_code} size:%{size_download}\\\\n\")", + "Bash(curl -v \"https://gcsservices.careers.microsoft.com/search/api/v1/search?lc=Switzerland&l=en_us&pg=1&pgSz=5\")", + "Bash(curl -sL \"https://www.sygnum.com/careers/\" -H \"User-Agent: Mozilla/5.0 \\(Windows NT 10.0; Win64; x64\\) AppleWebKit/537.36\" -o /tmp/sygnum.html -w \"%{http_code} size:%{size_download}\\\\n\")", + "Bash(curl -s https://jobs.apple.com/api/v1/search?lang=en-us -X POST -H 'Content-Type: application/json' -H 'User-Agent: Mozilla/5.0' -d '{\"query\":\"\",\"filters\":{\"locations\":[\"postLocation-CHE-zurich-metro\"]},\"page\":1}')", + "Bash(curl -s https://jobs.apple.com/api/v1/search?lang=en-us -X POST -H 'Content-Type: application/json' -H 'User-Agent: Mozilla/5.0' -d '{\"query\":\"Switzerland\",\"page\":1}')", + "Bash(curl -sL \"https://careers.sonova.com\" -o /tmp/sonova.html -w \"%{http_code} size:%{size_download}\\\\n\")", + "Bash(curl -sL \"https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland\" -o /tmp/roche.html -w \"%{http_code} size:%{size_download}\\\\n\")", + "Bash(curl -s \"https://careers.google.com/api/v3/jobs/?location=Switzerland&page=1&page_size=5\" -H \"User-Agent: Mozilla/5.0\" -w \"%{http_code}\\\\n\")", + "Bash(curl -sIL \"https://careers.sonova.com\")", + "Bash(curl -s \"https://sygnum.jobs.personio.com/\" -H \"User-Agent: Mozilla/5.0\" -o /tmp/sygnum_p.html -w \"%{http_code} size:%{size_download}\\\\n\")", + "Bash(curl -s \"https://api.smartrecruiters.com/v1/companies/Sygnum/postings\" -w 
\"%{http_code}\\\\n\")", + "Bash(curl -s \"https://ibmglobal.avature.net/careers/JobDetail?jobId=&country=Switzerland\" -L -o /tmp/ibm2.html -w \"%{http_code}\\\\n\")", + "Bash(curl -s \"https://roche.wd3.myworkdayjobs.com/wday/cxs/roche/roche/jobs\" -X POST -H \"Content-Type: application/json\" -d '{\"appliedFacets\":{},\"limit\":5,\"offset\":0,\"searchText\":\"Switzerland\"}' -w \"%{http_code}\\\\n\")", + "Bash(curl -sIL \"https://sygnum.jobs.personio.com/\" -H \"User-Agent: Mozilla/5.0\")", + "Bash(curl -s \"https://sygnum.jobs.personio.com/xml\" -H \"User-Agent: Mozilla/5.0\" -o /tmp/sygnum_xml.xml -w \"%{http_code} size:%{size_download}\\\\n\")", + "Bash(curl -s \"https://bitcoinsuisse.jobs.personio.com/xml\" -H \"User-Agent: Mozilla/5.0\" -w \"%{http_code} size:%{size_download}\\\\n\" -o /tmp/bts.xml)", + "Bash(job_scout/.venv/Scripts/python.exe -m pip install --upgrade pip)", + "Bash(job_scout/.venv/Scripts/python.exe -m pip install -r job_scout/requirements.txt)", + "Bash(job_scout/.venv/Scripts/python.exe -m playwright install chromium)", + "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py)", + "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=google)", + "Bash(job_scout/.venv/Scripts/python.exe -c ' *)", + "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=meta)", + "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)" ] } } diff --git a/.gitignore b/.gitignore index a775f86..876fe13 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,15 @@ Thumbs.db __pycache__/ *.pyc +# job_scout runtime artifacts (keep scout.py + requirements.txt only) +job_scout/.venv/ +job_scout/reports/ +job_scout/state/ + +# One-off job-board data pulls (debug artifacts) +*_jd.json +*_jobs*.json + # Editor *.swp *.swo diff --git a/job_scout/requirements.txt b/job_scout/requirements.txt new file mode 100644 index 0000000..b47208c --- /dev/null +++ b/job_scout/requirements.txt @@ -0,0 +1 @@ 
+playwright>=1.40,<2 diff --git a/job_scout/scout.py b/job_scout/scout.py new file mode 100644 index 0000000..8a953ac --- /dev/null +++ b/job_scout/scout.py @@ -0,0 +1,686 @@ +"""Job scout for Dennis's quarterly target companies. + +Pulls latest openings via public ATS/JSON APIs (Workday, Ashby, Greenhouse, PCSX, WP AJAX) or Playwright, +filters by Swiss location or remote eligibility, scores fit against profile keywords, tracks +which job IDs we've already seen, writes a markdown report. + +Usage: + py scout.py # Pull all configured companies (strong + medium only) + py scout.py --only=nvidia # Pull a single company by id + py scout.py --new-only # Report only jobs not seen before + py scout.py --include-weak # Include weak/noise bucket (default hidden) + +State : state/seen_jobs.json +Output: reports/YYYY-MM-DD.md + +To add a company: append to COMPANIES with one of the existing adapter types. +Companies that still resist automation are listed in MANUAL_CHECK; the direct-API probe notes +for each careers site are in the comment block at the bottom of this file. 
+""" + +import json +import re +import sys +import urllib.error +import urllib.parse +import urllib.request +from datetime import datetime, timezone +from pathlib import Path + +ROOT = Path(__file__).parent +STATE_FILE = ROOT / "state" / "seen_jobs.json" +REPORTS_DIR = ROOT / "reports" +USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)" + +CH_LOCATION_KEYWORDS = [ + "switzerland", "zurich", "zürich", "basel", "bern", "geneva", "genf", + "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse", +] + +REMOTE_KEYWORDS = ["remote"] + +US_ONLY_PATTERNS = [ + "remote - us", "remote, us", "remote-us", "us remote", "us-remote", + "remote-friendly us", "remote (us)", "united states - remote", + "remote, united states", +] + +EU_HINT_KEYWORDS = [ + "germany", "france", "spain", "portugal", "ireland", "netherlands", + "sweden", "norway", "finland", "denmark", "poland", "czech", + "romania", "italy", "austria", "belgium", "uk", "united kingdom", + "europe", "emea", "global", +] + CH_LOCATION_KEYWORDS + +POSITIVE_KEYWORDS = { + "genai": 3, "generative ai": 3, "llm": 3, "large language model": 3, + "applied ai": 3, "applied ml": 3, "ai engineer": 3, "ml engineer": 3, + "mlops": 3, "ai platform": 3, "ml platform": 3, + "python": 2, "java": 2, "data engineer": 2, "data engineering": 2, + "solutions architect": 2, "platform engineer": 2, + "ai infrastructure": 2, "inference": 2, "rag": 2, "agentic": 2, + "kubernetes": 1, "docker": 1, "etl": 1, "pipeline": 1, + "crypto": 2, "blockchain": 2, "web3": 2, "solidity": 3, + "senior": 1, "staff": 1, "lead": 1, "principal": 1, +} + +NEGATIVE_KEYWORDS = { + "cuda": -3, "kernel driver": -3, "gpu programming": -3, + "compiler engineer": -3, "pytorch internals": -3, "jax internals": -3, + "rdma": -2, "infiniband": -2, "nccl": -3, "hpc cluster": -2, + "frontend": -3, "front-end": -3, "react native": -3, + "ios engineer": -3, "android engineer": -3, "mobile engineer": -3, + "ui engineer": -2, "ux engineer": -2, + "verilog": -3, "vhdl": 
-3, "asic": -3, "rtl design": -3, + "physical design": -3, "silicon": -2, + "expert c++": -2, "5+ years c++": -2, "deep c++": -2, + "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3, +} + +# id, display, adapter, adapter_args +COMPANIES = [ + ("nvidia", "NVIDIA", "workday", { + "host": "nvidia.wd5.myworkdayjobs.com", + "tenant": "nvidia", + "site": "NVIDIAExternalCareerSite", + "search_text": "Switzerland", + }), + ("kraken", "Kraken", "ashby", {"slug": "kraken.com"}), + ("openai", "OpenAI", "ashby", {"slug": "openai"}), + ("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}), + ("novartis", "Novartis", "workday", { + "host": "novartis.wd3.myworkdayjobs.com", + "tenant": "novartis", + "site": "Novartis_Careers", + "search_text": "Switzerland", + }), + # PCSX (Eightfold) — Microsoft has a public position search endpoint + ("microsoft", "Microsoft", "pcsx", { + "domain": "microsoft.com", + "location": "Switzerland", + }), + # Sygnum — WordPress AJAX endpoint returns clean JSON + ("sygnum", "Sygnum", "wp_ajax", { + "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c", + }), + # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites. + # Google actively bot-detects; the STEALTH_JS init script (applied to every context) + # is what makes its job list render. Cards are
  • with a "Learn more about " + # aria-label link; location lives in the card text (captured via blob mode). + ("google", "Google", "playwright", { + "url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland", + "wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']", + "card": "li:has(a[aria-label*='Learn more about'])", + "title_sel": "a[aria-label*='Learn more about']", + "title_sel_attr": "aria-label", + "title_strip_prefix": "Learn more about ", + "link_sel": "a[href*='jobs/results/']", + "link_attr": "href", + "url_prefix": "https://www.google.com/about/careers/applications/", + "default_location": "", + "scroll_count": 5, + "use_inner_text_as_blob": True, + "cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"], + }), + ("apple", "Apple", "playwright", { + "url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE", + "wait_for": "a[href*='/en-us/details/']", + "card": "a[href*='/en-us/details/']", + "title_attr": "text", + "link_attr": "href", + "url_prefix": "https://jobs.apple.com", + "default_location": "Switzerland", + }), + # Meta job links are /profile/job_details/<id>; title + location are in the link text. + ("meta", "Meta", "playwright", { + "url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland", + "wait_for": "a[href*='/profile/job_details/']", + "card": "a[href*='/profile/job_details/']", + "title_attr": "text", + "link_attr": "href", + "url_prefix": "https://www.metacareers.com", + "default_location": "Zurich, Switzerland", + "scroll_count": 5, + "use_inner_text_as_blob": True, + }), + # PhenomPeople pattern (Roche) uses li.jobs-list-item. + # Card inner text is structured like: "<title> | Location | <city, country> | Category | ..." + # We extract title from first line, full text becomes the "description" so our location + # filter still sees Switzerland mentions. 
+ ("roche", "Roche", "playwright", { + "url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland", + "wait_for": "li.jobs-list-item, a.au-target", + "card": "li.jobs-list-item:not(:has-text('Saved jobs'))", + "title_attr": "text", + "link_sel": "a[href]", + "link_attr": "href", + "url_prefix": "https://careers.roche.com", + "default_location": "", + "cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"], + "scroll_count": 6, + "use_inner_text_as_blob": True, + }), + # Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles. + ("cisco", "Cisco", "playwright", { + "url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland", + "wait_for": "a[href*='/job/'], div[role='listitem']", + "card": "div[role='listitem']:has(a[href*='/job/'])", + "title_sel": "a[href*='/job/']", + "link_sel": "a[href*='/job/']", + "link_attr": "href", + "url_prefix": "https://careers.cisco.com", + "default_location": "Switzerland", + "cookie_accept": ["#onetrust-accept-btn-handler"], + "scroll_count": 5, + "use_inner_text_as_blob": True, + }), + ("ibm", "IBM Research", "playwright", { + # IBM Research Zurich careers page is mostly a static intro with few openings. + # Use IBM's main careers search filtered to Switzerland instead. + "url": "https://www.ibm.com/careers/search?q=&field_keyword_05[0]=Switzerland", + "wait_for": "a[href*='/careers/'], a[href*='ibm.com/employment']", + "card": "li:has(a[href*='/careers/']), a[href*='/careers/']:has(h3)", + "title_sel": "h3, h4", + "link_sel": "a[href*='/careers/']", + "link_attr": "href", + "url_prefix": "https://www.ibm.com", + "default_location": "Switzerland", + "scroll_count": 4, + }), +] + +# Companies where adapter probing did not yield a reliable scrape. Reasons noted. +# These surface as a clickable checklist in the report so they're not forgotten. 
MANUAL_CHECK = [
    ("Sonova", "PhenomPeople serves empty shell to automation (body never renders); widgets API rejects requests",
     "https://careers.sonova.com/us/en/search-results?keywords=Switzerland"),
    ("Coinbase", "/careers/positions 302-redirects to landing; no job links or ATS API exposed even with stealth",
     "https://www.coinbase.com/careers"),
    ("AMINA Bank", "jobs are at /careers/ (#positions) via JS widget; only ~4 apply links, no scrapable list",
     "https://aminagroup.com/careers/#positions"),
    ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
     "https://bitcoinsuisse.com/careers#open-positions"),
]


def http_get_json(url, headers=None, data=None, method="GET"):
    """Request *url* and return the response body decoded as JSON.

    Args:
        url: Absolute URL to request.
        headers: Optional extra request headers. The mapping is copied, so the
            caller's dict is never modified. ``User-Agent`` and ``Accept`` are
            filled in only when the caller did not supply them.
        data: Optional request body. A ``dict`` is serialized to UTF-8 JSON
            (and ``Content-Type: application/json`` is defaulted); any other
            value is handed to ``urllib`` unchanged.
        method: HTTP method name.

    Returns:
        Whatever ``json.loads`` produces for the response body.

    Raises:
        urllib.error.URLError: On transport or HTTP errors (``HTTPError`` is a
            subclass).
        ValueError: If the body is not valid UTF-8 JSON.
    """
    # Copy first: the setdefault calls below must not leak into the caller's dict.
    headers = dict(headers or {})
    headers.setdefault("User-Agent", USER_AGENT)
    headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))


def fetch_workday(args):
    """Page through a Workday CXS job search and return normalized job dicts.

    Args:
        args: Adapter config with ``host``, ``site`` and ``tenant`` (required)
            and an optional ``search_text`` sent as the search query.

    Returns:
        A list of dicts with keys ``id``, ``title``, ``location``, ``url``,
        ``posted`` and ``description`` (always empty for this adapter).
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    jobs, offset, total = [], 0, 0
    while True:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {}, "limit": 20, "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath") or ""
            bullets = p.get("bulletFields")
            jobs.append({
                # First bullet field when present, otherwise the posting path.
                "id": bullets[0] if bullets else ext,
                "title": p.get("title", ""),
                # The path is appended so location keywords in the URL also match.
                "location": (p.get("locationsText") or "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # Remember the largest total seen: a later page that omits `total` or
        # reports 0 must not end pagination early. (Some tenants reportedly
        # populate it only on the first page - not verified here.)
        total = max(total, data.get("total") or 0)
        offset += len(postings)
        if not postings or offset >= total:
            break
    return jobs


def fetch_ashby(args):
    """Fetch every posting from an Ashby public job board.

    Args:
        args: Adapter config with the board ``slug``.

    Returns:
        A list of job dicts; ``location`` joins the primary and secondary
        locations with ``" | "`` and ``description`` is capped at 2500 chars.
    """
    slug = args["slug"]
    url = f"https://api.ashbyhq.com/posting-api/job-board/{slug}?includeCompensation=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs") or []:
        secondary = j.get("secondaryLocations") or []
        # Entries are handled both as {"location": ...} dicts and as bare values.
        sec_names = [
            (s.get("location") or "") if isinstance(s, dict) else str(s)
            for s in secondary
        ]
        loc_blob = " | ".join([j.get("location") or ""] + sec_names)
        jobs.append({
            "id": j.get("id"),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("jobUrl"),
            "posted": j.get("publishedAt", ""),
            "description": (j.get("descriptionPlain") or "")[:2500],
            "department": j.get("department", ""),
        })
    return jobs


def fetch_greenhouse(args):
    """Fetch every posting (with content) from a Greenhouse job board.

    Args:
        args: Adapter config with the ``board`` token.

    Returns:
        A list of job dicts; ``description`` is the posting content reduced to
        plain text and capped at 2500 chars.
    """
    import html  # local import: only this adapter needs it

    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs") or []:
        loc = (j.get("location") or {}).get("name", "")
        offices = j.get("offices") or []
        office_names = " | ".join(o.get("name") or "" for o in offices if isinstance(o, dict))
        loc_blob = " ".join(x for x in [loc, office_names] if x)
        # The content may arrive entity-escaped (&lt;p&gt;...); unescape first so
        # the tag stripper sees real tags. A no-op for already-plain HTML.
        desc = html.unescape(j.get("content") or "")
        desc = re.sub(r"<[^>]+>", " ", desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            "description": desc[:2500],
        })
    return jobs


def fetch_pcsx(args):
    """Eightfold PCSX search API. Microsoft uses apply.careers.microsoft.com.

    The same endpoint pattern is used by other PCS-hosted boards.

    Args:
        args: Adapter config with ``domain`` (required) and an optional
            ``location`` filter.

    Returns:
        A list of job dicts. Paging stops on a short or empty page, or once
        500 results have been requested.
    """
    page_size, max_results = 50, 500
    domain_q = urllib.parse.quote(args["domain"])
    location_q = urllib.parse.quote(args.get("location", ""))
    base = "https://apply.careers.microsoft.com/api/pcsx/search"
    # Loop-invariant: the same Referer is sent with every page request.
    headers = {"Referer": f"https://apply.careers.microsoft.com/careers?location={location_q}"}
    jobs, start = [], 0
    while True:
        url = f"{base}?domain={domain_q}&query=&location={location_q}&start={start}&num={page_size}"
        data = http_get_json(url, headers=headers)
        positions = (data.get("data") or {}).get("positions") or []
        for p in positions:
            locs = p.get("locations") or []
            jobs.append({
                "id": str(p.get("id")),
                "title": p.get("name", ""),
                "location": " | ".join(str(loc) for loc in locs),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{p.get('displayJobId') or p.get('id')}",
                "posted": p.get("postedTs", ""),
                "description": (p.get("description") or "")[:2000],
            })
        if len(positions) < page_size:  # also covers the empty page
            break
        start += len(positions)
        if start >= max_results:
            break
    return jobs


def fetch_wp_ajax(args):
    """WordPress admin-ajax style endpoint. Sygnum uses this pattern.

    Args:
        args: Adapter config with the full endpoint ``url``.

    Returns:
        A list of job dicts, or ``[]`` when the endpoint does not return a
        JSON list. Rows that are not objects are skipped and null fields are
        treated as empty strings.
    """
    url = args["url"]
    data = http_get_json(url)
    if not isinstance(data, list):
        return []
    jobs = []
    for j in data:
        if not isinstance(j, dict):
            continue
        title = j.get("title") or ""
        location = j.get("location") or ""
        jobs.append({
            # The rows carry no id field that this code reads; title+location is the key.
            "id": (title + "|" + location)[:120],
            "title": title,
            "location": " ".join(filter(None, [location, j.get("work_type") or ""])),
            "url": j.get("application_url") or url,
            "posted": "",
            "description": " ".join(filter(None, [j.get("department") or "", j.get("role_type") or ""])),
        })
    return jobs


# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# The original permissions.query is re-invoked with .call(navigator.permissions, p):
# calling the detached native method without its receiver throws instead of
# answering non-notification permission queries.
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _q = navigator.permissions && navigator.permissions.query;
if (_q) {
  navigator.permissions.query = (p) => p && p.name === 'notifications'
    ? Promise.resolve({state: Notification.permission}) : _q.call(navigator.permissions, p);
}
"""

# Process-wide Playwright driver and browser handles, created on first use.
_playwright_singleton = {"pw": None, "browser": None}


def _get_browser():
    """Lazy-init a single shared headless browser. Saves ~3s per company.

    Returns:
        The shared Chromium browser handle.

    Raises:
        RuntimeError: If the ``playwright`` package is not installed.
    """
    if _playwright_singleton["browser"] is not None:
        return _playwright_singleton["browser"]
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
    pw = sync_playwright().start()
    try:
        browser = pw.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
    except Exception:
        # Don't leave the driver running when the browser itself fails to start.
        pw.stop()
        raise
    _playwright_singleton["pw"] = pw
    _playwright_singleton["browser"] = browser
    return browser


def _absolutize(href, prefix):
    """Join a possibly-relative href with the configured prefix.

    Absolute http(s) URLs and empty values are returned unchanged, and
    protocol-relative ``//host/path`` hrefs get an ``https:`` scheme. Anything
    else has its leading dots and slashes removed and is appended to *prefix*
    (or returned as-is when no prefix is configured).
    """
    if not href or href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        return "https:" + href
    if not prefix:
        return href
    # lstrip takes a character set: any leading run of "." and "/" is dropped.
    return prefix.rstrip("/") + "/" + href.lstrip("./")


def _close_browser():
    """Shut down the shared browser and Playwright driver, if they were started.

    Shutdown is best-effort: errors are ignored because the process is about
    to exit anyway. The singleton is reset so a later ``_get_browser`` call
    starts fresh instead of returning a closed browser.
    """
    browser = _playwright_singleton["browser"]
    pw = _playwright_singleton["pw"]
    _playwright_singleton["browser"] = None
    _playwright_singleton["pw"] = None
    if browser:
        try:
            browser.close()
        except Exception:
            pass
    if pw:
        try:
            pw.stop()
        except Exception:
            pass


def _first_line(text):
    """Return the first line of *text*, stripped and capped at 200 chars."""
    return (text or "").strip().split("\n", 1)[0][:200]


def _card_title(card, args):
    """Extract a job title from one result card using the configured strategy."""
    title = ""
    title_attr = args.get("title_attr")
    if title_attr == "text":
        title = _first_line(card.inner_text())
    elif title_attr:
        title = (card.get_attribute(title_attr) or "").strip()
    elif args.get("title_sel"):
        node = card.locator(args["title_sel"]).first
        if node.count():
            # Read either an attribute (e.g. aria-label) or the inner text
            if args.get("title_sel_attr"):
                title = (node.get_attribute(args["title_sel_attr"]) or "").strip()
            else:
                title = (node.inner_text() or "").strip()
    prefix = args.get("title_strip_prefix")
    if prefix and title.startswith(prefix):
        title = title[len(prefix):].strip()
    # Last resort: the first line of the card text.
    return title or _first_line(card.inner_text())


def _dedupe_by_id(jobs):
    """Return *jobs* with later duplicates (same ``id``) removed, order kept."""
    seen, unique = set(), []
    for job in jobs:
        if job["id"] in seen:
            continue
        seen.add(job["id"])
        unique.append(job)
    return unique


def fetch_playwright(args):
    """Generic headless-browser scraper. See COMPANIES entries for selector args.

    Args:
        args: Adapter config. ``url`` and ``card`` are required. Optional keys:
            ``wait_for``, ``cookie_accept``, ``scroll_count``, ``max_cards``,
            ``title_attr``, ``title_sel``, ``title_sel_attr``,
            ``title_strip_prefix``, ``location_sel``, ``default_location``,
            ``link_sel``, ``link_attr``, ``url_prefix`` and
            ``use_inner_text_as_blob``.

    Returns:
        Job dicts deduplicated by id. A card that raises while being parsed is
        skipped rather than failing the whole company.
    """
    browser = _get_browser()
    ctx = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        locale="en-US",
        viewport={"width": 1366, "height": 768},
    )
    jobs = []
    try:
        # Inside the try so the context is closed even if setup fails.
        ctx.add_init_script(STEALTH_JS)
        page = ctx.new_page()
        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")
        # Optional cookie banner acceptance
        for sel in args.get("cookie_accept") or []:
            try:
                btn = page.locator(sel).first
                if btn.is_visible(timeout=2000):
                    btn.click()
                    page.wait_for_timeout(500)
            except Exception:
                pass  # banner absent or not clickable: carry on without it
        # Wait for job content to render
        wait_for = args.get("wait_for")
        if wait_for:
            try:
                page.wait_for_selector(wait_for, timeout=15000)
            except Exception:
                # Selector never appeared; give the page a little longer anyway.
                page.wait_for_timeout(4000)
        # Scroll a few times to trigger any lazy-loaded results
        for _ in range(args.get("scroll_count", 3)):
            try:
                page.mouse.wheel(0, 4000)
                page.wait_for_timeout(700)
            except Exception:
                break

        cards = page.locator(args["card"])
        n = min(cards.count(), args.get("max_cards", 150))
        for i in range(n):
            card = cards.nth(i)
            try:
                title = _card_title(card, args)

                location = args.get("default_location", "")
                if args.get("location_sel"):
                    loc_el = card.locator(args["location_sel"]).first
                    if loc_el.count():
                        location = (loc_el.inner_text() or location).strip()

                # Without link_sel the card itself is the link element.
                link_el = card.locator(args["link_sel"]).first if args.get("link_sel") else card
                href = ""
                if link_el.count():
                    href = link_el.get_attribute(args.get("link_attr", "href")) or ""
                href = _absolutize(href, args.get("url_prefix", ""))

                if not title:
                    continue
                description = ""
                if args.get("use_inner_text_as_blob"):
                    # Use the full card text as both location source and description
                    full = card.inner_text() or ""
                    description = full[:2000]
                    if not location:
                        location = full[:300]
                jobs.append({
                    "id": href or f"{args['url']}#{i}",
                    "title": title,
                    "location": location,
                    "url": href or args["url"],
                    "posted": "",
                    "description": description,
                })
            except Exception:
                continue  # best-effort: one broken card must not drop the rest
    finally:
        ctx.close()

    # Deduplicate within a single company by id
    return _dedupe_by_id(jobs)


ADAPTERS = {
    "workday": fetch_workday,
    "ashby": fetch_ashby,
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "playwright": fetch_playwright,
}


def location_matches(loc_text):
    """Classify a location string.

    Matching is case-insensitive substring search against the module keyword
    lists, so short keywords can also hit inside longer words.

    Returns:
        ``(in_ch, is_remote)``: whether a Swiss keyword is present, and whether
        the text mentions remote work that is not a US-only listing and carries
        at least one EU/global hint.
    """
    if not loc_text:
        return False, False
    low = loc_text.lower()
    in_ch = any(k in low for k in CH_LOCATION_KEYWORDS)
    has_remote = any(k in low for k in REMOTE_KEYWORDS)
    # A US-only pattern is overridden by an explicit Swiss location.
    is_us_only = any(p in low for p in US_ONLY_PATTERNS) and not in_ch
    has_eu_hint = any(k in low for k in EU_HINT_KEYWORDS)
    # Count as remote-eligible only if it isn't a US-only remote listing
    # and it has at least one EU/global hint
    is_remote = has_remote and not is_us_only and has_eu_hint
    return in_ch, is_remote


# Compiled keyword patterns, filled lazily by _keyword_pattern.
_KEYWORD_PATTERNS = {}


def _keyword_pattern(keyword):
    """Return a compiled whole-word pattern for *keyword* (cached)."""
    pattern = _KEYWORD_PATTERNS.get(keyword)
    if pattern is None:
        # Lookarounds instead of \b so keywords ending in punctuation ("c++")
        # still match; an optional plural "s" keeps "llms"/"interns" matching.
        body = r"\s+".join(re.escape(part) for part in keyword.split())
        pattern = re.compile(r"(?<!\w)" + body + r"s?(?!\w)")
        _KEYWORD_PATTERNS[keyword] = pattern
    return pattern


def score_job(job):
    """Score a job's title + description against the profile keyword weights.

    Keywords match as whole words or phrases, not raw substrings, so "intern"
    no longer penalizes "internal" or "international" and "asic" no longer
    hits "basic".

    Returns:
        ``(score, pos, neg)``: the summed weights and the positive and negative
        keywords that matched, as spelled in the keyword tables.
    """
    blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
    score, pos, neg = 0, [], []
    for table, hits in ((POSITIVE_KEYWORDS, pos), (NEGATIVE_KEYWORDS, neg)):
        for kw, weight in table.items():
            if _keyword_pattern(kw).search(blob):
                score += weight
                hits.append(kw)
    return score, pos, neg


def load_seen():
    """Return the seen-jobs state from STATE_FILE, or ``{}`` if it doesn't exist."""
    if STATE_FILE.exists():
        return json.loads(STATE_FILE.read_text(encoding="utf-8"))
    return {}


def save_seen(seen):
    """Write the seen-jobs state to STATE_FILE, creating its directory if needed."""
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    STATE_FILE.write_text(json.dumps(seen, indent=2, ensure_ascii=False), encoding="utf-8")


def write_report(path, results, errors, new_only, include_weak):
    """Render the markdown report to *path*.

    Args:
        path: Destination file (a ``pathlib.Path``).
        results: Scored job dicts as built by ``main``.
        errors: ``(company, message)`` pairs for adapters that failed.
        new_only: Whether *results* was filtered to new jobs (title note only).
        include_weak: Whether to list the score < 2 bucket instead of hiding it.
    """
    # UTC, to agree with the date main() puts in the report filename.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Total matches from automated companies: **{len(results)}** ({n_new} new since last run)\n",
    ]
    if errors:
        lines.append("## Errors\n")
        lines.extend(f"- **{company}**: {err}" for company, err in errors)
        lines.append("")

    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak = [r for r in results if r["score"] < 2]

    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")

    buckets = [("Strong fit (score >= 6)", strong),
               ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))

    for bucket_name, bucket in buckets:
        if not bucket:
            continue
        lines.append(f"\n## {bucket_name} - {len(bucket)} role(s)\n")
        for r in bucket:
            new_tag = " [NEW]" if r["is_new"] else ""
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    lines.append("\n## Manual check (companies without scrapable APIs)\n")
    lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
    lines.append("Open each link, scan for new postings since your last quarterly review:\n")
    for name, note, url in MANUAL_CHECK:
        lines.append(f"- [ ] **{name}** — {note}: <{url}>")
    lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")


def main():
    """Fetch, filter, score and report jobs for the configured companies.

    Flags read from ``sys.argv``: ``--new-only``, ``--include-weak`` and
    ``--only=<company id>``. Adapter failures are collected into the report
    instead of aborting the run.
    """
    only, new_only, include_weak = None, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]

    seen = load_seen()
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    all_results, errors = [], []

    selected = [c for c in COMPANIES if not only or c[0] == only]
    if only and not selected:
        # A typo in --only would otherwise just produce an empty report.
        known = ", ".join(c[0] for c in COMPANIES)
        print(f"No company with id {only!r}; known ids: {known}", file=sys.stderr)

    try:
        for cid, display, adapter, args in selected:
            print(f"Fetching {display}...", file=sys.stderr)
            try:
                jobs = ADAPTERS[adapter](args)
            except (urllib.error.URLError, urllib.error.HTTPError, ValueError) as e:
                errors.append((display, repr(e)))
                continue
            except Exception as e:
                errors.append((display, f"unexpected: {e!r}"))
                continue

            company_seen = seen.setdefault(cid, {})
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                in_ch, is_remote = location_matches(j.get("location", ""))
                if not (in_ch or is_remote):
                    continue
                is_new = jid not in company_seen
                score, pos, neg = score_job(j)
                all_results.append({
                    "company": display, "company_id": cid,
                    "title": j["title"], "location": j["location"],
                    "url": j["url"], "posted": j.get("posted", ""),
                    "score": score, "pos": pos, "neg": neg,
                    "in_ch": in_ch, "remote": is_remote, "is_new": is_new,
                })
                if is_new:
                    # Record only on first sight so first_seen keeps its
                    # original date instead of being reset on every run.
                    company_seen[jid] = {"title": j["title"], "first_seen": today}
    finally:
        # Release the headless browser even if processing fails part-way.
        _close_browser()

    save_seen(seen)

    if new_only:
        all_results = [r for r in all_results if r["is_new"]]

    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak)

    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new)", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)


# === Adapter probe results (2026-05-21) =======================================
# Notes from probing each target company's careers site for a directly callable
# API. A "no usable API" finding below does not mean the company is unautomated:
# COMPANIES above covers several of them through the playwright, pcsx and
# wp_ajax adapters. Only the entries in MANUAL_CHECK still need a manual look.
#
# Google          careers.google.com           404 on documented API; auth-gated
# Microsoft       gcsservices.careers.ms.com   TLS handshake hangs from non-MS clients
# Apple           jobs.apple.com/api/v1        endpoint exists, location filter codes opaque
# Meta            metacareers.com              GraphQL with auth token
# Roche           careers.roche.com            PhenomPeople/Eightfold, JS-rendered
# IBM Research    research.ibm.com             static page, no API
# Cisco           jobs.cisco.com               JS-rendered SPA
# Sonova          careers.sonova.com           PhenomPeople SaaS, no public JSON
# Sygnum          sygnum.com/careers           Cloudflare-protected
# AMINA           aminagroup.com/career        static, low volume
# Bitcoin Suisse  bitcoinsuisse.com/careers    static, low volume
# Coinbase        coinbase.com/careers         Cloudflare-protected
# ==============================================================================


if __name__ == "__main__":
    main()