"""Job scout for Dennis's quarterly target companies. Pulls latest openings from companies with known public ATS APIs (Workday/Ashby/Greenhouse), filters by Swiss location or remote eligibility, scores fit against profile keywords, tracks which job IDs we've already seen, writes a markdown report. Usage: py scout.py # Pull all configured companies (strong + medium only) py scout.py --only=nvidia # Pull a single company by id py scout.py --new-only # Report only jobs not seen before py scout.py --include-weak # Include weak/noise bucket (default hidden) State : state/seen_jobs.json Output: reports/YYYY-MM-DD.md To add a company: append to COMPANIES with one of the existing adapter types. For companies behind custom careers sites (Google, MS, Meta, Apple, Roche, Novartis, IBM, Cisco, Sonova, Sygnum) — see TODO_ADAPTERS at the bottom. """ import json import re import sys import urllib.error import urllib.parse import urllib.request from datetime import datetime, timezone from pathlib import Path ROOT = Path(__file__).parent STATE_FILE = ROOT / "state" / "seen_jobs.json" REPORTS_DIR = ROOT / "reports" USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)" CH_LOCATION_KEYWORDS = [ "switzerland", "zurich", "zürich", "basel", "bern", "geneva", "genf", "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse", ] REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"] US_ONLY_PATTERNS = [ "remote - us", "remote, us", "remote-us", "us remote", "us-remote", "remote-friendly us", "remote (us)", "united states - remote", "remote, united states", ] EU_HINT_KEYWORDS = [ "germany", "france", "spain", "portugal", "ireland", "netherlands", "sweden", "norway", "finland", "denmark", "poland", "czech", "romania", "italy", "austria", "belgium", "uk", "united kingdom", "europe", "emea", "global", "worldwide", ] + CH_LOCATION_KEYWORDS POSITIVE_KEYWORDS = { "genai": 3, "generative ai": 3, "llm": 3, "large language model": 3, "applied ai": 3, "applied ml": 3, 
"ai engineer": 3, "ml engineer": 3, "mlops": 3, "ai platform": 3, "ml platform": 3, "python": 2, "java": 2, "data engineer": 2, "data engineering": 2, "solutions architect": 2, "platform engineer": 2, "ai infrastructure": 2, "inference": 2, "rag": 2, "agentic": 2, "kubernetes": 1, "docker": 1, "etl": 1, "pipeline": 1, "crypto": 2, "blockchain": 2, "web3": 2, "solidity": 3, "senior": 1, "staff": 1, "lead": 1, "principal": 1, } NEGATIVE_KEYWORDS = { "cuda": -3, "kernel driver": -3, "gpu programming": -3, "compiler engineer": -3, "pytorch internals": -3, "jax internals": -3, "rdma": -2, "infiniband": -2, "nccl": -3, "hpc cluster": -2, "frontend": -3, "front-end": -3, "react native": -3, "ios engineer": -3, "android engineer": -3, "mobile engineer": -3, "ui engineer": -2, "ux engineer": -2, "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3, "physical design": -3, "silicon": -2, "expert c++": -2, "5+ years c++": -2, "deep c++": -2, "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3, } # Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that # post mostly non-tech roles). Only keep titles containing one of these specific role # phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel" # don't leak in. Matched as case-insensitive substrings against the title only. 
ENG_TITLE_FILTER = [
    "data engineer", "data engineering", "data platform", "platform engineer",
    "data infrastructure", "data architect", "analytics engineer",
    "mlops", "ml engineer", "ml platform", "machine learning engineer",
    "site reliability", "sre", "backend engineer", "back-end engineer",
    "devops engineer", "cloud engineer", "software engineer",
    "infrastructure engineer", "kafka", "streaming", "big data",
    "quantitative developer", "quant developer",
]

# id, display, adapter, adapter_args
# - id:           matched against --only=<id> and used as the key in the seen-jobs state
# - display:      company name shown in the report
# - adapter:      key into ADAPTERS
# - adapter_args: dict passed to the adapter; an optional "_title_filter" entry
#                 is read by main() to prefilter jobs by title
COMPANIES = [
    ("nvidia", "NVIDIA", "workday", {
        "host": "nvidia.wd5.myworkdayjobs.com",
        "tenant": "nvidia",
        "site": "NVIDIAExternalCareerSite",
        "search_text": "Switzerland",
    }),
    ("kraken", "Kraken", "ashby", {"slug": "kraken.com"}),
    ("openai", "OpenAI", "ashby", {"slug": "openai"}),
    ("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
    ("novartis", "Novartis", "workday", {
        "host": "novartis.wd3.myworkdayjobs.com",
        "tenant": "novartis",
        "site": "Novartis_Careers",
        "search_text": "Switzerland",
    }),
    # PCSX (Eightfold) — Microsoft has a public position search endpoint
    ("microsoft", "Microsoft", "pcsx", {
        "domain": "microsoft.com",
        "location": "Switzerland",
    }),
    # Sygnum — WordPress AJAX endpoint returns clean JSON
    # NOTE(review): the _wpnonce value is hard-coded in the URL. WordPress nonces
    # are normally time-limited, so this endpoint may start rejecting requests —
    # confirm, and refresh the nonce from the careers page if Sygnum shows up
    # under "Errors" in the report.
    ("sygnum", "Sygnum", "wp_ajax", {
        "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
    }),
    # --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
    ("confluent", "Confluent", "ashby", {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
    ("gitlab", "GitLab", "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
    ("clickhouse", "ClickHouse", "greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}),
    ("grafana", "Grafana Labs", "greenhouse", {"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
    # --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
    ("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
    ("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
    ("ldc", "Louis Dreyfus", "smartrecruiters", {"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> elements with a
    # "Learn more about <title>" aria-label link; location lives in the card text
    # (captured via blob mode).
    ("google", "Google", "playwright", {
        "url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
        "wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
        "card": "li:has(a[aria-label*='Learn more about'])",
        "title_sel": "a[aria-label*='Learn more about']",
        "title_sel_attr": "aria-label",
        "title_strip_prefix": "Learn more about ",
        "link_sel": "a[href*='jobs/results/']",
        "link_attr": "href",
        "url_prefix": "https://www.google.com/about/careers/applications/",
        "default_location": "",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
        "cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
    }),
    # Apple: each card is the job link itself, so no separate link_sel is configured.
    ("apple", "Apple", "playwright", {
        "url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
        "wait_for": "a[href*='/en-us/details/']",
        "card": "a[href*='/en-us/details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://jobs.apple.com",
        "default_location": "Switzerland",
    }),
    # Meta job links are /profile/job_details/<id>; title + location are in the link text.
    ("meta", "Meta", "playwright", {
        "url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
        "wait_for": "a[href*='/profile/job_details/']",
        "card": "a[href*='/profile/job_details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://www.metacareers.com",
        "default_location": "Zurich, Switzerland",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
    # PhenomPeople pattern (Roche) uses li.jobs-list-item.
    # Card inner text is structured like: "<title> | Location | <city, country> | Category | ..."
    # We extract title from first line, full text becomes the "description" so our location
    # filter still sees Switzerland mentions.
    ("roche", "Roche", "playwright", {
        "url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
        "wait_for": "li.jobs-list-item, a.au-target",
        "card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
        "title_attr": "text",
        "link_sel": "a[href]",
        "link_attr": "href",
        "url_prefix": "https://careers.roche.com",
        "default_location": "",
        "cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
        "scroll_count": 6,
        "use_inner_text_as_blob": True,
    }),
    # Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
    ("cisco", "Cisco", "playwright", {
        "url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
        "wait_for": "a[href*='/job/'], div[role='listitem']",
        "card": "div[role='listitem']:has(a[href*='/job/'])",
        "title_sel": "a[href*='/job/']",
        "link_sel": "a[href*='/job/']",
        "link_attr": "href",
        "url_prefix": "https://careers.cisco.com",
        "default_location": "Switzerland",
        "cookie_accept": ["#onetrust-accept-btn-handler"],
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
]

# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
# These surface as a clickable checklist in the report so they're not forgotten.
# Each entry is (company name, why it is not automated, URL to open by hand).
MANUAL_CHECK = [
    (
        "Sonova",
        "PhenomPeople serves empty shell to automation (body never renders); widgets API rejects requests",
        "https://careers.sonova.com/us/en/search-results?keywords=Switzerland",
    ),
    (
        "Coinbase",
        "/careers/positions 302-redirects to landing; no job links or ATS API exposed even with stealth",
        "https://www.coinbase.com/careers",
    ),
    (
        "AMINA Bank",
        "jobs are at /careers/ (#positions) via JS widget; only ~4 apply links, no scrapable list",
        "https://aminagroup.com/careers/#positions",
    ),
    (
        "Bitcoin Suisse",
        "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
        "https://bitcoinsuisse.com/careers#open-positions",
    ),
]

# Hoisted so they are compiled once rather than per job.
_TAG_RE = re.compile(r"<[^>]+>")
_WS_RE = re.compile(r"\s+")

# Cache of compiled whole-word patterns used by score_job, keyed by keyword.
_KEYWORD_PATTERNS = {}


def http_get_json(url, headers=None, data=None, method="GET"):
    """Request ``url`` and return the response body parsed as JSON.

    Args:
        url: URL to request.
        headers: Optional extra request headers. The mapping is copied, so the
            caller's dict is never modified. ``User-Agent`` and ``Accept``
            defaults are added when absent.
        data: Optional request body. A dict is JSON-encoded and sent with a
            JSON ``Content-Type``; anything else is passed to urllib as-is.
        method: HTTP method name.

    Raises:
        urllib.error.URLError / urllib.error.HTTPError: on transport or HTTP
            failure.
        ValueError: if the body is not valid UTF-8 JSON.
    """
    # Copy first: the original code called setdefault() on the caller's dict,
    # leaking our defaults into a dict the caller may reuse.
    headers = dict(headers or {})
    headers.setdefault("User-Agent", USER_AGENT)
    headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))


def fetch_workday(args):
    """Fetch all postings from a Workday CXS job search endpoint.

    Args:
        args: dict with ``host``, ``tenant`` and ``site`` (used to build the
            endpoint URL) and an optional ``search_text``.

    Returns:
        A list of job dicts with keys id, title, location, url, posted,
        description (description is always empty for this adapter).
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    jobs, offset, total = [], 0, 0
    while True:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {},
            "limit": 20,
            "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath", "")
            bullets = p.get("bulletFields")
            jobs.append({
                # First bullet field when present, otherwise the posting path.
                "id": bullets[0] if bullets else ext,
                "title": p.get("title", ""),
                # The path is appended so location keywords in the URL slug
                # are visible to the location filter too.
                "location": p.get("locationsText", "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # Keep the last non-zero total. NOTE(review): some Workday tenants
        # appear to report `total` only on the first page (0 afterwards), which
        # made the old `offset >= total` check stop after the second page.
        total = data.get("total") or total
        offset += len(postings)
        if not postings or offset >= total:
            break
    return jobs


def fetch_ashby(args):
    """Fetch postings from the Ashby job-board posting API for ``args["slug"]``.

    Returns a list of job dicts (id, title, location, url, posted, description,
    department). The location string joins the primary and all secondary
    locations with " | "; the description is truncated to 2500 characters.
    """
    slug = args["slug"]
    url = f"https://api.ashbyhq.com/posting-api/job-board/{slug}?includeCompensation=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs") or []:
        secondary = j.get("secondaryLocations") or []
        # `or ""` guards against explicit nulls, which would break str.join.
        sec_names = [
            (s.get("location") or "") if isinstance(s, dict) else str(s)
            for s in secondary
        ]
        loc_blob = " | ".join([j.get("location") or ""] + sec_names)
        jobs.append({
            "id": j.get("id"),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("jobUrl"),
            "posted": j.get("publishedAt", ""),
            "description": (j.get("descriptionPlain") or "")[:2500],
            "department": j.get("department", ""),
        })
    return jobs


def _html_to_text(raw):
    """Reduce an HTML fragment (plain or entity-escaped) to single-spaced text."""
    import html  # function-scope import: the module header does not import html

    # First unescape turns escaped markup ("&lt;p&gt;") back into real tags so
    # the tag regex can remove them; the second decodes entities left in the
    # remaining text (e.g. "&amp;", "&nbsp;").
    text = _TAG_RE.sub(" ", html.unescape(raw or ""))
    return _WS_RE.sub(" ", html.unescape(text)).strip()


def fetch_greenhouse(args):
    """Fetch postings from the Greenhouse job-board API for ``args["board"]``.

    Returns a list of job dicts (id, title, location, url, posted,
    description). The description is the posting content reduced to plain text
    and truncated to 2500 characters.
    """
    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs") or []:
        loc = (j.get("location") or {}).get("name", "")
        offices = j.get("offices") or []
        office_names = " | ".join(o.get("name") or "" for o in offices if isinstance(o, dict))
        loc_blob = " ".join(x for x in [loc, office_names] if x)
        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            # Previously tags were stripped without unescaping first, so
            # entity-escaped markup stayed in the description and ate into the
            # 2500-character budget.
            "description": _html_to_text(j.get("content"))[:2500],
        })
    return jobs


def fetch_pcsx(args):
    """Eightfold PCSX search API. Microsoft uses apply.careers.microsoft.com.

    The same endpoint pattern is used by other PCS-hosted boards, but the host
    is currently hard-coded to Microsoft's. ``args`` provides ``domain`` and an
    optional ``location``. Pages of 50 are requested until a short page is
    returned or 500 results have been read.
    """
    domain = urllib.parse.quote(args["domain"])
    location = urllib.parse.quote(args.get("location", ""))
    base = "https://apply.careers.microsoft.com/api/pcsx/search"
    referer = f"https://apply.careers.microsoft.com/careers?location={location}"
    jobs, start = [], 0
    while True:
        url = f"{base}?domain={domain}&query=&location={location}&start={start}&num=50"
        data = http_get_json(url, headers={"Referer": referer})
        positions = (data.get("data") or {}).get("positions") or []
        for p in positions:
            locs = p.get("locations") or []
            jobs.append({
                "id": str(p.get("id")),
                "title": p.get("name", ""),
                "location": " | ".join(locs),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{p.get('displayJobId') or p.get('id')}",
                "posted": p.get("postedTs", ""),
                "description": (p.get("description") or "")[:2000],
            })
        # A short (or empty) page means there is nothing further to fetch.
        if len(positions) < 50:
            break
        start += len(positions)
        if start >= 500:  # hard cap on results read per run
            break
    return jobs


def _label_of(value):
    """Return ``value["label"]`` when value is a dict, else an empty string."""
    return value.get("label", "") if isinstance(value, dict) else ""


def fetch_smartrecruiters(args):
    """SmartRecruiters public postings API. Used by many EU energy/commodity firms.

    ``args["company"]`` is the SmartRecruiters company identifier. Pages of
    100 are read until the reported total is reached or 300 postings have
    been read. The description holds the department and function labels.
    """
    company = args["company"]
    base = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"
    jobs, offset = [], 0
    while True:
        data = http_get_json(f"{base}?limit=100&offset={offset}")
        content = data.get("content") or []
        for p in content:
            loc = p.get("location") or {}
            parts = [loc.get("fullLocation") or loc.get("city") or ""]
            if loc.get("remote"):
                parts.append("Remote")
            if loc.get("hybrid"):
                parts.append("Hybrid")
            jobs.append({
                "id": str(p.get("id")),
                "title": p.get("name", ""),
                "location": " ".join(x for x in parts if x),
                "url": f"https://jobs.smartrecruiters.com/{company}/{p.get('id')}",
                "posted": p.get("releasedDate", ""),
                "description": " ".join(
                    filter(None, [_label_of(p.get("department")), _label_of(p.get("function"))])
                ),
            })
        total = data.get("totalFound", 0)
        offset += len(content)
        if not content or offset >= total or offset >= 300:
            break
    return jobs


def fetch_wp_ajax(args):
    """WordPress admin-ajax style endpoint. Sygnum uses this pattern.

    Expects ``args["url"]`` to return a JSON list of objects; any other
    payload shape yields an empty list. The endpoint exposes no job id, so one
    is derived from "title|location" (truncated to 120 characters).
    """
    url = args["url"]
    data = http_get_json(url)
    if not isinstance(data, list):
        return []
    jobs = []
    for j in data:
        if not isinstance(j, dict):
            continue
        # `or ""` guards against null fields, which would break concatenation.
        title = j.get("title") or ""
        location = j.get("location") or ""
        jobs.append({
            "id": (title + "|" + location)[:120],
            "title": title,
            "location": " ".join(filter(None, [location, j.get("work_type", "")])),
            "url": j.get("application_url") or url,
            "posted": "",
            "description": " ".join(filter(None, [j.get("department", ""), j.get("role_type", "")])),
        })
    return jobs


# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# The original permissions.query is bound to navigator.permissions before being
# wrapped: calling the detached native method throws "Illegal invocation".
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _perms = navigator.permissions;
const _q = _perms && _perms.query && _perms.query.bind(_perms);
if (_q) {
  _perms.query = (p) => p && p.name === 'notifications'
    ? Promise.resolve({state: Notification.permission})
    : _q(p);
}
"""

# Holds the shared Playwright driver and browser; populated by _get_browser().
_playwright_singleton = {"pw": None, "browser": None}


def _get_browser():
    """Lazy-init a single shared headless browser. Saves ~3s per company.

    Raises:
        RuntimeError: if the playwright package is not installed.
    """
    if _playwright_singleton["browser"] is not None:
        return _playwright_singleton["browser"]
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
    pw = sync_playwright().start()
    browser = pw.chromium.launch(
        headless=True,
        args=["--disable-blink-features=AutomationControlled"],
    )
    _playwright_singleton["pw"] = pw
    _playwright_singleton["browser"] = browser
    return browser


def _absolutize(href, prefix):
    """Join a possibly-relative href with the configured prefix.

    Empty hrefs and hrefs starting with "http" are returned unchanged, as is
    any href when no prefix is configured. Otherwise every leading "." and "/"
    character is removed from the href (so "./x", "/x" and "../x" all become
    "x") and the remainder is appended to the prefix with a single "/".
    """
    if not href or href.startswith("http") or not prefix:
        return href
    # str.lstrip takes a character set, not a prefix.
    return prefix.rstrip("/") + "/" + href.lstrip("./")


def _close_browser():
    """Shut down the shared browser and Playwright driver, if they were started.

    Shutdown errors are ignored (best effort). The singleton is reset so a
    later _get_browser() launches a fresh browser instead of returning a
    closed one.
    """
    browser = _playwright_singleton["browser"]
    pw = _playwright_singleton["pw"]
    _playwright_singleton["browser"] = None
    _playwright_singleton["pw"] = None
    if browser:
        try:
            browser.close()
        except Exception:
            pass
    if pw:
        try:
            pw.stop()
        except Exception:
            pass


def _first_line(text):
    """Return the first line of the stripped text, capped at 200 characters."""
    return (text or "").strip().split("\n", 1)[0][:200]


def _card_title(card, args):
    """Extract a job title from a card locator according to the selector args."""
    title = ""
    title_attr = args.get("title_attr")
    if title_attr == "text":
        title = _first_line(card.inner_text())
    elif title_attr:
        title = (card.get_attribute(title_attr) or "").strip()
    elif args.get("title_sel"):
        target = card.locator(args["title_sel"]).first
        if target.count():
            # Read either an attribute (e.g. aria-label) or the inner text
            if args.get("title_sel_attr"):
                title = (target.get_attribute(args["title_sel_attr"]) or "").strip()
            else:
                title = (target.inner_text() or "").strip()
    prefix = args.get("title_strip_prefix")
    if prefix and title.startswith(prefix):
        title = title[len(prefix):].strip()
    # Fall back to the first line of the card text when nothing else matched.
    return title or _first_line(card.inner_text())


def fetch_playwright(args):
    """Generic headless-browser scraper. See COMPANIES entries for selector args.

    Loads ``args["url"]``, dismisses cookie banners listed in
    ``cookie_accept``, waits for ``wait_for``, scrolls ``scroll_count`` times
    (default 3) and reads up to ``max_cards`` (default 150) elements matching
    ``card``. Cards that raise while being read are skipped. Returns job dicts
    de-duplicated by id, where the id is the absolutized link or, failing
    that, "<url>#<card index>".
    """
    browser = _get_browser()
    ctx = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        locale="en-US",
        viewport={"width": 1366, "height": 768},
    )
    jobs = []
    try:
        # Page setup sits inside the try so the context is closed even when
        # creating or loading the page fails.
        ctx.add_init_script(STEALTH_JS)
        page = ctx.new_page()
        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")

        # Optional cookie banner acceptance
        for sel in args.get("cookie_accept", []) or []:
            try:
                btn = page.locator(sel).first
                if btn.is_visible(timeout=2000):
                    btn.click()
                    page.wait_for_timeout(500)
            except Exception:
                pass  # banner absent or not clickable: carry on without it

        # Wait for job content to render
        wait_for = args.get("wait_for")
        if wait_for:
            try:
                page.wait_for_selector(wait_for, timeout=15000)
            except Exception:
                # Selector never appeared: give the page a little longer and
                # read whatever cards exist.
                page.wait_for_timeout(4000)

        # Scroll a few times to trigger any lazy-loaded results
        for _ in range(args.get("scroll_count", 3)):
            try:
                page.mouse.wheel(0, 4000)
                page.wait_for_timeout(700)
            except Exception:
                break

        cards = page.locator(args["card"])
        n = min(cards.count(), args.get("max_cards", 150))
        for i in range(n):
            card = cards.nth(i)
            try:
                title = _card_title(card, args)

                location = args.get("default_location", "")
                if args.get("location_sel"):
                    lsel = card.locator(args["location_sel"]).first
                    if lsel.count():
                        location = (lsel.inner_text() or location).strip()

                # Without link_sel the card itself is expected to be the link.
                link_el = card.locator(args["link_sel"]).first if args.get("link_sel") else card
                href = ""
                if link_el.count():
                    href = link_el.get_attribute(args.get("link_attr", "href")) or ""
                href = _absolutize(href, args.get("url_prefix", ""))

                if not title:
                    continue

                description = ""
                if args.get("use_inner_text_as_blob"):
                    # Use the full card text as both location source and description
                    full = card.inner_text() or ""
                    description = full[:2000]
                    if not location:
                        location = full[:300]

                jobs.append({
                    "id": href or f"{args['url']}#{i}",
                    "title": title,
                    "location": location,
                    "url": href or args["url"],
                    "posted": "",
                    "description": description,
                })
            except Exception:
                # Best effort: one unreadable card must not lose the rest.
                continue
    finally:
        ctx.close()

    # Deduplicate within a single company by id
    seen, deduped = set(), []
    for j in jobs:
        if j["id"] in seen:
            continue
        seen.add(j["id"])
        deduped.append(j)
    return deduped


# Adapter name (third field of a COMPANIES entry) -> fetch function.
ADAPTERS = {
    "workday": fetch_workday,
    "ashby": fetch_ashby,
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "smartrecruiters": fetch_smartrecruiters,
    "playwright": fetch_playwright,
}


def location_matches(loc_text):
    """Classify a location string.

    Returns:
        ``(in_ch, is_remote)``. ``in_ch`` is True when the text contains a
        Swiss location keyword. ``is_remote`` is True when it contains a
        remote keyword, is not a US-only remote listing, and carries at least
        one EU/global hint. Empty input yields ``(False, False)``.

    Matching is a deliberately permissive lower-case substring test: this
    filter decides whether a job is reported at all, so recall is favoured
    over precision.
    """
    if not loc_text:
        return False, False
    low = loc_text.lower()
    in_ch = any(k in low for k in CH_LOCATION_KEYWORDS)
    has_remote = any(k in low for k in REMOTE_KEYWORDS)
    is_us_only = any(p in low for p in US_ONLY_PATTERNS) and not in_ch
    has_eu_hint = any(k in low for k in EU_HINT_KEYWORDS)
    # Count as remote-eligible only if it isn't a US-only remote listing
    # and it has at least one EU/global hint
    is_remote = has_remote and not is_us_only and has_eu_hint
    return in_ch, is_remote


def _keyword_pattern(keyword):
    """Return a cached regex matching ``keyword`` as a whole word or phrase.

    Surrounding whitespace in the keyword is ignored, inner whitespace matches
    any run of whitespace, and an optional trailing "s" is accepted so simple
    plurals still count. Returns None for a blank keyword.
    """
    if keyword not in _KEYWORD_PATTERNS:
        parts = keyword.split()
        if parts:
            core = r"\s+".join(re.escape(part) for part in parts)
            # [^\W_] is "letter or digit": the keyword must not be glued to
            # one on either side.
            pattern = re.compile(rf"(?<![^\W_]){core}s?(?![^\W_])")
        else:
            pattern = None
        _KEYWORD_PATTERNS[keyword] = pattern
    return _KEYWORD_PATTERNS[keyword]


def score_job(job, title_only=False):
    """Score a job against POSITIVE_KEYWORDS and NEGATIVE_KEYWORDS.

    Args:
        job: job dict; ``title`` and ``description`` are read.
        title_only: score the title alone instead of title + description.

    Returns:
        ``(score, pos, neg)``: the summed weights and the lists of matched
        positive and negative keywords, in dictionary order.

    Keywords match whole words/phrases only. The previous substring test
    scored "internal" as "intern", "basic" as "asic", "javascript" as "java"
    and "storage" as "rag", while the space-padded " junior " key could never
    match a title starting with "Junior".
    """
    # Title-filtered high-volume boards score on title only — the title filter already
    # gated relevance, and scoring the full JD body over-inflates (every "python"/"data"
    # mention adds points), flooding the medium bucket.
    if title_only:
        blob = (job.get("title") or "").lower()
    else:
        blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()

    score, pos, neg = 0, [], []
    for table, hits in ((POSITIVE_KEYWORDS, pos), (NEGATIVE_KEYWORDS, neg)):
        for kw, weight in table.items():
            pattern = _keyword_pattern(kw)
            if pattern is not None and pattern.search(blob):
                score += weight
                hits.append(kw)
    return score, pos, neg


def load_seen():
    """Load the seen-jobs state from STATE_FILE.

    Returns an empty dict when the file is missing. A file that cannot be
    decoded (e.g. truncated by an interrupted run) or does not hold a JSON
    object produces a warning on stderr and an empty state instead of an
    unhandled exception.
    """
    if not STATE_FILE.exists():
        return {}
    try:
        data = json.loads(STATE_FILE.read_text(encoding="utf-8"))
    except ValueError as e:  # JSONDecodeError and UnicodeDecodeError
        print(f"Warning: ignoring unreadable state file {STATE_FILE}: {e!r}", file=sys.stderr)
        return {}
    if not isinstance(data, dict):
        print(f"Warning: ignoring state file {STATE_FILE}: not a JSON object", file=sys.stderr)
        return {}
    return data


def save_seen(seen):
    """Write the seen-jobs state to STATE_FILE, creating its directory.

    The JSON is written to a sibling temp file and then moved into place, so
    an interrupted run cannot leave a half-written state file behind.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    tmp_path.write_text(json.dumps(seen, indent=2, ensure_ascii=False), encoding="utf-8")
    tmp_path.replace(STATE_FILE)


def write_report(path, results, errors, new_only, include_weak):
    """Render the markdown report to ``path``.

    Args:
        path: destination file (a pathlib.Path).
        results: result dicts as built by main().
        errors: ``(company, message)`` pairs listed under "Errors".
        new_only: only affects the heading suffix; the caller filters results.
        include_weak: also list the score < 2 bucket instead of summarising it.
    """
    # UTC, matching the date main() uses for the report file name; the naive
    # local date used before could disagree with the file name around midnight.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Total matches from automated companies: **{len(results)}** ({n_new} new since last run)\n",
    ]
    if errors:
        lines.append("## Errors\n")
        for company, err in errors:
            lines.append(f"- **{company}**: {err}")
        lines.append("")

    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak = [r for r in results if r["score"] < 2]

    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")

    buckets = [("Strong fit (score >= 6)", strong), ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))

    for bucket_name, bucket in buckets:
        if not bucket:
            continue
        lines.append(f"\n## {bucket_name} - {len(bucket)} role(s)\n")
        for r in bucket:
            new_tag = " [NEW]" if r["is_new"] else ""
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    lines.append("\n## Manual check (companies without scrapable APIs)\n")
    lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
    lines.append("Open each link, scan for new postings since your last quarterly review:\n")
    for name, note, url in MANUAL_CHECK:
        lines.append(f"- [ ] **{name}** — {note}: <{url}>")
    lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")


def _seen_entry(previous, title, today):
    """Build the state record for a job, keeping an existing first_seen date."""
    first_seen = today
    if isinstance(previous, dict) and previous.get("first_seen"):
        first_seen = previous["first_seen"]
    return {"title": title, "first_seen": first_seen}


def main():
    """Command-line entry point: fetch, filter, score, persist state, write the report.

    Flags (read from sys.argv): ``--only=<company id>``, ``--new-only``,
    ``--include-weak``. Unknown arguments are reported on stderr and ignored.
    """
    only, new_only, include_weak = None, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]
        else:
            print(f"Ignoring unknown argument: {arg}", file=sys.stderr)

    if only and all(entry[0] != only for entry in COMPANIES):
        # Previously a typo in --only silently produced an empty report.
        print(f"Warning: --only={only} matches no company id in COMPANIES", file=sys.stderr)

    seen = load_seen()
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    all_results, errors = [], []

    try:
        for cid, display, adapter, args in COMPANIES:
            if only and cid != only:
                continue
            print(f"Fetching {display}...", file=sys.stderr)
            try:
                jobs = ADAPTERS[adapter](args)
            except (urllib.error.URLError, urllib.error.HTTPError, ValueError) as e:
                errors.append((display, repr(e)))
                continue
            except Exception as e:
                # Top-level boundary: one broken adapter must not abort the
                # run; the failure is surfaced in the report instead.
                errors.append((display, f"unexpected: {e!r}"))
                continue

            # Optional per-company title prefilter for high-volume boards
            title_filter = args.get("_title_filter")
            if title_filter:
                jobs = [
                    j for j in jobs
                    if any(k in (j.get("title") or "").lower() for k in title_filter)
                ]

            company_seen = seen.setdefault(cid, {})
            title_seen = set()
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                in_ch, is_remote = location_matches(j.get("location", ""))
                if not (in_ch or is_remote):
                    continue
                # Collapse the same role posted once per remote country (title differs only
                # by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
                norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
                if norm_title in title_seen:
                    continue
                title_seen.add(norm_title)
                is_new = jid not in company_seen
                score, pos, neg = score_job(j, title_only=bool(title_filter))
                all_results.append({
                    "company": display,
                    "company_id": cid,
                    "title": j["title"],
                    "location": j["location"],
                    "url": j["url"],
                    "posted": j.get("posted", ""),
                    "score": score,
                    "pos": pos,
                    "neg": neg,
                    "in_ch": in_ch,
                    "remote": is_remote,
                    "is_new": is_new,
                })
                # Refresh the stored title but keep the original first_seen;
                # it used to be overwritten with today's date on every run.
                company_seen[jid] = _seen_entry(company_seen.get(jid), j["title"], today)
    finally:
        # Always release the headless browser, even if a run is interrupted.
        _close_browser()

    save_seen(seen)

    if new_only:
        all_results = [r for r in all_results if r["is_new"]]

    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak)

    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new)", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)


# === Adapter probe results (2026-05-21) =======================================
# Historical notes from probing each target's plain HTTP/JSON endpoints. The
# table records what was found at that time; it is no longer a statement of
# what is automated. Current status, per the lists in this file:
#   - Google, Apple, Meta, Roche and Cisco are scraped with the headless
#     "playwright" adapter; Microsoft uses "pcsx"; Sygnum uses "wp_ajax".
#   - Sonova, Coinbase, AMINA Bank and Bitcoin Suisse are in MANUAL_CHECK.
#   - IBM Research is in neither COMPANIES nor MANUAL_CHECK.
#
# Google          careers.google.com          404 on documented API; auth-gated
# Microsoft       gcsservices.careers.ms.com  TLS handshake hangs from non-MS clients
# Apple           jobs.apple.com/api/v1       endpoint exists, location filter codes opaque
# Meta            metacareers.com             GraphQL with auth token
# Roche           careers.roche.com           PhenomPeople/Eightfold, JS-rendered
# IBM Research    research.ibm.com            static page, no API
# Cisco           jobs.cisco.com              JS-rendered SPA
# Sonova          careers.sonova.com          PhenomPeople SaaS, no public JSON
# Sygnum          sygnum.com/careers          Cloudflare-protected
# AMINA           aminagroup.com/career       static, low volume
# Bitcoin Suisse  bitcoinsuisse.com/careers   static, low volume
# Coinbase        coinbase.com/careers        Cloudflare-protected
# ==============================================================================

if __name__ == "__main__":
    main()