"""Job scout for Dennis's quarterly target companies.

Pulls latest openings from companies via public ATS APIs (Workday/Ashby/Greenhouse/
SmartRecruiters/Eightfold/RSS) and, for JS-rendered careers sites, a headless-browser
(playwright) adapter. Filters by Swiss location or remote eligibility, scores fit
against profile keywords, tracks which job IDs we've already seen, writes a markdown
report.

Usage:
    py scout.py                  # Pull all configured companies (strong + medium only)
    py scout.py --only=nvidia    # Pull a single company by id
    py scout.py --new-only       # Report only jobs not seen before
    py scout.py --include-weak   # Include weak/noise bucket (default hidden)

State : state/seen_jobs.json
Output: reports/YYYY-MM-DD.md

To add a company: append to COMPANIES with one of the existing adapter types. A few
sites resist scraping even headless and stay in MANUAL_CHECK (surfaced as a report
checklist). See the adapter-coverage notes at the bottom for the current
automated/manual split.
"""
import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

# Everything the script reads or writes lives next to this file.
ROOT = Path(__file__).parent
STATE_FILE = ROOT.joinpath("state", "seen_jobs.json")
REPORTS_DIR = ROOT.joinpath("reports")

# Sent on every plain-HTTP request (the playwright adapter sets its own UA).
USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)"

# --- Location vocabulary (all lowercase; matched against lowercased location text) ---

# Any of these in a location string marks the role as Swiss.
CH_LOCATION_KEYWORDS = [
    "switzerland",
    "zurich",
    "zürich",
    "basel",
    "bern",
    "geneva",
    "genf",
    "lausanne",
    "zug",
    "rüschlikon",
    "stäfa",
    "schweiz",
    "suisse",
]

# Phrases that flag a posting as remote.
REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]

# Remote listings restricted to the United States.
US_ONLY_PATTERNS = [
    "remote - us",
    "remote, us",
    "remote-us",
    "us remote",
    "us-remote",
    "remote-friendly us",
    "remote (us)",
    "united states - remote",
    "remote, united states",
]

# European / global hints, followed by every Swiss keyword.
EU_HINT_KEYWORDS = [
    "germany", "france", "spain", "portugal", "ireland", "netherlands",
    "sweden", "norway", "finland", "denmark", "poland", "czech", "romania",
    "italy", "austria", "belgium", "uk", "united kingdom",
    "europe", "emea", "global", "worldwide",
    *CH_LOCATION_KEYWORDS,
]

# --- Fit scoring (keyword -> weight). Insertion order is the order in which matched
# keywords are listed in the report, so keep new entries where they belong. ---

POSITIVE_KEYWORDS = {
    # weight 3
    "genai": 3,
    "generative ai": 3,
    "llm": 3,
    "large language model": 3,
    "applied ai": 3,
    "applied ml": 3,
    "ai engineer": 3,
    "ml engineer": 3,
    "mlops": 3,
    "ai platform": 3,
    "ml platform": 3,
    # weight 2
    "python": 2,
    "java": 2,
    "data engineer": 2,
    "data engineering": 2,
    "solutions architect": 2,
    "platform engineer": 2,
    "ai infrastructure": 2,
    "inference": 2,
    "rag": 2,
    "agentic": 2,
    # weight 1
    "kubernetes": 1,
    "docker": 1,
    "etl": 1,
    "pipeline": 1,
    # crypto / web3
    "crypto": 2,
    "blockchain": 2,
    "web3": 2,
    "solidity": 3,
    # seniority words
    "senior": 1,
    "staff": 1,
    "lead": 1,
    "principal": 1,
}

NEGATIVE_KEYWORDS = {
    # GPU / compiler / cluster-interconnect terms
    "cuda": -3,
    "kernel driver": -3,
    "gpu programming": -3,
    "compiler engineer": -3,
    "pytorch internals": -3,
    "jax internals": -3,
    "rdma": -2,
    "infiniband": -2,
    "nccl": -3,
    "hpc cluster": -2,
    # frontend / mobile / UI roles
    "frontend": -3,
    "front-end": -3,
    "react native": -3,
    "ios engineer": -3,
    "android engineer": -3,
    "mobile engineer": -3,
    "ui engineer": -2,
    "ux engineer": -2,
    # hardware design
    "verilog": -3,
    "vhdl": -3,
    "asic": -3,
    "rtl design": -3,
    "physical design": -3,
    "silicon": -2,
    # heavy C++ requirements
    "expert c++": -2,
    "5+ years c++": -2,
    "deep c++": -2,
    # entry-level programmes (the " junior " key is deliberately space-padded)
    "intern": -5,
    "internship": -5,
    "graduate program": -3,
    " junior ": -3,
}

# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
# post mostly non-tech roles). Only keep titles containing one of these specific role
# phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel"
# don't leak in. Matched as case-insensitive substrings against the title only.
ENG_TITLE_FILTER = [
    "data engineer",
    "data engineering",
    "data platform",
    "platform engineer",
    "data infrastructure",
    "data architect",
    "analytics engineer",
    "mlops",
    "ml engineer",
    "ml platform",
    "machine learning engineer",
    "site reliability",
    "sre",
    "backend engineer",
    "back-end engineer",
    "devops engineer",
    "cloud engineer",
    "software engineer",
    "infrastructure engineer",
    "kafka",
    "streaming",
    "big data",
    "quantitative developer",
    "quant developer",
]

# id, display, adapter, adapter_args
# An optional "_title_filter" key in adapter_args is consumed by main(), not by the
# adapter: it prefilters titles and switches scoring to title-only.
COMPANIES = [
    ("nvidia", "NVIDIA", "workday", {
        "host": "nvidia.wd5.myworkdayjobs.com",
        "tenant": "nvidia",
        "site": "NVIDIAExternalCareerSite",
        "search_text": "Switzerland",
    }),
    ("kraken", "Kraken", "ashby", {"slug": "kraken.com"}),
    ("openai", "OpenAI", "ashby", {"slug": "openai"}),
    ("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
    ("novartis", "Novartis", "workday", {
        "host": "novartis.wd3.myworkdayjobs.com",
        "tenant": "novartis",
        "site": "Novartis_Careers",
        "search_text": "Switzerland",
    }),

    # PCSX (Eightfold) — Microsoft has a public position search endpoint
    ("microsoft", "Microsoft", "pcsx", {
        "domain": "microsoft.com",
        "location": "Switzerland",
    }),

    # Sygnum — WordPress AJAX endpoint returns clean JSON
    # NOTE(review): the _wpnonce value is hard-coded in the URL — confirm it stays valid.
    ("sygnum", "Sygnum", "wp_ajax", {
        "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
    }),

    # --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
    ("confluent", "Confluent", "ashby", {
        "slug": "confluent",
        "_title_filter": ENG_TITLE_FILTER,
    }),
    ("gitlab", "GitLab", "greenhouse", {
        "board": "gitlab",
        "_title_filter": ENG_TITLE_FILTER,
    }),
    ("clickhouse", "ClickHouse", "greenhouse", {
        "board": "clickhouse",
        "_title_filter": ENG_TITLE_FILTER,
    }),
    ("grafana", "Grafana Labs", "greenhouse", {
        "board": "grafanalabs",
        "_title_filter": ENG_TITLE_FILTER,
    }),

    # --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
    ("metgroup", "MET Group", "smartrecruiters", {
        "company": "METGroup",
        "_title_filter": ENG_TITLE_FILTER,
    }),
    ("vitol", "Vitol", "smartrecruiters", {
        "company": "Vitol",
        "_title_filter": ENG_TITLE_FILTER,
    }),
    ("ldc", "Louis Dreyfus", "smartrecruiters", {
        "company": "LouisDreyfusCompany",
        "_title_filter": ENG_TITLE_FILTER,
    }),

    # International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
    # Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
    ("bis", "BIS (Basel)", "rss", {
        "url": "https://www.bis.org/doclist/vacancies.rss",
        "default_location": "Basel, Switzerland",
    }),

    # Coinbase Ventures web3 talent network (Getro collection 1625). Aggregates roles
    # across portfolio companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself —
    # see fetch_getro. CH-filtered + eng title-filtered to stay relevant.
    ("coinbase_ventures", "Coinbase Ventures (web3)", "getro", {
        "collection": 1625,
        "locations": ["Switzerland"],
        "job_functions": ["Software Engineering", "IT", "Data Science"],
        "_title_filter": ENG_TITLE_FILTER,
    }),

    # Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
    # firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
    ("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),

    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
    # aria-label link; location lives in the card text (captured via blob mode).
    ("google", "Google", "playwright", {
        "url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
        "wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
        "card": "li:has(a[aria-label*='Learn more about'])",
        "title_sel": "a[aria-label*='Learn more about']",
        "title_sel_attr": "aria-label",
        "title_strip_prefix": "Learn more about ",
        "link_sel": "a[href*='jobs/results/']",
        "link_attr": "href",
        "url_prefix": "https://www.google.com/about/careers/applications/",
        "default_location": "",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
        "cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
    }),
    ("apple", "Apple", "playwright", {
        "url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
        "wait_for": "a[href*='/en-us/details/']",
        "card": "a[href*='/en-us/details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://jobs.apple.com",
        "default_location": "Switzerland",
    }),

    # Meta job links are /profile/job_details/<id>; title + location are in the link text.
    ("meta", "Meta", "playwright", {
        "url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
        "wait_for": "a[href*='/profile/job_details/']",
        "card": "a[href*='/profile/job_details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://www.metacareers.com",
        "default_location": "Zurich, Switzerland",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),

    # PhenomPeople pattern (Roche) uses li.jobs-list-item.
    # Card inner text is structured like: "<title> | Location | <city, country> | Category | ..."
    # We extract title from first line, full text becomes the "description" so our location
    # filter still sees Switzerland mentions.
    ("roche", "Roche", "playwright", {
        "url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
        "wait_for": "li.jobs-list-item, a.au-target",
        "card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
        "title_attr": "text",
        "link_sel": "a[href]",
        "link_attr": "href",
        "url_prefix": "https://careers.roche.com",
        "default_location": "",
        "cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
        "scroll_count": 6,
        "use_inner_text_as_blob": True,
    }),

    # Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
    ("cisco", "Cisco", "playwright", {
        "url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
        "wait_for": "a[href*='/job/'], div[role='listitem']",
        "card": "div[role='listitem']:has(a[href*='/job/'])",
        "title_sel": "a[href*='/job/']",
        "link_sel": "a[href*='/job/']",
        "link_attr": "href",
        "url_prefix": "https://careers.cisco.com",
        "default_location": "Switzerland",
        "cookie_accept": ["#onetrust-accept-btn-handler"],
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
]

# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
# These surface as a clickable checklist in the report so they're not forgotten.
# (Empty — all current target companies are automated.)
# Entries are (name, note, url) tuples — that is how write_report unpacks them.
MANUAL_CHECK = []


def http_get_json(url, headers=None, data=None, method="GET"):
    """Perform an HTTP request and return the decoded JSON body.

    Args:
        url: Absolute URL to request.
        headers: Optional extra request headers. The dict is copied, never mutated;
            User-Agent and Accept are filled in when absent.
        data: Optional request body. A dict is JSON-encoded (and Content-Type is set
            when absent); anything else is handed to urllib unchanged.
        method: HTTP verb.

    Returns:
        The parsed JSON document.

    Raises:
        urllib.error.URLError / HTTPError on transport or status failures,
        ValueError (json.JSONDecodeError) when the body is not valid JSON.
    """
    # Copy so that the setdefault calls below never leak into the caller's dict.
    request_headers = dict(headers or {})
    request_headers.setdefault("User-Agent", USER_AGENT)
    request_headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        request_headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=request_headers, method=method)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))


def fetch_workday(args):
    """Page through a Workday CXS job search and return normalized job dicts.

    Args:
        args: Needs "host", "tenant" and "site"; optional "search_text".

    Returns:
        A list of job dicts (id, title, location, url, posted, description).
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    jobs, offset, total = [], 0, 0
    while True:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {},
            "limit": 20,
            "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath", "")
            bullets = p.get("bulletFields") or []
            jobs.append({
                # First bullet field when present, else the posting path.
                "id": bullets[0] if bullets else ext,
                "title": p.get("title", ""),
                # The path is appended so location keywords in it are matched too.
                "location": (p.get("locationsText") or "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # NOTE(review): some Workday tenants appear to report `total` only on the
        # first page; keeping the last non-zero value avoids stopping early.
        total = data.get("total") or total
        offset += len(postings)
        if not postings or offset >= total:
            break
    return jobs


def fetch_ashby(args):
    """Fetch every posting from an Ashby job board (single request).

    Args:
        args: Needs "slug", the board identifier used in the API path.

    Returns:
        A list of job dicts; the location joins primary and secondary locations
        with " | " and the description is the plain text capped at 2500 chars.
    """
    slug = args["slug"]
    url = f"https://api.ashbyhq.com/posting-api/job-board/{slug}?includeCompensation=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs", []):
        secondary = j.get("secondaryLocations", []) or []
        secondary_names = [
            (s.get("location") or "") if isinstance(s, dict) else str(s)
            for s in secondary
        ]
        loc_blob = " | ".join([j.get("location", "") or ""] + secondary_names)
        jobs.append({
            "id": j.get("id"),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("jobUrl"),
            "posted": j.get("publishedAt", ""),
            "description": (j.get("descriptionPlain") or "")[:2500],
            "department": j.get("department", ""),
        })
    return jobs


def fetch_greenhouse(args):
    """Fetch every posting from a Greenhouse board (single request, with content).

    Args:
        args: Needs "board", the board token used in the API path.

    Returns:
        A list of job dicts; the description has tags stripped, whitespace
        collapsed, and is capped at 2500 chars.
    """
    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs", []):
        loc = (j.get("location") or {}).get("name", "")
        offices = j.get("offices") or []
        office_names = " | ".join(o.get("name", "") for o in offices if isinstance(o, dict))
        loc_blob = " ".join(x for x in [loc, office_names] if x)
        desc = j.get("content", "") or ""
        desc = re.sub(r"<[^>]+>", " ", desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            "description": desc[:2500],
        })
    return jobs


def fetch_pcsx(args):
    """Eightfold PCSX search API. Microsoft uses apply.careers.microsoft.com.

    The same endpoint pattern is used by other PCS-hosted boards, but the host and
    the job URL prefix are hard-coded to Microsoft here.

    Args:
        args: Needs "domain"; optional "location".

    Returns:
        A list of job dicts. Paging stops on a short page or after 500 results.
    """
    domain = urllib.parse.quote(args["domain"])
    location = urllib.parse.quote(args.get("location", ""))
    base = "https://apply.careers.microsoft.com/api/pcsx/search"
    referer = f"https://apply.careers.microsoft.com/careers?location={location}"
    page_size = 50
    jobs, start = [], 0
    while True:
        url = f"{base}?domain={domain}&query=&location={location}&start={start}&num={page_size}"
        data = http_get_json(url, headers={"Referer": referer})
        positions = (data.get("data") or {}).get("positions", []) or []
        for p in positions:
            locs = p.get("locations") or []
            jobs.append({
                "id": str(p.get("id")),
                "title": p.get("name", ""),
                "location": " | ".join(locs),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{p.get('displayJobId') or p.get('id')}",
                "posted": p.get("postedTs", ""),
                "description": (p.get("description") or "")[:2000],
            })
        # A short (or empty) page means this was the last one.
        if len(positions) < page_size:
            break
        start += len(positions)
        if start >= 500:
            break
    return jobs


def fetch_smartrecruiters(args):
    """SmartRecruiters public postings API. Used by many EU energy/commodity firms.

    Args:
        args: Needs "company", the SmartRecruiters company identifier.

    Returns:
        A list of job dicts. The description is only department + function labels.
        Paging stops at the reported total or after 300 postings.
    """
    company = args["company"]
    base = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"
    jobs, offset = [], 0
    while True:
        data = http_get_json(f"{base}?limit=100&offset={offset}")
        content = data.get("content", []) or []
        for p in content:
            loc = p.get("location") or {}
            parts = [loc.get("fullLocation") or loc.get("city") or ""]
            if loc.get("remote"):
                parts.append("Remote")
            if loc.get("hybrid"):
                parts.append("Hybrid")
            loc_str = " ".join(x for x in parts if x)
            department = p.get("department")
            function = p.get("function")
            dept = department.get("label", "") if isinstance(department, dict) else ""
            func = function.get("label", "") if isinstance(function, dict) else ""
            jobs.append({
                "id": str(p.get("id")),
                "title": p.get("name", ""),
                "location": loc_str,
                "url": f"https://jobs.smartrecruiters.com/{company}/{p.get('id')}",
                "posted": p.get("releasedDate", ""),
                "description": " ".join(filter(None, [dept, func])),
            })
        total = data.get("totalFound", 0)
        offset += len(content)
        if not content or offset >= total or offset >= 300:
            break
    return jobs


def _rss_text(item, tag, ns):
    """Return the stripped text of `tag` under `item`, or "" when absent/empty.

    Looks in the RSS 1.0 namespace first, then falls back to the un-namespaced tag.
    """
    el = item.find(f"rss1:{tag}", ns)
    if el is None:
        el = item.find(tag)
    return el.text.strip() if el is not None and el.text else ""


def fetch_rss(args):
    """Generic RSS/RDF feed parser.

    BIS publishes vacancies as RSS 1.0 (RDF), whose <item> elements live in the
    http://purl.org/rss/1.0/ namespace. Falls back to plain RSS 2.0 <item> elements.
    Location isn't in the feed, so default_location is required.

    Args:
        args: Needs "url"; "default_location" is used as every job's location.

    Returns:
        A list of job dicts; the item link doubles as the job id.
    """
    import xml.etree.ElementTree as ET

    req = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        root = ET.fromstring(resp.read())
    ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"}
    items = root.findall(".//rss1:item", ns) or root.findall(".//item")
    jobs = []
    for it in items:
        link = _rss_text(it, "link", ns)
        title = _rss_text(it, "title", ns)
        posted = it.findtext("dc:date", default="", namespaces=ns) or _rss_text(it, "date", ns)
        jobs.append({
            "id": link or title,
            "title": title,
            "location": args.get("default_location", ""),
            "url": link,
            "posted": posted,
            "description": re.sub(r"<[^>]+>", " ", _rss_text(it, "description", ns))[:1500],
        })
    return jobs


def fetch_wp_ajax(args):
    """WordPress admin-ajax style endpoint. Sygnum uses this pattern.

    Args:
        args: Needs "url", which must return a JSON list of job objects.

    Returns:
        A list of job dicts, or [] when the response is not a list. The feed has no
        id, so "title|location" (capped at 120 chars) is used instead.
    """
    data = http_get_json(args["url"])
    if not isinstance(data, list):
        return []
    jobs = []
    for j in data:
        if not isinstance(j, dict):
            continue  # tolerate stray non-object entries instead of crashing
        # `or ""` also covers explicit nulls, which would break the concatenation.
        title = j.get("title") or ""
        location = j.get("location") or ""
        jobs.append({
            "id": (title + "|" + location)[:120],
            "title": title,
            "location": " ".join(filter(None, [location, j.get("work_type", "")])),
            "url": j.get("application_url") or args["url"],
            "posted": "",
            "description": " ".join(filter(None, [j.get("department", ""), j.get("role_type", "")])),
        })
    return jobs


def fetch_getro(args):
    """Getro network job-board search API (POST JSON).

    Powers VC portfolio talent networks — here the Coinbase Ventures web3 network
    (collection 1625). Returns roles across ALL portfolio companies (Notion, Ashby,
    VALR, World, ...), NOT Coinbase itself; Coinbase doesn't list its own openings on
    its Ventures board. Server-side filters: searchable_locations and job_functions.
    Org name is folded into the title since this is a multi-company board.

    Args:
        args: Needs "collection"; optional "locations" and "job_functions" lists.

    Returns:
        A list of job dicts. At most 10 pages of 100 hits are requested.
    """
    collection = args["collection"]
    url = f"https://api.getro.com/api/v2/collections/{collection}/search/jobs"
    filters = {}
    if args.get("locations"):
        filters["searchable_locations"] = args["locations"]
    if args.get("job_functions"):
        filters["job_functions"] = args["job_functions"]
    jobs, page = [], 0
    while True:
        data = http_get_json(url, method="POST", data={
            "hitsPerPage": 100,
            "page": page,
            "query": "",
            "filters": filters,
        })
        res = data.get("results", {}) or {}
        batch = res.get("jobs", []) or []
        for j in batch:
            org = (j.get("organization") or {}).get("name", "")
            locs = j.get("searchable_locations") or j.get("locations") or []
            loc_str = " | ".join(locs) if isinstance(locs, list) else str(locs)
            ts = j.get("created_at")
            posted = ""
            if isinstance(ts, (int, float)):
                # Numeric timestamps are rendered as a UTC calendar date.
                posted = datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
            title = j.get("title", "")
            jobs.append({
                "id": str(j.get("id")),
                "title": f"{title} @ {org}" if org else title,
                "location": loc_str,
                "url": j.get("url", ""),
                "posted": posted,
                "description": " ".join(filter(None, [org] + (j.get("skills") or []))),
            })
        total = res.get("count", 0)
        page += 1
        if not batch or len(jobs) >= total or page >= 10:
            break
    return jobs


def fetch_onlyfy(args):
    """onlyfy.jobs board (XING E-Recruiting / ex-Prinzip), used by Bitcoin Suisse.

    The candidate/job/ajax_list endpoint returns an HTML fragment listing every
    posting; each card carries a <a href="/job/ID">title</a> and a location cell
    flagged by an icon-map-marker. Titles and locations appear in document order, one
    of each per card, so we extract both lists and zip them. No JSON API and no
    headless browser needed.

    Args:
        args: Needs "slug", the onlyfy.jobs subdomain.

    Returns:
        A list of job dicts; the location is reused as the description.
    """
    import html as _html

    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    url = (f"{base}/candidate/job/ajax_list"
           f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT,
        "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        page = resp.read().decode("utf-8", "replace")
    titles = re.findall(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', page, re.S)
    locs = re.findall(r'icon-map-marker[^>]*></i>\s*([^<]+)', page)
    jobs = []
    # zip() pairs by position and stops at the shorter list; a card without a
    # location marker would therefore shift every later pairing.
    for (href, raw_title), raw_loc in zip(titles, locs):
        title = _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
        loc = _html.unescape(raw_loc).strip()
        jobs.append({
            "id": href.rsplit("/", 1)[-1],
            "title": title,
            "location": loc,
            "url": base + href,
            "posted": "",
            "description": loc,
        })
    return jobs


# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# The saved permissions.query is bound to navigator.permissions: calling the native
# method detached from its object throws "Illegal invocation".
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _q = navigator.permissions && navigator.permissions.query
  ? navigator.permissions.query.bind(navigator.permissions)
  : null;
if (_q) {
  navigator.permissions.query = (p) =>
    p && p.name === 'notifications'
      ? Promise.resolve({state: Notification.permission})
      : _q(p);
}
"""

# Process-wide playwright driver + browser, created on first use by _get_browser().
_playwright_singleton = {"pw": None, "browser": None}


def _get_browser():
    """Lazy-init a single shared headless browser. Saves ~3s per company.

    Returns:
        The shared Chromium browser instance.

    Raises:
        RuntimeError: when the playwright package is not installed.
    """
    if _playwright_singleton["browser"] is not None:
        return _playwright_singleton["browser"]
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
    pw = sync_playwright().start()
    browser = pw.chromium.launch(
        headless=True,
        args=["--disable-blink-features=AutomationControlled"],
    )
    _playwright_singleton["pw"] = pw
    _playwright_singleton["browser"] = browser
    return browser


def _absolutize(href, prefix):
    """Join a possibly-relative href with the configured prefix.

    Empty hrefs, hrefs starting with "http", and any href when no prefix is
    configured are returned unchanged. Otherwise every leading "." and "/" is
    dropped (so "../x" is treated like "x") and the rest is appended to the prefix.
    """
    if not href or href.startswith("http") or not prefix:
        return href
    return prefix.rstrip("/") + "/" + href.lstrip("./")


def _close_browser():
    """Shut down the shared browser and playwright driver, if they were started.

    Shutdown errors are ignored (best-effort cleanup at exit). The singleton is
    reset afterwards so a later _get_browser() launches a fresh browser instead of
    handing out a closed one.
    """
    for key, shutdown in (("browser", "close"), ("pw", "stop")):
        handle = _playwright_singleton[key]
        _playwright_singleton[key] = None
        if handle:
            try:
                getattr(handle, shutdown)()
            except Exception:
                pass


def _card_title(card, args):
    """Extract a job title from one result card according to the selector args.

    Precedence: title_attr == "text" (first line of the card text), then title_attr
    (an attribute of the card), then title_sel (+ optional title_sel_attr). An
    optional title_strip_prefix is removed, and the first line of the card text is
    the fallback when nothing else produced a title.
    """
    def first_line():
        return (card.inner_text() or "").strip().split("\n", 1)[0][:200]

    title = ""
    title_attr = args.get("title_attr")
    if title_attr == "text":
        title = first_line()
    elif title_attr:
        title = (card.get_attribute(title_attr) or "").strip()
    elif args.get("title_sel"):
        target = card.locator(args["title_sel"]).first
        if target.count():
            # Read either an attribute (e.g. aria-label) or the inner text
            if args.get("title_sel_attr"):
                title = (target.get_attribute(args["title_sel_attr"]) or "").strip()
            else:
                title = (target.inner_text() or "").strip()
    prefix = args.get("title_strip_prefix")
    if prefix and title.startswith(prefix):
        title = title[len(prefix):].strip()
    return title or first_line()


def fetch_playwright(args):
    """Generic headless-browser scraper. See COMPANIES entries for selector args.

    Args:
        args: Needs "url" and "card". Optional: "wait_for", "cookie_accept",
            "scroll_count" (default 3), "max_cards" (default 150), the title_*
            keys, "location_sel", "default_location", "link_sel", "link_attr"
            (default "href"), "url_prefix", "use_inner_text_as_blob".

    Returns:
        A list of job dicts, deduplicated by id (the absolute link, or
        "<url>#<index>" when a card has no link). Cards that raise while being read
        are skipped; a failing page load propagates to the caller.
    """
    browser = _get_browser()
    ctx = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        locale="en-US",
        viewport={"width": 1366, "height": 768},
    )
    ctx.add_init_script(STEALTH_JS)
    page = ctx.new_page()
    jobs = []
    try:
        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")

        # Optional cookie banner acceptance
        for sel in args.get("cookie_accept", []) or []:
            try:
                btn = page.locator(sel).first
                if btn.is_visible(timeout=2000):
                    btn.click()
                    page.wait_for_timeout(500)
            except Exception:
                pass

        # Wait for job content to render
        wait_for = args.get("wait_for")
        if wait_for:
            try:
                page.wait_for_selector(wait_for, timeout=15000)
            except Exception:
                # Selector never appeared: give the page a little more time anyway.
                page.wait_for_timeout(4000)

        # Scroll a few times to trigger any lazy-loaded results
        for _ in range(args.get("scroll_count", 3)):
            try:
                page.mouse.wheel(0, 4000)
                page.wait_for_timeout(700)
            except Exception:
                break

        cards = page.locator(args["card"])
        n = min(cards.count(), args.get("max_cards", 150))
        for i in range(n):
            card = cards.nth(i)
            try:
                title = _card_title(card, args)
                if not title:
                    continue

                location = args.get("default_location", "")
                if args.get("location_sel"):
                    lsel = card.locator(args["location_sel"]).first
                    if lsel.count():
                        location = (lsel.inner_text() or location).strip()

                # Without link_sel the card element itself is the link.
                link_el = card if not args.get("link_sel") else card.locator(args["link_sel"]).first
                href = (link_el.get_attribute(args.get("link_attr", "href")) or "") if link_el.count() else ""
                href = _absolutize(href, args.get("url_prefix", ""))

                description = ""
                if args.get("use_inner_text_as_blob"):
                    # Use the full card text as both location source and description
                    full = (card.inner_text() or "")
                    description = full[:2000]
                    if not location:
                        location = full[:300]

                jobs.append({
                    "id": href or f"{args['url']}#{i}",
                    "title": title,
                    "location": location,
                    "url": href or args["url"],
                    "posted": "",
                    "description": description,
                })
            except Exception:
                # Best-effort: one unreadable card must not lose the rest.
                continue
    finally:
        ctx.close()

    # Deduplicate within a single company by id
    seen, deduped = set(), []
    for j in jobs:
        if j["id"] in seen:
            continue
        seen.add(j["id"])
        deduped.append(j)
    return deduped


# Adapter name (third field of a COMPANIES entry) -> fetch function.
ADAPTERS = {
    "workday": fetch_workday,
    "ashby": fetch_ashby,
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "smartrecruiters": fetch_smartrecruiters,
    "rss": fetch_rss,
    "getro": fetch_getro,
    "onlyfy": fetch_onlyfy,
    "playwright": fetch_playwright,
}


def location_matches(loc_text):
    """Classify a location string.

    Args:
        loc_text: Free-form location text (may be empty or None).

    Returns:
        (in_ch, is_remote): in_ch is True when a Swiss keyword occurs in the text;
        is_remote is True when a remote keyword occurs, the text is not a US-only
        remote listing, and an EU/global hint is present. Matching is
        case-insensitive substring matching.
    """
    if not loc_text:
        return False, False
    low = loc_text.lower()
    in_ch = any(k in low for k in CH_LOCATION_KEYWORDS)
    has_remote = any(k in low for k in REMOTE_KEYWORDS)
    # A US-only pattern is overridden when Switzerland is mentioned as well.
    is_us_only = any(p in low for p in US_ONLY_PATTERNS) and not in_ch
    has_eu_hint = any(k in low for k in EU_HINT_KEYWORDS)
    # Count as remote-eligible only if it isn't a US-only remote listing
    # and it has at least one EU/global hint
    is_remote = has_remote and not is_us_only and has_eu_hint
    return in_ch, is_remote


# Compiled scoring patterns, keyed by the keyword exactly as configured.
_KEYWORD_PATTERNS = {}


def _keyword_in(keyword, blob):
    """Return True when `keyword` occurs in `blob` as a whole word or phrase.

    The keyword is stripped and lowercased, may carry a plural "s", and must not be
    directly preceded or followed by a letter. Plain substring matching made
    "intern" hit "internal"/"international", "asic" hit "basic" and "rag" hit
    "storage", while the space-padded " junior " could never match at the start or
    end of a title. Derived forms such as "cryptocurrency" no longer count as
    "crypto".
    """
    pattern = _KEYWORD_PATTERNS.get(keyword)
    if pattern is None:
        term = keyword.strip().lower()
        if not term:
            return False
        # [^\W\d_] is "any letter"; digits and punctuation count as boundaries so
        # keys like "web3", "c++" and "5+ years c++" keep working.
        pattern = re.compile(rf"(?<![^\W\d_]){re.escape(term)}s?(?![^\W\d_])")
        _KEYWORD_PATTERNS[keyword] = pattern
    return pattern.search(blob) is not None


def score_job(job, title_only=False):
    """Score a job against POSITIVE_KEYWORDS and NEGATIVE_KEYWORDS.

    Args:
        job: Job dict; "title" and "description" are read (missing/None is fine).
        title_only: Score the title alone instead of title + description.

    Returns:
        (score, pos, neg): the summed weights plus the matched positive and negative
        keywords (as configured), each in dict order.
    """
    # Title-filtered high-volume boards score on title only — the title filter already
    # gated relevance, and scoring the full JD body over-inflates (every "python"/"data"
    # mention adds points), flooding the medium bucket.
    if title_only:
        blob = (job.get("title") or "").lower()
    else:
        blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
    pos = [kw for kw in POSITIVE_KEYWORDS if _keyword_in(kw, blob)]
    neg = [kw for kw in NEGATIVE_KEYWORDS if _keyword_in(kw, blob)]
    score = sum(POSITIVE_KEYWORDS[kw] for kw in pos) + sum(NEGATIVE_KEYWORDS[kw] for kw in neg)
    return score, pos, neg


def load_seen():
    """Return the persisted seen-jobs mapping, or {} when no state file exists yet."""
    if STATE_FILE.exists():
        return json.loads(STATE_FILE.read_text(encoding="utf-8"))
    return {}


def save_seen(seen):
    """Write the seen-jobs mapping to STATE_FILE, creating its directory if needed."""
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    STATE_FILE.write_text(json.dumps(seen, indent=2, ensure_ascii=False), encoding="utf-8")


def write_report(path, results, errors, new_only, include_weak):
    """Render the markdown report and write it to `path`.

    Args:
        path: Destination file (a pathlib.Path-like object with write_text).
        results: Result dicts as built by main(), already sorted.
        errors: (company display name, error text) pairs.
        new_only: Adds a "(new only)" marker to the heading.
        include_weak: Also list the score < 2 bucket instead of just counting it.
    """
    # UTC, to agree with the report filename and first_seen dates chosen in main().
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Total matches from automated companies: **{len(results)}** ({n_new} new since last run)\n",
    ]

    if errors:
        lines.append("## Errors\n")
        for company, err in errors:
            lines.append(f"- **{company}**: {err}")
        lines.append("")

    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak = [r for r in results if r["score"] < 2]

    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")

    buckets = [("Strong fit (score >= 6)", strong), ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))

    for bucket_name, bucket in buckets:
        if not bucket:
            continue
        lines.append(f"\n## {bucket_name} - {len(bucket)} role(s)\n")
        for r in bucket:
            new_tag = " [NEW]" if r["is_new"] else ""
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    if MANUAL_CHECK:
        lines.append("\n## Manual check (companies without scrapable APIs)\n")
        lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
        lines.append("Open each link, scan for new postings since your last quarterly review:\n")
        for name, note, url in MANUAL_CHECK:
            lines.append(f"- [ ] **{name}** — {note}: <{url}>")
        lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")


def main():
    """Command-line entry point: fetch, filter, score, persist state, write the report.

    Flags: --only=<company id>, --new-only, --include-weak. Other arguments are
    ignored. Adapter failures are collected into the report's error section rather
    than aborting the run.
    """
    only, new_only, include_weak = None, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]

    if only and all(cid != only for cid, _display, _adapter, _args in COMPANIES):
        # A typo would otherwise silently produce an empty report.
        print(f"Warning: --only={only} matches no configured company id", file=sys.stderr)

    seen = load_seen()
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    all_results, errors = [], []

    try:
        for cid, display, adapter, args in COMPANIES:
            if only and cid != only:
                continue
            print(f"Fetching {display}...", file=sys.stderr)
            try:
                jobs = ADAPTERS[adapter](args)
            except (urllib.error.URLError, urllib.error.HTTPError, ValueError) as e:
                errors.append((display, repr(e)))
                continue
            except Exception as e:
                errors.append((display, f"unexpected: {e!r}"))
                continue

            # Optional per-company title prefilter for high-volume boards
            title_filter = args.get("_title_filter")
            if title_filter:
                jobs = [
                    j for j in jobs
                    if any(k in (j.get("title") or "").lower() for k in title_filter)
                ]

            company_seen = seen.setdefault(cid, {})
            title_seen = set()
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                in_ch, is_remote = location_matches(j.get("location", ""))
                if not (in_ch or is_remote):
                    continue
                # Collapse the same role posted once per remote country (title differs only
                # by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
                norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
                if norm_title in title_seen:
                    continue
                title_seen.add(norm_title)
                is_new = jid not in company_seen
                score, pos, neg = score_job(j, title_only=bool(title_filter))
                all_results.append({
                    "company": display,
                    "company_id": cid,
                    "title": j["title"],
                    "location": j["location"],
                    "url": j["url"],
                    "posted": j.get("posted", ""),
                    "score": score,
                    "pos": pos,
                    "neg": neg,
                    "in_ch": in_ch,
                    "remote": is_remote,
                    "is_new": is_new,
                })
                # Recorded on every run, so first_seen is overwritten with today's date.
                company_seen[jid] = {"title": j["title"], "first_seen": today}
    finally:
        # Also runs on Ctrl-C, so the headless browser process is not left behind.
        _close_browser()

    save_seen(seen)

    if new_only:
        all_results = [r for r in all_results if r["is_new"]]

    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak)

    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new)", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)


# === Adapter coverage (refreshed 2026-05-24) ==================================
# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
#
# Automated (COMPANIES above):
#   workday          nvidia, novartis
#   ashby            kraken, openai, confluent
#   greenhouse       anthropic, gitlab, clickhouse, grafana
#   pcsx             microsoft (Eightfold position-search endpoint)
#   wp_ajax          sygnum (WordPress admin-ajax JSON)
#   smartrecruiters  metgroup, vitol, ldc
#   rss              bis (vacancies.rss — RSS 1.0/RDF)
#   getro            coinbase_ventures (web3 portfolio network, collection 1625)
#   onlyfy           bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment)
#   playwright       google, apple, meta, roche, cisco (headless browser, 3-15s each)
#
# Since the 2026-05-21 probe, seven originally-manual sites moved to automated:
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
#
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor).
#
# MANUAL_CHECK is now empty — every current target company is automated.
# ==============================================================================

if __name__ == "__main__":
    main()