"""Job scout for Dennis's quarterly target companies.
Pulls latest openings from companies via public ATS APIs (Workday/Ashby/Greenhouse/
SmartRecruiters/Eightfold/RSS) and, for JS-rendered careers sites, a headless-browser
(playwright) adapter. Filters by Swiss location or remote eligibility, scores fit against
profile keywords, tracks which job IDs we've already seen, writes a markdown report.
Usage:
py scout.py # Pull all configured companies (strong + medium only)
py scout.py --only=nvidia # Pull a single company by id
py scout.py --new-only # Report only jobs not seen before
py scout.py --include-weak # Include weak/noise bucket (default hidden)
State : state/seen_jobs.json
Output: reports/YYYY-MM-DD.md
To add a company: append to COMPANIES with one of the existing adapter types. A few sites
resist scraping even headless and stay in MANUAL_CHECK (surfaced as a report checklist).
See the adapter-coverage notes at the bottom for the current automated/manual split.
"""
import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
# All state and report paths are resolved relative to the directory holding this script.
ROOT = Path(__file__).parent
STATE_FILE = ROOT.joinpath("state", "seen_jobs.json")  # job IDs already reported
REPORTS_DIR = ROOT.joinpath("reports")  # one markdown report per run date

# User-Agent header sent by the plain-HTTP adapters.
USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)"
# Lower-case markers that place a posting in Switzerland (country names in several
# languages plus Swiss place names). Matched against the lower-cased location text.
CH_LOCATION_KEYWORDS = [
    "switzerland",
    "zurich",
    "zürich",
    "basel",
    "bern",
    "geneva",
    "genf",
    "lausanne",
    "zug",
    "rüschlikon",
    "stäfa",
    "schweiz",
    "suisse",
]

# Markers that a posting is remote at all.
REMOTE_KEYWORDS = [
    "remote",
    "home based",
    "home-based",
    "anywhere",
    "distributed",
]

# Phrases that mark a remote posting as US-only; such postings are not counted as
# remote-eligible unless the location also mentions Switzerland.
US_ONLY_PATTERNS = [
    "remote - us",
    "remote, us",
    "remote-us",
    "us remote",
    "us-remote",
    "remote-friendly us",
    "remote (us)",
    "united states - remote",
    "remote, united states",
]

# A remote posting only counts when it also carries one of these Europe/global hints.
# The Swiss markers are appended so a Swiss location is itself a valid hint.
EU_HINT_KEYWORDS = [
    "germany",
    "france",
    "spain",
    "portugal",
    "ireland",
    "netherlands",
    "sweden",
    "norway",
    "finland",
    "denmark",
    "poland",
    "czech",
    "romania",
    "italy",
    "austria",
    "belgium",
    "uk",
    "united kingdom",
    "europe",
    "emea",
    "global",
    "worldwide",
    *CH_LOCATION_KEYWORDS,
]
# Keyword -> weight tables consumed by score_job(). Every keyword found in the
# lower-cased title/description adds its weight to the job's score.
POSITIVE_KEYWORDS = {
    # GenAI / applied ML roles
    "genai": 3,
    "generative ai": 3,
    "llm": 3,
    "large language model": 3,
    "applied ai": 3,
    "applied ml": 3,
    "ai engineer": 3,
    "ml engineer": 3,
    "mlops": 3,
    "ai platform": 3,
    "ml platform": 3,
    # languages, data and platform work
    "python": 2,
    "java": 2,
    "data engineer": 2,
    "data engineering": 2,
    "solutions architect": 2,
    "platform engineer": 2,
    "ai infrastructure": 2,
    "inference": 2,
    "rag": 2,
    "agentic": 2,
    # general tooling
    "kubernetes": 1,
    "docker": 1,
    "etl": 1,
    "pipeline": 1,
    # crypto / web3
    "crypto": 2,
    "blockchain": 2,
    "web3": 2,
    "solidity": 3,
    # seniority
    "senior": 1,
    "staff": 1,
    "lead": 1,
    "principal": 1,
}

# Negative weights pull a job's score down.
NEGATIVE_KEYWORDS = {
    # low-level GPU / compiler / HPC work
    "cuda": -3,
    "kernel driver": -3,
    "gpu programming": -3,
    "compiler engineer": -3,
    "pytorch internals": -3,
    "jax internals": -3,
    "rdma": -2,
    "infiniband": -2,
    "nccl": -3,
    "hpc cluster": -2,
    # frontend / mobile
    "frontend": -3,
    "front-end": -3,
    "react native": -3,
    "ios engineer": -3,
    "android engineer": -3,
    "mobile engineer": -3,
    "ui engineer": -2,
    "ux engineer": -2,
    # hardware design
    "verilog": -3,
    "vhdl": -3,
    "asic": -3,
    "rtl design": -3,
    "physical design": -3,
    "silicon": -2,
    # heavy C++
    "expert c++": -2,
    "5+ years c++": -2,
    "deep c++": -2,
    # entry-level programmes (" junior " keeps its surrounding spaces as written)
    "intern": -5,
    "internship": -5,
    "graduate program": -3,
    " junior ": -3,
}
# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
# post mostly non-tech roles). Only keep titles containing one of these specific role
# phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel"
# don't leak in. Matched as case-insensitive substrings against the title only.
ENG_TITLE_FILTER = [
    "data engineer", "data engineering", "data platform", "platform engineer",
    "data infrastructure", "data architect", "analytics engineer",
    "mlops", "ml engineer", "ml platform", "machine learning engineer",
    "site reliability", "sre", "backend engineer", "back-end engineer",
    "devops engineer", "cloud engineer", "software engineer", "infrastructure engineer",
    "kafka", "streaming", "big data", "quantitative developer", "quant developer",
]

# Target companies. Each entry is (id, display name, adapter name, adapter args).
# The adapter name is a key of ADAPTERS; the args dict is passed to that fetch function.
# The optional "_title_filter" arg is read by main(), not by the adapter itself.
COMPANIES = [
    ("nvidia", "NVIDIA", "workday", {
        "host": "nvidia.wd5.myworkdayjobs.com",
        "tenant": "nvidia",
        "site": "NVIDIAExternalCareerSite",
        "search_text": "Switzerland",
    }),
    ("kraken", "Kraken", "ashby", {"slug": "kraken.com"}),
    ("openai", "OpenAI", "ashby", {"slug": "openai"}),
    ("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
    ("novartis", "Novartis", "workday", {
        "host": "novartis.wd3.myworkdayjobs.com",
        "tenant": "novartis",
        "site": "Novartis_Careers",
        "search_text": "Switzerland",
    }),
    # PCSX (Eightfold) — Microsoft has a public position search endpoint
    ("microsoft", "Microsoft", "pcsx", {
        "domain": "microsoft.com",
        "location": "Switzerland",
    }),
    # Sygnum — WordPress AJAX endpoint returns clean JSON.
    # NOTE(review): the _wpnonce value is hard-coded; WordPress nonces normally expire,
    # so confirm the endpoint ignores it or refresh it if this adapter starts failing.
    ("sygnum", "Sygnum", "wp_ajax", {
        "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
    }),
    # --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
    ("confluent", "Confluent", "ashby", {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
    ("gitlab", "GitLab", "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
    ("clickhouse", "ClickHouse", "greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}),
    ("grafana", "Grafana Labs", "greenhouse", {"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
    # --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
    ("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
    ("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
    ("ldc", "Louis Dreyfus", "smartrecruiters", {"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
    # International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
    # Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
    ("bis", "BIS (Basel)", "rss", {
        "url": "https://www.bis.org/doclist/vacancies.rss",
        "default_location": "Basel, Switzerland",
    }),
    # Coinbase Ventures web3 talent network (Getro collection 1625). Aggregates roles
    # across portfolio companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself —
    # see fetch_getro. CH-filtered + eng title-filtered to stay relevant.
    ("coinbase_ventures", "Coinbase Ventures (web3)", "getro", {
        "collection": 1625,
        "locations": ["Switzerland"],
        "job_functions": ["Software Engineering", "IT", "Data Science"],
        "_title_filter": ENG_TITLE_FILTER,
    }),
    # Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
    # firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
    ("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> elements with a
    # "Learn more about <title>" aria-label link; location lives in the card text
    # (captured via blob mode).
    ("google", "Google", "playwright", {
        "url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
        "wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
        "card": "li:has(a[aria-label*='Learn more about'])",
        "title_sel": "a[aria-label*='Learn more about']",
        "title_sel_attr": "aria-label",
        "title_strip_prefix": "Learn more about ",
        "link_sel": "a[href*='jobs/results/']",
        "link_attr": "href",
        "url_prefix": "https://www.google.com/about/careers/applications/",
        "default_location": "",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
        "cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
    }),
    ("apple", "Apple", "playwright", {
        "url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
        "wait_for": "a[href*='/en-us/details/']",
        "card": "a[href*='/en-us/details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://jobs.apple.com",
        "default_location": "Switzerland",
    }),
    # Meta job links are /profile/job_details/; title + location are in the link text.
    ("meta", "Meta", "playwright", {
        "url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
        "wait_for": "a[href*='/profile/job_details/']",
        "card": "a[href*='/profile/job_details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://www.metacareers.com",
        "default_location": "Zurich, Switzerland",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
    # PhenomPeople pattern (Roche) uses li.jobs-list-item.
    # Card inner text is structured roughly like: "Title | Location | ... | Category | ..."
    # We extract title from first line, full text becomes the "description" so our location
    # filter still sees Switzerland mentions.
    ("roche", "Roche", "playwright", {
        "url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
        "wait_for": "li.jobs-list-item, a.au-target",
        "card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
        "title_attr": "text",
        "link_sel": "a[href]",
        "link_attr": "href",
        "url_prefix": "https://careers.roche.com",
        "default_location": "",
        "cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
        "scroll_count": 6,
        "use_inner_text_as_blob": True,
    }),
    # Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
    ("cisco", "Cisco", "playwright", {
        "url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
        "wait_for": "a[href*='/job/'], div[role='listitem']",
        "card": "div[role='listitem']:has(a[href*='/job/'])",
        "title_sel": "a[href*='/job/']",
        "link_sel": "a[href*='/job/']",
        "link_attr": "href",
        "url_prefix": "https://careers.cisco.com",
        "default_location": "Switzerland",
        "cookie_accept": ["#onetrust-accept-btn-handler"],
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
]

# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
# These surface as a clickable checklist in the report so they're not forgotten.
# Entries are (name, note, url) tuples, as unpacked by write_report().
# (Empty — all current target companies are automated.)
MANUAL_CHECK = []
def http_get_json(url, headers=None, data=None, method="GET"):
    """Send one HTTP request and return the decoded JSON body.

    Args:
        url: Absolute URL to request.
        headers: Optional extra request headers. The mapping is copied, so the
            defaults added here never show up in the caller's dict.
        data: Optional request body. A dict is serialized to JSON (and a JSON
            Content-Type is set unless the caller supplied one); anything else is
            passed to urllib unchanged.
        method: HTTP method name.

    Returns:
        The parsed JSON document.

    Raises:
        urllib.error.URLError / urllib.error.HTTPError: on transport or HTTP errors.
        ValueError: if the body is not valid UTF-8 JSON.
    """
    # Copy first: setdefault() below would otherwise mutate the caller's mapping.
    request_headers = dict(headers or {})
    request_headers.setdefault("User-Agent", USER_AGENT)
    request_headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        request_headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=request_headers, method=method)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))
def fetch_workday(args):
    """Fetch postings from a Workday job-search endpoint, 20 per page.

    Args:
        args: Adapter config with "host", "tenant" and "site" (used to build the
            /wday/cxs/... URL) and an optional "search_text" query.

    Returns:
        A list of job dicts with id, title, location, url, posted and an empty
        description. The location text has the posting's external path appended,
        so whatever the path encodes is also visible to the location filter.
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    jobs, offset, total = [], 0, 0
    while True:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {}, "limit": 20, "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath") or ""
            bullets = p.get("bulletFields")
            jobs.append({
                # First bullet field when present, otherwise the external path.
                "id": bullets[0] if bullets else ext,
                "title": p.get("title", ""),
                "location": (p.get("locationsText") or "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # Keep the last non-zero total: if a later page omits or zeroes "total",
        # pagination must not stop early. An empty page still ends the loop.
        total = data.get("total") or total
        offset += len(postings)
        if not postings or offset >= total:
            break
    return jobs
def fetch_ashby(args):
    """Fetch every posting from an Ashby public job board.

    Args:
        args: Adapter config; "slug" names the board.

    Returns:
        A list of job dicts. The location is the primary location followed by all
        secondary locations, joined with " | "; the plain-text description is
        truncated to 2500 characters.
    """
    slug = args["slug"]
    board = http_get_json(
        f"https://api.ashbyhq.com/posting-api/job-board/{slug}?includeCompensation=true"
    )
    results = []
    for posting in board.get("jobs", []):
        location_names = [posting.get("location", "") or ""]
        for extra in posting.get("secondaryLocations", []) or []:
            # Secondary locations are read as dicts with a "location" key; anything
            # else is stringified.
            if isinstance(extra, dict):
                location_names.append(extra.get("location", ""))
            else:
                location_names.append(str(extra))
        plain_description = posting.get("descriptionPlain") or ""
        results.append({
            "id": posting.get("id"),
            "title": posting.get("title", ""),
            "location": " | ".join(location_names),
            "url": posting.get("jobUrl"),
            "posted": posting.get("publishedAt", ""),
            "description": plain_description[:2500],
            "department": posting.get("department", ""),
        })
    return results
def fetch_greenhouse(args):
    """Fetch every posting from a Greenhouse job board, including descriptions.

    Args:
        args: Adapter config; "board" is the board token.

    Returns:
        A list of job dicts. The location combines the posting's location name
        with its office names; the description is plain text (markup removed,
        whitespace collapsed) truncated to 2500 characters.
    """
    import html as _html

    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs", []):
        loc = (j.get("location") or {}).get("name", "")
        offices = j.get("offices") or []
        office_names = " | ".join(o.get("name", "") for o in offices if isinstance(o, dict))
        loc_blob = " ".join(x for x in [loc, office_names] if x)
        # The Job Board API delivers "content" with its HTML entity-encoded
        # ("&lt;p&gt;..."), so decode it first or the tag regex never matches.
        desc = _html.unescape(j.get("content") or "")
        desc = re.sub(r"<[^>]+>", " ", desc)
        # Second pass resolves entities that were inside the real HTML (&nbsp;, &amp;).
        desc = _html.unescape(desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            "description": desc[:2500],
        })
    return jobs
def fetch_pcsx(args):
    """Query the Eightfold PCSX position-search API on apply.careers.microsoft.com.

    Args:
        args: Adapter config; "domain" is sent as the domain query parameter and
            the optional "location" as a URL-quoted location filter.

    Returns:
        A list of job dicts. Results are paged 50 at a time; paging stops on a
        short page or once 500 results have been requested.
    """
    domain = args["domain"]
    quoted_location = urllib.parse.quote(args.get("location", ""))
    endpoint = "https://apply.careers.microsoft.com/api/pcsx/search"
    referer = f"https://apply.careers.microsoft.com/careers?location={quoted_location}"
    collected = []
    start = 0
    while start < 500:
        payload = http_get_json(
            f"{endpoint}?domain={domain}&query=&location={quoted_location}&start={start}&num=50",
            headers={"Referer": referer},
        )
        positions = (payload.get("data") or {}).get("positions", []) or []
        for position in positions:
            job_ref = position.get("displayJobId") or position.get("id")
            collected.append({
                "id": str(position.get("id")),
                "title": position.get("name", ""),
                "location": " | ".join(position.get("locations") or []),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{job_ref}",
                "posted": position.get("postedTs", ""),
                "description": (position.get("description") or "")[:2000],
            })
        # A page shorter than the requested 50 (including an empty one) is the last.
        if len(positions) < 50:
            break
        start += len(positions)
    return collected
def fetch_smartrecruiters(args):
    """Fetch postings from the SmartRecruiters public postings API.

    Args:
        args: Adapter config; "company" is the SmartRecruiters company identifier.

    Returns:
        A list of job dicts. The location is the full location (or city) plus
        "Remote"/"Hybrid" flags; the description holds the department and function
        labels. Paging is 100 per request and stops at 300 postings.
    """
    def label_of(value):
        # Department/function are only used when they arrive as dicts.
        return value.get("label", "") if isinstance(value, dict) else ""

    company = args["company"]
    endpoint = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"
    collected = []
    offset = 0
    while True:
        payload = http_get_json(f"{endpoint}?limit=100&offset={offset}")
        postings = payload.get("content", []) or []
        for posting in postings:
            place = posting.get("location") or {}
            location_bits = [place.get("fullLocation") or place.get("city") or ""]
            for flag, text in (("remote", "Remote"), ("hybrid", "Hybrid")):
                if place.get(flag):
                    location_bits.append(text)
            labels = [label_of(posting.get("department")), label_of(posting.get("function"))]
            collected.append({
                "id": str(posting.get("id")),
                "title": posting.get("name", ""),
                "location": " ".join(bit for bit in location_bits if bit),
                "url": f"https://jobs.smartrecruiters.com/{company}/{posting.get('id')}",
                "posted": posting.get("releasedDate", ""),
                "description": " ".join(text for text in labels if text),
            })
        offset += len(postings)
        exhausted = not postings or offset >= payload.get("totalFound", 0)
        if exhausted or offset >= 300:
            break
    return collected
def fetch_rss(args):
    """Parse an RSS/RDF feed into job dicts.

    Looks first for RSS 1.0 (RDF) <item> elements in the http://purl.org/rss/1.0/
    namespace and falls back to plain, un-namespaced <item> elements. The feed is
    not read for a location, so every job gets args["default_location"] (an empty
    string if the config omits it).

    Args:
        args: Adapter config with "url" and, optionally, "default_location".

    Returns:
        A list of job dicts. The id is the item link, or the title when the link
        is missing; the description has tags removed and is cut to 1500 characters.
    """
    import xml.etree.ElementTree as ET

    namespaces = {
        "rss1": "http://purl.org/rss/1.0/",
        "dc": "http://purl.org/dc/elements/1.1/",
    }

    def text_of(item, tag):
        # Namespaced child first, then the bare tag; missing or empty -> "".
        node = item.find(f"rss1:{tag}", namespaces)
        if node is None:
            node = item.find(tag)
        if node is None or not node.text:
            return ""
        return node.text.strip()

    request = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=30) as resp:
        root = ET.fromstring(resp.read())

    items = root.findall(".//rss1:item", namespaces) or root.findall(".//item")
    parsed = []
    for item in items:
        link = text_of(item, "link")
        title = text_of(item, "title")
        posted = item.findtext("dc:date", default="", namespaces=namespaces)
        if not posted:
            posted = text_of(item, "date")
        parsed.append({
            "id": link or title,
            "title": title,
            "location": args.get("default_location", ""),
            "url": link,
            "posted": posted,
            "description": re.sub(r"<[^>]+>", " ", text_of(item, "description"))[:1500],
        })
    return parsed
def fetch_wp_ajax(args):
    """Fetch jobs from a WordPress admin-ajax style endpoint that returns a JSON list.

    Args:
        args: Adapter config; "url" is the full endpoint URL.

    Returns:
        A list of job dicts, or [] when the response is not a JSON list. The
        entries read here carry no id field, so the id is built from
        "title|location" (capped at 120 characters). Missing or null fields are
        treated as empty strings, and entries that are not JSON objects are skipped.
    """
    def text(entry, key):
        # Null/missing values become "" so one sparse row cannot fail the whole fetch.
        return str(entry.get(key) or "")

    url = args["url"]
    data = http_get_json(url)
    if not isinstance(data, list):
        return []
    jobs = []
    for j in data:
        if not isinstance(j, dict):
            continue
        title, location = text(j, "title"), text(j, "location")
        jobs.append({
            "id": (title + "|" + location)[:120],
            "title": title,
            "location": " ".join(filter(None, [location, text(j, "work_type")])),
            "url": j.get("application_url") or url,
            "posted": "",
            "description": " ".join(filter(None, [text(j, "department"), text(j, "role_type")])),
        })
    return jobs
def fetch_getro(args):
    """Search a Getro network job board (POST JSON) and return its jobs.

    Getro powers VC portfolio talent networks; here it is the Coinbase Ventures
    web3 network (collection 1625), which lists roles across portfolio companies
    rather than Coinbase itself. Because the board spans many companies, the
    organization name is appended to each title.

    Args:
        args: Adapter config with "collection" and optional "locations" and
            "job_functions", which are sent as the searchable_locations and
            job_functions filters.

    Returns:
        A list of job dicts. At most 10 pages of 100 hits are requested.
    """
    endpoint = f"https://api.getro.com/api/v2/collections/{args['collection']}/search/jobs"
    filters = {}
    for arg_name, filter_name in (("locations", "searchable_locations"),
                                  ("job_functions", "job_functions")):
        if args.get(arg_name):
            filters[filter_name] = args[arg_name]

    collected = []
    for page in range(10):
        payload = http_get_json(endpoint, method="POST", data={
            "hitsPerPage": 100, "page": page, "query": "", "filters": filters,
        })
        results = payload.get("results", {}) or {}
        batch = results.get("jobs", []) or []
        for hit in batch:
            org = (hit.get("organization") or {}).get("name", "")
            places = hit.get("searchable_locations") or hit.get("locations") or []
            if isinstance(places, list):
                location = " | ".join(places)
            else:
                location = str(places)
            # created_at is only converted when numeric; it is treated as epoch seconds.
            created = hit.get("created_at")
            if isinstance(created, (int, float)):
                posted = datetime.fromtimestamp(created, tz=timezone.utc).strftime("%Y-%m-%d")
            else:
                posted = ""
            title = hit.get("title", "")
            description_bits = [org] + (hit.get("skills") or [])
            collected.append({
                "id": str(hit.get("id")),
                "title": f"{title} @ {org}" if org else title,
                "location": location,
                "url": hit.get("url", ""),
                "posted": posted,
                "description": " ".join(bit for bit in description_bits if bit),
            })
        if not batch or len(collected) >= results.get("count", 0):
            break
    return collected
def fetch_onlyfy(args):
    """Scrape an onlyfy.jobs board (XING E-Recruiting / ex-Prinzip), used by Bitcoin Suisse.

    The candidate/job/ajax_list endpoint returns an HTML fragment listing every
    posting; each card carries a title link and a location cell flagged by an
    <i> element with the icon-map-marker class. Titles and locations appear in
    document order, one of each per card, so both lists are extracted and zipped.
    No JSON API and no headless browser needed.

    Args:
        args: Adapter config; "slug" is the board's onlyfy.jobs subdomain.

    Returns:
        A list of job dicts. The id is the last path segment of the job link and
        the location doubles as the description.
    """
    import html as _html

    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    url = (f"{base}/candidate/job/ajax_list"
           f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        page = resp.read().decode("utf-8", "replace")

    # NOTE(review): both patterns below were reconstructed — the previous title
    # pattern had a single group, which cannot be unpacked into (href, title).
    # They assume relative /job/<id> links and an <i ...icon-map-marker...></i>
    # marker; confirm against the live markup.
    anchors = re.findall(r'<a\b[^>]*\bhref="(/job/[^"]+)"[^>]*>(.*?)</a>', page, re.S)
    locs = re.findall(r'icon-map-marker[^>]*>(?:\s*</i>)?\s*([^<]+)', page)

    # Keep one titled link per job so a card that links the same job twice (image
    # and title, say) cannot shift the title/location pairing.
    titles, seen_hrefs = [], set()
    for href, raw_title in anchors:
        title = _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
        if not title or href in seen_hrefs:
            continue
        seen_hrefs.add(href)
        titles.append((href, title))

    jobs = []
    # zip() stops at the shorter list; the pairing relies on document order.
    for (href, title), raw_loc in zip(titles, locs):
        loc = _html.unescape(raw_loc).strip()
        jobs.append({
            "id": href.rsplit("/", 1)[-1],
            "title": title,
            "location": loc,
            "url": base + href,
            "posted": "",
            "description": loc,
        })
    return jobs
# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# The script overrides, in order: navigator.webdriver (reports undefined), window.chrome
# (stub object), navigator.plugins (non-empty list), navigator.languages, and
# navigator.permissions.query (answers 'notifications' queries from
# Notification.permission and delegates every other query to the original function).
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _q = navigator.permissions && navigator.permissions.query;
if (_q) {
navigator.permissions.query = (p) => p && p.name === 'notifications'
? Promise.resolve({state: Notification.permission}) : _q(p);
}
"""
# Process-wide Playwright driver ("pw") and browser, created on first use by
# _get_browser() and torn down by _close_browser().
_playwright_singleton = {"pw": None, "browser": None}


def _get_browser():
    """Return the shared headless Chromium browser, launching it on first use.

    Sharing one browser saves ~3s per company.

    Returns:
        The cached Playwright browser object.

    Raises:
        RuntimeError: if the playwright package is not installed.
    """
    if _playwright_singleton["browser"] is not None:
        return _playwright_singleton["browser"]
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
    pw = sync_playwright().start()
    try:
        browser = pw.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
    except Exception:
        # The driver is not cached yet, so _close_browser() could never stop it:
        # stop it here rather than leak it when the launch fails.
        pw.stop()
        raise
    _playwright_singleton["pw"] = pw
    _playwright_singleton["browser"] = browser
    return browser
def _absolutize(href, prefix):
    """Join a possibly-relative href with the configured prefix.

    Args:
        href: Link value read from the page; may be empty or None.
        prefix: Base URL to prepend. When empty, href is returned unchanged.

    Returns:
        href itself when it is empty, already an http(s) URL, or no prefix is
        configured; a protocol-relative href ("//host/path") completed with the
        prefix's scheme; otherwise prefix + "/" + href with leading "/", "./"
        and "../" segments removed.
    """
    if not href or href.startswith(("http://", "https://")):
        return href
    if not prefix:
        return href
    if href.startswith("//"):
        # Protocol-relative link: it already names its host, so only add a scheme.
        scheme = urllib.parse.urlsplit(prefix).scheme or "https"
        return f"{scheme}:{href}"
    # Drop whole leading path segments only. str.lstrip("./") strips a character
    # set and would also eat the dot of a name such as ".well-known".
    cleaned = re.sub(r"^(?:\.{0,2}/)+", "", href)
    return prefix.rstrip("/") + "/" + cleaned
def _close_browser():
    """Shut down the shared browser and Playwright driver, if they were started.

    Shutdown is best-effort: errors from close()/stop() are ignored. The singleton
    is cleared first so a later _get_browser() call launches a fresh browser
    instead of handing back a closed one.
    """
    browser = _playwright_singleton["browser"]
    pw = _playwright_singleton["pw"]
    _playwright_singleton["browser"] = None
    _playwright_singleton["pw"] = None
    if browser:
        try:
            browser.close()
        except Exception:
            pass  # best-effort teardown at the end of the run
    if pw:
        try:
            pw.stop()
        except Exception:
            pass  # best-effort teardown at the end of the run
def _pw_accept_cookies(page, selectors):
    """Click the first cookie-banner button that becomes visible within 2s."""
    for sel in selectors:
        button = page.locator(sel).first
        try:
            # Locator.is_visible() ignores its timeout and returns immediately, so a
            # banner rendered slightly late was never clicked. wait_for() really waits.
            button.wait_for(state="visible", timeout=2000)
            button.click()
            page.wait_for_timeout(500)
            return
        except Exception:
            continue  # not shown or not clickable: try the next alternative


def _pw_card_title(card, args):
    """Extract a card's title according to the configured title_* options."""
    title = ""
    if args.get("title_attr") == "text":
        title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
    elif args.get("title_attr"):
        title = (card.get_attribute(args["title_attr"]) or "").strip()
    elif args.get("title_sel"):
        target = card.locator(args["title_sel"]).first
        if target.count():
            # Read either an attribute (e.g. aria-label) or the inner text
            if args.get("title_sel_attr"):
                title = (target.get_attribute(args["title_sel_attr"]) or "").strip()
            else:
                title = (target.inner_text() or "").strip()
    prefix = args.get("title_strip_prefix")
    if prefix and title.startswith(prefix):
        title = title[len(prefix):].strip()
    if not title:
        # Last resort: first line of the card text.
        title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
    return title


def _pw_card_job(card, args, index):
    """Build the job dict for one card, or return None when it has no title."""
    title = _pw_card_title(card, args)
    if not title:
        return None
    location = args.get("default_location", "")
    if args.get("location_sel"):
        loc_el = card.locator(args["location_sel"]).first
        if loc_el.count():
            location = (loc_el.inner_text() or location).strip()
    link_el = card.locator(args["link_sel"]).first if args.get("link_sel") else card
    href = ""
    if link_el.count():
        href = link_el.get_attribute(args.get("link_attr", "href")) or ""
    href = _absolutize(href, args.get("url_prefix", ""))
    description = ""
    if args.get("use_inner_text_as_blob"):
        # Use the full card text as both location source and description
        full = card.inner_text() or ""
        description = full[:2000]
        if not location:
            location = full[:300]
    return {
        # Cards without a link get a positional id based on the listing URL.
        "id": href or f"{args['url']}#{index}",
        "title": title,
        "location": location,
        "url": href or args["url"],
        "posted": "",
        "description": description,
    }


def fetch_playwright(args):
    """Generic headless-browser scraper. See COMPANIES entries for selector args.

    Args:
        args: Adapter config. Required: "url" (listing page) and "card" (selector
            for one job card). Optional: "cookie_accept" (selectors of consent
            buttons), "wait_for" (selector awaited before scraping),
            "scroll_count" (wheel scrolls, default 3), "max_cards" (default 150),
            "title_attr" / "title_sel" / "title_sel_attr" / "title_strip_prefix",
            "location_sel", "default_location", "link_sel", "link_attr",
            "url_prefix" and "use_inner_text_as_blob".

    Returns:
        A list of job dicts, de-duplicated by id (first occurrence wins). Cards
        that raise while being read, or that yield no title, are skipped.
    """
    browser = _get_browser()
    ctx = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        locale="en-US",
        viewport={"width": 1366, "height": 768},
    )
    jobs = []
    try:
        # Everything after context creation sits inside the try so the context is
        # closed even if page setup fails.
        ctx.add_init_script(STEALTH_JS)
        page = ctx.new_page()
        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")
        # Optional cookie banner acceptance
        _pw_accept_cookies(page, args.get("cookie_accept") or [])
        # Wait for job content to render
        wait_for = args.get("wait_for")
        if wait_for:
            try:
                page.wait_for_selector(wait_for, timeout=15000)
            except Exception:
                # Selector never appeared: pause and scrape whatever is there.
                page.wait_for_timeout(4000)
        # Scroll a few times to trigger any lazy-loaded results
        for _ in range(args.get("scroll_count", 3)):
            try:
                page.mouse.wheel(0, 4000)
                page.wait_for_timeout(700)
            except Exception:
                break
        cards = page.locator(args["card"])
        n = min(cards.count(), args.get("max_cards", 150))
        for i in range(n):
            try:
                job = _pw_card_job(cards.nth(i), args, i)
            except Exception:
                continue  # one unreadable card must not abort the whole listing
            if job:
                jobs.append(job)
    finally:
        ctx.close()
    # Deduplicate within a single company by id
    unique = {}
    for job in jobs:
        unique.setdefault(job["id"], job)
    return list(unique.values())
# Dispatch table: adapter name (third field of each COMPANIES entry) -> fetch function.
# main() looks the name up here and calls the function with the entry's args dict.
# Every adapter returns a list of job dicts (id, title, location, url, posted, description).
ADAPTERS = {
    "workday": fetch_workday,
    "ashby": fetch_ashby,
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "smartrecruiters": fetch_smartrecruiters,
    "rss": fetch_rss,
    "getro": fetch_getro,
    "onlyfy": fetch_onlyfy,
    "playwright": fetch_playwright,
}
def _has_location_keyword(text, keywords):
    """Return True if any keyword occurs in text as a whole word or phrase."""
    # Boundaries are "not a letter or digit", so "bern" no longer matches inside
    # "San Bernardino" nor "uk" inside "Milwaukee", while hyphen-, slash- and
    # underscore-separated forms such as "Switzerland-Zurich" still match.
    return any(
        re.search(rf"(?<![^\W_]){re.escape(kw)}(?![^\W_])", text)
        for kw in keywords
    )


def location_matches(loc_text, *, ch_keywords=None, remote_keywords=None,
                     us_only_patterns=None, eu_hint_keywords=None):
    """Classify a location string as Swiss and/or remote-eligible.

    Args:
        loc_text: Free-form location text; may be empty or None.
        ch_keywords: Swiss markers (default: CH_LOCATION_KEYWORDS).
        remote_keywords: Remote markers (default: REMOTE_KEYWORDS).
        us_only_patterns: US-only remote phrases (default: US_ONLY_PATTERNS).
        eu_hint_keywords: Europe/global hints (default: EU_HINT_KEYWORDS).

    Returns:
        A tuple (in_ch, is_remote). is_remote is True only for a remote posting
        that is not a US-only listing and carries at least one EU/global hint.
    """
    if not loc_text:
        return False, False
    ch_keywords = CH_LOCATION_KEYWORDS if ch_keywords is None else ch_keywords
    remote_keywords = REMOTE_KEYWORDS if remote_keywords is None else remote_keywords
    us_only_patterns = US_ONLY_PATTERNS if us_only_patterns is None else us_only_patterns
    eu_hint_keywords = EU_HINT_KEYWORDS if eu_hint_keywords is None else eu_hint_keywords

    low = loc_text.lower()
    in_ch = _has_location_keyword(low, ch_keywords)
    has_remote = _has_location_keyword(low, remote_keywords)
    # US-only phrases stay plain substrings: they are multi-word patterns, and a
    # prefix match ("remote - us" inside "remote - usa") is wanted here.
    is_us_only = any(p in low for p in us_only_patterns) and not in_ch
    has_eu_hint = _has_location_keyword(low, eu_hint_keywords)
    # Count as remote-eligible only if it isn't a US-only remote listing
    # and it has at least one EU/global hint
    is_remote = has_remote and not is_us_only and has_eu_hint
    return in_ch, is_remote
def _score_keyword_regex(keyword):
    """Build the whole-word regex for one scoring keyword ("" if it has no words)."""
    words = keyword.split()
    if not words:
        return ""
    # Words may be separated by any whitespace run (descriptions contain newlines).
    core = r"\s+".join(re.escape(word) for word in words)
    # Multi-word phrases are specific enough to also accept "-ing" forms
    # ("ml engineer" ~ "ML Engineering"); single words only accept a plural "s",
    # so "lead" does not match "leading" and "intern" does not match "internal".
    suffix = r"(?:s|ing)?" if len(words) > 1 else r"s?"
    return rf"(?<![^\W_]){core}{suffix}(?![^\W_])"


def score_job(job, title_only=False, *, positive=None, negative=None):
    """Score a job against the weighted keyword tables.

    Keywords are matched as whole words/phrases, case-insensitively. Plain
    substring matching mis-scored common text: "intern" hit "international",
    "asic" hit "basic", "rag" hit "storage" and "java" hit "javascript".

    Args:
        job: Job dict; only "title" and "description" are read.
        title_only: Score the title alone. Title-filtered high-volume boards use
            this — the title filter already gated relevance, and scoring the full
            JD body over-inflates (every "python"/"data" mention adds points),
            flooding the medium bucket.
        positive: Keyword -> weight table (default: POSITIVE_KEYWORDS).
        negative: Keyword -> weight table (default: NEGATIVE_KEYWORDS).

    Returns:
        A tuple (score, matched_positive_keywords, matched_negative_keywords).
        Keywords are reported exactly as written in the tables.
    """
    positive = POSITIVE_KEYWORDS if positive is None else positive
    negative = NEGATIVE_KEYWORDS if negative is None else negative
    if title_only:
        blob = (job.get("title") or "").lower()
    else:
        blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()

    score, pos, neg = 0, [], []
    for table, hits in ((positive, pos), (negative, neg)):
        for kw, weight in table.items():
            pattern = _score_keyword_regex(kw)
            if pattern and re.search(pattern, blob):
                score += weight
                hits.append(kw)
    return score, pos, neg
def load_seen():
    """Return the parsed contents of STATE_FILE, or {} when the file does not exist.

    main() uses the result as a mapping of company id -> {job id: record}.
    """
    if not STATE_FILE.exists():
        return {}
    raw = STATE_FILE.read_text(encoding="utf-8")
    return json.loads(raw)
def save_seen(seen):
    """Write the seen-jobs mapping to STATE_FILE as indented UTF-8 JSON.

    The parent directory is created if needed. Non-ASCII characters are written
    as-is rather than escaped.
    """
    state_dir = STATE_FILE.parent
    state_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(seen, indent=2, ensure_ascii=False)
    STATE_FILE.write_text(serialized, encoding="utf-8")
def write_report(path, results, errors, new_only, include_weak):
    """Render the markdown report and write it to path.

    Args:
        path: Destination file (a pathlib.Path); overwritten if it exists.
        results: Result dicts built by main(), already sorted for display.
        errors: (company display name, error text) pairs for failed adapters.
        new_only: Whether results were restricted to new jobs (noted in the heading).
        include_weak: Also list the weak bucket (score < 2) instead of only
            reporting how many roles were hidden.
    """
    # UTC, matching the date main() uses for the report filename and first_seen;
    # a local-time date could disagree with the filename around midnight.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Total matches from automated companies: **{len(results)}** ({n_new} new since last run)\n",
    ]
    if errors:
        lines.append("## Errors\n")
        lines.extend(f"- **{company}**: {err}" for company, err in errors)
        lines.append("")

    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak = [r for r in results if r["score"] < 2]
    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")

    buckets = [("Strong fit (score >= 6)", strong),
               ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))
    for bucket_name, bucket in buckets:
        if not bucket:
            continue
        lines.append(f"\n## {bucket_name} - {len(bucket)} role(s)\n")
        for r in bucket:
            new_tag = " [NEW]" if r["is_new"] else ""
            # A Swiss location takes precedence over remote eligibility in the tag.
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    if MANUAL_CHECK:
        lines.append("\n## Manual check (companies without scrapable APIs)\n")
        lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
        lines.append("Open each link, scan for new postings since your last quarterly review:\n")
        for name, note, url in MANUAL_CHECK:
            lines.append(f"- [ ] **{name}** — {note}: <{url}>")
        lines.append("")
    path.write_text("\n".join(lines), encoding="utf-8")
def main():
    """Run the scout: fetch, filter, score, update seen-state and write the report.

    Command-line flags (read from sys.argv):
        --only=<id>      fetch a single company by its COMPANIES id
        --new-only       report only jobs not seen on a previous run
        --include-weak   also list the weak/noise bucket

    Adapter failures are collected and listed in the report instead of aborting
    the run. The shared headless browser is always closed, even on an
    unexpected error.
    """
    only, new_only, include_weak = None, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]

    seen = load_seen()
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    all_results, errors = [], []
    matched_any = False
    try:
        for cid, display, adapter, args in COMPANIES:
            if only and cid != only:
                continue
            matched_any = True
            print(f"Fetching {display}...", file=sys.stderr)
            try:
                jobs = ADAPTERS[adapter](args)
            except (urllib.error.URLError, urllib.error.HTTPError, ValueError) as e:
                errors.append((display, repr(e)))
                continue
            except Exception as e:
                errors.append((display, f"unexpected: {e!r}"))
                continue
            # Optional per-company title prefilter for high-volume boards
            title_filter = args.get("_title_filter")
            if title_filter:
                jobs = [j for j in jobs
                        if any(k in (j.get("title") or "").lower() for k in title_filter)]
            company_seen = seen.setdefault(cid, {})
            title_seen = set()
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                in_ch, is_remote = location_matches(j.get("location", ""))
                if not (in_ch or is_remote):
                    continue
                # Collapse the same role posted once per remote country (title differs only
                # by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
                norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
                if norm_title in title_seen:
                    continue
                title_seen.add(norm_title)
                is_new = jid not in company_seen
                score, pos, neg = score_job(j, title_only=bool(title_filter))
                all_results.append({
                    "company": display, "company_id": cid,
                    "title": j["title"], "location": j["location"],
                    "url": j["url"], "posted": j.get("posted", ""),
                    "score": score, "pos": pos, "neg": neg,
                    "in_ch": in_ch, "remote": is_remote, "is_new": is_new,
                })
                # Record only on first sight, so "first_seen" keeps the date the job
                # was first reported instead of being reset on every run.
                if is_new:
                    company_seen[jid] = {"title": j["title"], "first_seen": today}
    finally:
        _close_browser()

    if only and not matched_any:
        # A mistyped --only id would otherwise silently produce an empty report.
        print(f"Warning: no company with id {only!r} in COMPANIES", file=sys.stderr)

    save_seen(seen)
    if new_only:
        all_results = [r for r in all_results if r["is_new"]]
    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak)
    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new)", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)
# === Adapter coverage (refreshed 2026-05-24) ==================================
# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
#
# Automated (COMPANIES above):
# workday nvidia, novartis
# ashby kraken, openai, confluent
# greenhouse anthropic, gitlab, clickhouse, grafana
# pcsx microsoft (Eightfold position-search endpoint)
# wp_ajax sygnum (WordPress admin-ajax JSON)
# smartrecruiters metgroup, vitol, ldc
# rss bis (vacancies.rss — RSS 1.0/RDF)
# getro coinbase_ventures (web3 portfolio network, collection 1625)
# onlyfy bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment)
# playwright google, apple, meta, roche, cisco (headless browser, 3-15s each)
#
# Since the 2026-05-21 probe, seven originally-manual sites moved to automated:
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
#
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor).
#
# MANUAL_CHECK is now empty — every current target company is automated.
# ==============================================================================
# Script entry point: run the scout only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()