Files
claude-resume-kit/job_scout/scout.py
T
dennisthiessen 8a5955c0a8 feat(job_scout): decision log + report annotations
Track per-job decisions across runs so we don't re-evaluate roles.

- state/decisions.json (keyed by URL: company/title/decision/note/date), now
  git-tracked while seen_jobs.json stays local
- --decide "<url>" <status> [note] records a decision; --hide-decided gives an
  undecided-only view; report tags each role inline with its decision
- usage docstring updated
- seed 18 decisions (9 shortlist, 7 skip, 1 paused, 1 maybe); flags Google Staff
  FDE GenAI as the paused prior session

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-01 15:30:49 +02:00

1299 lines
60 KiB
Python

"""Job scout for Dennis's quarterly target companies.
Pulls latest openings from companies via public ATS APIs (Workday/Ashby/Greenhouse/
SmartRecruiters/Lever/Eightfold/RSS) and, for JS-rendered careers sites, a headless-browser
(playwright) adapter. Filters by Swiss location or remote eligibility, scores fit against
profile keywords, tracks which job IDs we've already seen, writes a markdown report.
Usage:
py scout.py # Pull all configured companies (strong + medium only)
py scout.py --only=nvidia # Pull a single company by id
py scout.py --new-only # Report only jobs not seen before
py scout.py --include-weak # Include weak/noise bucket (default hidden)
py scout.py --hide-decided # Drop roles already in the decision log (undecided-only view)
py scout.py --decide "<url>" <status> [note...] # Record a decision and exit
# status is free-text: shortlist | skip | applied | paused | ...
State : state/seen_jobs.json (job IDs seen) · state/decisions.json (per-URL decisions)
Output: reports/YYYY-MM-DD.md (scan-stats table + scored roles, decisions tagged inline)
To add a company: append to COMPANIES with one of the existing adapter types. A few sites
resist scraping even headless and stay in MANUAL_CHECK (surfaced as a report checklist).
See the adapter-coverage notes at the bottom for the current automated/manual split.
"""
import json
import re
import sys
import time
from functools import lru_cache
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path
# Filesystem layout, anchored at the directory containing this script.
ROOT = Path(__file__).parent
# Record of job IDs already seen (module docstring, "State").
STATE_FILE = ROOT / "state" / "seen_jobs.json"
# Per-URL decision log (module docstring, "State").
DECISIONS_FILE = ROOT / "state" / "decisions.json"
# Markdown reports, one per run date: reports/YYYY-MM-DD.md (module docstring, "Output").
REPORTS_DIR = ROOT / "reports"
# User-Agent header sent by the urllib-based adapters (http_get_json, fetch_rss, fetch_onlyfy).
USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)"
# Lowercase Swiss place/country names (EN/DE/FR spellings) used to recognise Swiss locations.
# NOTE(review): the matching code is outside this view — presumably substring tests against
# lowercased location text; confirm before adding mixed-case entries.
CH_LOCATION_KEYWORDS = [
"switzerland", "zurich", "zürich", "basel", "bern", "geneva", "genf",
"lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
]
# Lowercase phrases that mark a posting as remote-eligible.
REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]
# Lowercase phrases for remote roles restricted to the United States.
# NOTE(review): presumably used to reject "remote" matches that are US-only — confirm in the
# location filter.
US_ONLY_PATTERNS = [
"remote - us", "remote, us", "remote-us", "us remote", "us-remote",
"remote-friendly us", "remote (us)", "united states - remote",
"remote, united states",
]
# Lowercase European / global location hints; includes every Swiss keyword above.
EU_HINT_KEYWORDS = [
"germany", "france", "spain", "portugal", "ireland", "netherlands",
"sweden", "norway", "finland", "denmark", "poland", "czech",
"romania", "italy", "austria", "belgium", "uk", "united kingdom",
"europe", "emea", "global", "worldwide",
] + CH_LOCATION_KEYWORDS
# Fit-scoring weights: lowercase keyword/phrase -> positive points (1 = weak signal,
# 3 = core target). The scorer that consumes this table is outside this view.
# NOTE(review): several phrases contain each other ("data engineer"/"data engineering",
# "architect"/"cloud architect"); if the scorer uses substring matching these will stack —
# confirm that is intended.
POSITIVE_KEYWORDS = {
"genai": 3, "generative ai": 3, "llm": 3, "large language model": 3,
"applied ai": 3, "applied ml": 3, "ai engineer": 3, "ml engineer": 3,
"mlops": 3, "ai platform": 3, "ml platform": 3,
"python": 2, "java": 2, "data engineer": 2, "data engineering": 2,
# "data scientist" scored modestly (medium, not strong) — secondary to his data-eng/
# platform thesis, but the targeted band at boutiques like QuantCo (see target memory).
"data scientist": 2, "data science": 2,
"solutions architect": 2, "platform engineer": 2,
"ai infrastructure": 2, "inference": 2, "rag": 2, "agentic": 2,
"kubernetes": 1, "docker": 1, "etl": 1, "pipeline": 1,
# Core CV lane — DevOps / data-platform / cloud (was scoring 0; surfaced only via "senior")
"data platform": 3, "platform engineering": 2, "devops": 2,
"sre": 2, "site reliability": 2, "cloud engineer": 2, "cloud": 1,
"software engineer": 1,
# Technical-architect pivot targets (cloud/data/platform = build on his stack; rank above
# bare "solutions architect" pre-sales). Generic "architect" catches the long tail.
"cloud architect": 3, "data architect": 3, "platform architect": 3,
"enterprise architect": 2, "architect": 1,
"crypto": 2, "blockchain": 2, "web3": 2, "solidity": 3,
# Trading / quant-finance — explicit user interest (energy/finance/crypto trading)
"trading": 2, "trader": 2, "quant": 2, "quantitative": 2,
"market data": 2, "low latency": 2, "low-latency": 2, "fix protocol": 2,
"brokerage": 2, "commodity": 1, "execution": 1,
# "solutions architect" (plural) already scored above; add singular + adjacent stack
"solution architect": 2, "c#": 1, ".net": 1,
"senior": 1, "staff": 1, "lead": 1, "principal": 1,
}
# Fit-scoring penalties: lowercase keyword/phrase -> negative points. Grouped by row:
# GPU/compiler/HPC internals, frontend/mobile/UI, hardware design, deep C++, and
# junior/intern levels.
NEGATIVE_KEYWORDS = {
"cuda": -3, "kernel driver": -3, "gpu programming": -3,
"compiler engineer": -3, "pytorch internals": -3, "jax internals": -3,
"rdma": -2, "infiniband": -2, "nccl": -3, "hpc cluster": -2,
"frontend": -3, "front-end": -3, "react native": -3,
"ios engineer": -3, "android engineer": -3, "mobile engineer": -3,
"ui engineer": -2, "ux engineer": -2,
"verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
"physical design": -3, "silicon": -2,
"expert c++": -2, "5+ years c++": -2, "deep c++": -2,
"intern": -5, "internship": -5, "graduate program": -3, "junior": -3,
}
# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
# post mostly non-tech roles). Only keep titles containing one of these specific role
# phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel"
# don't leak in. Matched as case-insensitive substrings against the title only.
# Companies opt in by passing this list as "_title_filter" in their COMPANIES adapter args.
# NOTE(review): because matching is by substring, short entries such as "sre" can also hit
# inside longer, unrelated words — confirm that is acceptable.
ENG_TITLE_FILTER = [
"data engineer", "data engineering", "data platform", "platform engineer",
"data infrastructure", "data architect", "analytics engineer",
"mlops", "ml engineer", "ml platform", "machine learning engineer",
"site reliability", "sre", "backend engineer", "back-end engineer",
"devops engineer", "cloud engineer", "software engineer", "infrastructure engineer",
"kafka", "streaming", "big data", "quantitative developer", "quant developer",
]
# Target-company registry. Each entry is a 4-tuple:
#   (id, display name, adapter type, adapter_args dict)
# The adapter type names the fetch_* function family used for that company (workday, ashby,
# greenhouse, pcsx, smartrecruiters, rss, getro, onlyfy, lever, json, sbb, bkw, playwright).
# Keys starting with "_" (_title_filter, _score_floor) are not read by the fetch_* adapters
# visible in this file section. NOTE(review): presumably consumed by the runner/scorer
# further down — confirm there.
COMPANIES = [
("nvidia", "NVIDIA", "workday", {
"host": "nvidia.wd5.myworkdayjobs.com",
"tenant": "nvidia",
"site": "NVIDIAExternalCareerSite",
"search_text": "Switzerland",
}),
("kraken", "Kraken", "ashby", {"slug": "kraken.com"}),
("openai", "OpenAI", "ashby", {"slug": "openai"}),
("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
("novartis", "Novartis", "workday", {
"host": "novartis.wd3.myworkdayjobs.com",
"tenant": "novartis",
"site": "Novartis_Careers",
"search_text": "Switzerland",
}),
# PCSX (Eightfold) — Microsoft has a public position search endpoint
("microsoft", "Microsoft", "pcsx", {
"domain": "microsoft.com",
"location": "Switzerland",
}),
# --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
# Dropped: ClickHouse (Glassdoor 3.3, 36% recommend, toxic-culture flag — 2026-05).
("confluent", "Confluent", "ashby", {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
("gitlab", "GitLab", "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
("grafana", "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
# --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
# Dropped: Vitol (Glassdoor 3.5, 55% recommend, grueling-hours/toxic flag — 2026-05).
# Dropped: Sygnum (Glassdoor 3.4, 51% recommend, comp 2.3/5 — below 180k bar — 2026-05).
("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
# International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
# Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
("bis", "BIS (Basel)","rss", {
"url": "https://www.bis.org/doclist/vacancies.rss",
"default_location": "Basel, Switzerland",
}),
# Coinbase Ventures web3 talent network (Getro collection 1625). Aggregates roles
# across portfolio companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself —
# see fetch_getro. CH-filtered + eng title-filtered to stay relevant.
("coinbase_ventures", "Coinbase Ventures (web3)", "getro", {
"collection": 1625,
"locations": ["Switzerland"],
"job_functions": ["Software Engineering", "IT", "Data Science"],
"_title_filter": ENG_TITLE_FILTER,
}),
# Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
# firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
# aria-label link; location lives in the card text (captured via blob mode).
("google", "Google", "playwright", {
"url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
"wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
"card": "li:has(a[aria-label*='Learn more about'])",
"title_sel": "a[aria-label*='Learn more about']",
"title_sel_attr": "aria-label",
"title_strip_prefix": "Learn more about ",
"link_sel": "a[href*='jobs/results/']",
"link_attr": "href",
"url_prefix": "https://www.google.com/about/careers/applications/",
"default_location": "",
"scroll_count": 5,
"use_inner_text_as_blob": True,
"cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
}),
("apple", "Apple", "playwright", {
"url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
"wait_for": "a[href*='/en-us/details/']",
"card": "a[href*='/en-us/details/']",
"title_attr": "text",
"link_attr": "href",
"url_prefix": "https://jobs.apple.com",
"default_location": "Switzerland",
}),
# Meta job links are /profile/job_details/<id>; title + location are in the link text.
("meta", "Meta", "playwright", {
"url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
"wait_for": "a[href*='/profile/job_details/']",
"card": "a[href*='/profile/job_details/']",
"title_attr": "text",
"link_attr": "href",
"url_prefix": "https://www.metacareers.com",
"default_location": "Zurich, Switzerland",
"scroll_count": 5,
"use_inner_text_as_blob": True,
}),
# PhenomPeople pattern (Roche) uses li.jobs-list-item.
# Card inner text is structured like: "<title> | Location | <city, country> | Category | ..."
# We extract title from first line, full text becomes the "description" so our location
# filter still sees Switzerland mentions.
("roche", "Roche", "playwright", {
"url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
"wait_for": "li.jobs-list-item, a.au-target",
"card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
"title_attr": "text",
"link_sel": "a[href]",
"link_attr": "href",
"url_prefix": "https://careers.roche.com",
"default_location": "",
"cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
"scroll_count": 6,
"use_inner_text_as_blob": True,
}),
# Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
("cisco", "Cisco", "playwright", {
"url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
"wait_for": "a[href*='/job/'], div[role='listitem']",
"card": "div[role='listitem']:has(a[href*='/job/'])",
"title_sel": "a[href*='/job/']",
"link_sel": "a[href*='/job/']",
"link_attr": "href",
"url_prefix": "https://careers.cisco.com",
"default_location": "Switzerland",
"cookie_accept": ["#onetrust-accept-btn-handler"],
"scroll_count": 5,
"use_inner_text_as_blob": True,
}),
# --- Zürich/Zug high-comp additions (2026-05-31 list review) ---
# Palantir (Lever). Verified: 221 postings on the public board. It's US/London-heavy, so
# Swiss/Schwyz roles are rare but self-surface when posted (the location filter drops the
# US/London bulk). No title filter: his target titles (Forward Deployed Software Engineer,
# Deployment Strategist) aren't in ENG_TITLE_FILTER, so filtering would hide them.
("palantir", "Palantir", "lever", {"slug": "palantir"}),
# QuantCo (Lever — note the trailing-hyphen slug "quantco-"). ~16 roles, most tagged
# "Europe" (hybrid); QuantCo's continental hub is Zürich, so the EU-wide rule in
# location_matches surfaces them. No title filter: the target band is DS/Quant/AI/Cloud
# (see comp analysis), which ENG_TITLE_FILTER would drop; interns/frontend are caught by
# NEGATIVE_KEYWORDS instead.
("quantco", "QuantCo", "lever", {"slug": "quantco-"}),
# --- Bern/Thun local tier — WLB & proximity exception (comp bar relaxed; 2026-06-01) ---
# Wired after live endpoint discovery. ⚠️ German citizen: RUAG classified work may require
# Swiss citizenship — verify per-role before tailoring (see project_target_companies).
# Swissgrid (Aarau): Magnolia CMS JSON endpoint (verified). placeOfWork is a bare city
# (Aarau/Prilly/...), so loc_suffix tags it Switzerland for the CH filter. No title filter
# (small board ~13 roles; lets Data Scientist / Applied-ML roles surface).
("swissgrid", "Swissgrid (Aarau)", "json", {
"url": "https://www.swissgrid.ch/.rest/cloud/component-data?path=%2Fswissgrid%2Fen%2Fhome%2Fcareer%2Fjobs%2Fmain%2Fjoblist_transferred_11",
"jobs_key": "jobs",
"field_title": "title", "field_location": "placeOfWork",
"field_url": "descriptionUrl", "field_date": "onlineSince",
"loc_suffix": " Switzerland",
"desc_keys": ["department", "typeOfEmployment", "entryLevel"],
}),
# RUAG (Thun/Bern/Emmen). Jobs render on the portal as anchors to jobs.ruag.ch; the first
# line of each anchor is the title. All sites are Swiss, so default_location=Switzerland
# passes the CH filter. ENG_TITLE_FILTER cuts the apprenticeship/Lehrstelle bulk.
# Drupal portal: 20 jobs/page, server-rendered, paginated via ?page=N (0-indexed). The
# first page is apprenticeship-heavy; eng roles (DevOps/Data/Cloud) are on later pages,
# so we page through until a page adds nothing new (~5-6 pages).
("ruag", "RUAG (Thun/Bern)", "playwright", {
"url": "https://www.ruag.ch/en/working-us/job-portal",
"wait_for": "a[href*='/offene-stellen/']",
"card": "a[href*='/offene-stellen/']",
"title_attr": "text",
"link_attr": "href",
"default_location": "Switzerland",
"scroll_count": 1,
"page_param": "page",
"max_pages": 10,
"_title_filter": ENG_TITLE_FILTER,
}),
# SBB (company.sbb.ch — the correct host; company-jobs.sbb.ch was wrong). AEM job filter
# served as a flat JSON list; the fetch_sbb adapter replicates the user's IT + Bern-region
# filter. German/generic titles, so _score_floor keeps the pre-filtered results visible.
# ⚠️ DE-citizen limits may apply to some SBB security/critical-infra roles.
("sbb", "SBB", "sbb", {
"topic": "IT / Telekommunikation",
"region": "Bern Mittelland",
"_score_floor": 2,
}),
# BKW Group (jobs.bkw.com — the real ATS host). PMS structured-data API; ~600 roles
# group-wide, so fetch_bkw keeps only Berufsfeld categories Informatik/Trading/Finanzen
# (IT/data + energy-trading, incl. the flagged Energiehandel roles). German/generic
# titles, so _score_floor keeps the pre-filtered set visible.
("bkw", "BKW (Bern)", "bkw", {"_score_floor": 2}),
]
# Companies whose careers sites resist automated scraping. They are surfaced in the report
# as a clickable checklist so they're not forgotten. Currently empty — every target company
# is automated. (Dropped 2026-06-01: BFH — academic FH pay below even the relaxed Bern/Thun
# floor, research-leaning, 403s anyway; Dialectic — ~50-person crypto VC, 0 open roles,
# crypto angle already covered by Kraken/Bitcoin Suisse/Coinbase Ventures.)
MANUAL_CHECK = []
def http_get_json(url, headers=None, data=None, method="GET"):
    """Send an HTTP request and return the response body parsed as JSON.

    Args:
        url: Absolute URL to request.
        headers: Optional extra request headers. The mapping is copied before the
            default User-Agent/Accept headers are filled in, so the caller's dict is
            never modified.
        data: Optional request body. A dict is serialised to UTF-8 JSON and gets a JSON
            Content-Type unless one was supplied; any other value is handed to urllib
            unchanged.
        method: HTTP verb; "GET" by default.

    Returns:
        The decoded JSON document (whatever json.loads produces for the body).

    Raises:
        urllib.error.URLError: on connection or HTTP-status failure.
        ValueError: if the body is not valid UTF-8 JSON.
    """
    # Copy so setdefault() below cannot leak defaults into a dict the caller reuses.
    headers = dict(headers or {})
    headers.setdefault("User-Agent", USER_AGENT)
    headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))
def fetch_workday(args):
    """Fetch all postings from a Workday CXS job-search endpoint.

    Args:
        args: Adapter config with required "host", "tenant" and "site", plus an optional
            "search_text" passed to the server-side search.

    Returns:
        A list of job dicts with keys id, title, location, url, posted, description.
        The location also carries the posting's externalPath, and description is always
        empty for this adapter.

    Pages through the endpoint 20 postings at a time until a page is empty or the
    reported total has been reached.
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    jobs, offset, total = [], 0, 0
    while True:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {}, "limit": 20, "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath") or ""
            bullets = p.get("bulletFields")
            jobs.append({
                # First bullet field is used as the ID when present; otherwise the path.
                "id": bullets[0] if bullets else ext,
                "title": p.get("title", ""),
                "location": (p.get("locationsText") or "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # Keep the largest total seen so a later page that reports 0/None cannot end
        # pagination early. NOTE(review): some tenants reportedly send total only on the
        # first page — confirm against a live tenant. An empty page still stops the loop.
        total = max(total, data.get("total") or 0)
        offset += len(postings)
        if not postings or offset >= total:
            break
    return jobs
def fetch_ashby(args):
    """Fetch postings from an Ashby public job board.

    Args:
        args: Adapter config; "slug" is the board name in the posting-api URL.

    Returns:
        A list of job dicts (id, title, location, url, posted, description, department).
        The location is the primary location followed by every secondary location,
        joined with " | "; the plain-text description is capped at 2500 characters.
    """
    slug = args["slug"]
    url = f"https://api.ashbyhq.com/posting-api/job-board/{slug}?includeCompensation=true"
    board = http_get_json(url)

    def to_job(posting):
        # Secondary locations may be dicts with a "location" key or bare values.
        secondary = [
            entry.get("location", "") if isinstance(entry, dict) else str(entry)
            for entry in (posting.get("secondaryLocations", []) or [])
        ]
        primary = posting.get("location", "") or ""
        return {
            "id": posting.get("id"),
            "title": posting.get("title", ""),
            "location": " | ".join([primary, *secondary]),
            "url": posting.get("jobUrl"),
            "posted": posting.get("publishedAt", ""),
            "description": (posting.get("descriptionPlain") or "")[:2500],
            "department": posting.get("department", ""),
        }

    return [to_job(posting) for posting in board.get("jobs", [])]
def fetch_greenhouse(args):
    """Fetch postings (with descriptions) from a Greenhouse job board.

    Args:
        args: Adapter config; "board" is the board token in the boards-api URL.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). The location
        combines the posting's location name with its office names; the description is
        plain text capped at 2500 characters.
    """
    import html as _html

    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs", []):
        loc = (j.get("location") or {}).get("name", "")
        offices = j.get("offices") or []
        office_names = " | ".join(o.get("name", "") for o in offices if isinstance(o, dict))
        loc_blob = " ".join(x for x in [loc, office_names] if x)
        # Greenhouse returns `content` as entity-escaped HTML ("&lt;p&gt;..."), so it must
        # be unescaped before the tag regex can match. Otherwise the markup survives and
        # eats into the 2500-character budget used for keyword scoring.
        desc = _html.unescape(j.get("content") or "")
        desc = re.sub(r"<[^>]+>", " ", desc)
        # A second pass resolves entities that were inside the markup (&amp;, &nbsp;, ...).
        desc = _html.unescape(desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            "description": desc[:2500],
        })
    return jobs
def fetch_pcsx(args):
    """Query the Eightfold PCSX position-search API hosted at apply.careers.microsoft.com.

    Args:
        args: Adapter config with required "domain" and an optional "location" filter.

    Returns:
        A list of job dicts (id, title, location, url, posted, description); descriptions
        are capped at 2000 characters.

    Requests pages of 50 and stops on the first short page or once the start offset
    reaches 500.
    """
    domain = args["domain"]
    location = args.get("location", "")
    base = "https://apply.careers.microsoft.com/api/pcsx/search"
    collected = []
    start = 0
    while start < 500:
        url = f"{base}?domain={domain}&query=&location={urllib.parse.quote(location)}&start={start}&num=50"
        referer = {"Referer": f"https://apply.careers.microsoft.com/careers?location={urllib.parse.quote(location)}"}
        payload = http_get_json(url, headers=referer)
        positions = (payload.get("data") or {}).get("positions", []) or []
        collected.extend(
            {
                "id": str(pos.get("id")),
                "title": pos.get("name", ""),
                "location": " | ".join(pos.get("locations") or []),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{pos.get('displayJobId') or pos.get('id')}",
                "posted": pos.get("postedTs", ""),
                "description": (pos.get("description") or "")[:2000],
            }
            for pos in positions
        )
        # A short (or empty) page means the result set is exhausted.
        if len(positions) < 50:
            break
        start += len(positions)
    return collected
def fetch_smartrecruiters(args):
    """Fetch postings from the SmartRecruiters public postings API.

    Args:
        args: Adapter config; "company" is the company identifier in the API path.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). The location
        is the full location (or city) plus "Remote"/"Hybrid" tags when flagged; the
        description is the department and function labels.

    Pages 100 at a time and stops on an empty page, at the reported totalFound, or once
    300 postings have been read.
    """
    company = args["company"]
    base = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"

    def label_of(posting, key):
        # Only dict-shaped fields carry a label; anything else contributes nothing.
        value = posting.get(key)
        return value.get("label", "") if isinstance(value, dict) else ""

    results = []
    offset = 0
    while True:
        page = http_get_json(f"{base}?limit=100&offset={offset}")
        content = page.get("content", []) or []
        if not content:
            break
        for posting in content:
            where = posting.get("location") or {}
            place = where.get("fullLocation") or where.get("city") or ""
            tags = [
                tag
                for flag, tag in (("remote", "Remote"), ("hybrid", "Hybrid"))
                if where.get(flag)
            ]
            results.append({
                "id": str(posting.get("id")),
                "title": posting.get("name", ""),
                "location": " ".join(filter(None, [place, *tags])),
                "url": f"https://jobs.smartrecruiters.com/{company}/{posting.get('id')}",
                "posted": posting.get("releasedDate", ""),
                "description": " ".join(
                    filter(None, [label_of(posting, "department"), label_of(posting, "function")])
                ),
            })
        offset += len(content)
        if offset >= page.get("totalFound", 0) or offset >= 300:
            break
    return results
def fetch_rss(args):
    """Parse an RSS 1.0 (RDF) or RSS 2.0 feed into job dicts.

    Items are looked up in the http://purl.org/rss/1.0/ namespace first and as plain
    <item> elements otherwise. The same fallback applies to each child field.

    Args:
        args: Adapter config with "url" (the feed) and an optional "default_location",
            used for every job because the feed itself carries no location.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). The id is the
        item link, or the title when there is no link; the description has tags replaced
        by spaces and is capped at 1500 characters.
    """
    import xml.etree.ElementTree as ET

    request = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=30) as resp:
        feed = ET.fromstring(resp.read())
    ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"}

    def text_of(item, tag):
        # Namespaced child first, then the un-namespaced tag; empty string when absent.
        node = item.find(f"rss1:{tag}", ns)
        if node is None:
            node = item.find(tag)
        if node is None or not node.text:
            return ""
        return node.text.strip()

    entries = feed.findall(".//rss1:item", ns) or feed.findall(".//item")
    jobs = []
    for entry in entries:
        link = text_of(entry, "link")
        title = text_of(entry, "title")
        posted = entry.findtext("dc:date", default="", namespaces=ns) or text_of(entry, "date")
        summary = re.sub(r"<[^>]+>", " ", text_of(entry, "description"))
        jobs.append({
            "id": link or title,
            "title": title,
            "location": args.get("default_location", ""),
            "url": link,
            "posted": posted,
            "description": summary[:1500],
        })
    return jobs
def fetch_wp_ajax(args):
    """Fetch jobs from a WordPress admin-ajax style endpoint (the pattern Sygnum uses).

    Args:
        args: Adapter config; "url" is the endpoint and also the fallback job URL.

    Returns:
        A list of job dicts (id, title, location, url, posted, description), or an empty
        list when the endpoint does not return a JSON array. The id is "title|location"
        capped at 120 characters, since the payload fields read here include no ID.
    """
    url = args["url"]
    data = http_get_json(url)
    if not isinstance(data, list):
        return []

    def text(item, key):
        # JSON null (or a missing key) becomes "", so concatenation and join never see None.
        value = item.get(key)
        return "" if value is None else str(value)

    jobs = []
    for j in data:
        if not isinstance(j, dict):
            # Skip malformed rows instead of crashing the whole company fetch.
            continue
        title, location = text(j, "title"), text(j, "location")
        jobs.append({
            "id": (title + "|" + location)[:120],
            "title": title,
            "location": " ".join(filter(None, [location, text(j, "work_type")])),
            "url": j.get("application_url") or url,
            "posted": "",
            "description": " ".join(filter(None, [text(j, "department"), text(j, "role_type")])),
        })
    return jobs
def fetch_getro(args):
    """Search a Getro network job board (POST JSON) and return its jobs.

    Used for the Coinbase Ventures web3 talent network (collection 1625), which lists
    roles at portfolio companies rather than at Coinbase itself. Because the board spans
    many companies, the organisation name is appended to each title as "<title> @ <org>".

    Args:
        args: Adapter config with required "collection" and optional "locations" and
            "job_functions" lists, sent as the searchable_locations / job_functions
            filters.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). posted is a
        UTC YYYY-MM-DD date when created_at is numeric, else "". The description is the
        organisation name followed by the job's skills.

    Reads at most 10 pages of 100 hits, stopping early on an empty page or once the
    reported count has been collected.
    """
    collection = args["collection"]
    url = f"https://api.getro.com/api/v2/collections/{collection}/search/jobs"
    filters = {
        api_name: args[arg_name]
        for api_name, arg_name in (
            ("searchable_locations", "locations"),
            ("job_functions", "job_functions"),
        )
        if args.get(arg_name)
    }
    jobs = []
    for page in range(10):
        data = http_get_json(url, method="POST", data={
            "hitsPerPage": 100, "page": page, "query": "", "filters": filters,
        })
        res = data.get("results", {}) or {}
        batch = res.get("jobs", []) or []
        for hit in batch:
            org = (hit.get("organization") or {}).get("name", "")
            places = hit.get("searchable_locations") or hit.get("locations") or []
            created = hit.get("created_at")
            if isinstance(created, (int, float)):
                posted = datetime.fromtimestamp(created, tz=timezone.utc).strftime("%Y-%m-%d")
            else:
                posted = ""
            title = hit.get("title", "")
            jobs.append({
                "id": str(hit.get("id")),
                "title": f"{title} @ {org}" if org else title,
                "location": " | ".join(places) if isinstance(places, list) else str(places),
                "url": hit.get("url", ""),
                "posted": posted,
                "description": " ".join(filter(None, [org] + (hit.get("skills") or []))),
            })
        if not batch or len(jobs) >= res.get("count", 0):
            break
    return jobs
def fetch_onlyfy(args):
    """Scrape an onlyfy.jobs board (used by Bitcoin Suisse) via its ajax_list endpoint.

    The endpoint returns an HTML fragment. Job anchors (<a href="/job/ID">title</a>) and
    location cells (text after an icon-map-marker <i>) are extracted as two lists in
    document order and paired positionally.
    NOTE(review): a card with no location cell would shift every later pairing, because
    zip() pairs purely by position.

    Args:
        args: Adapter config; "slug" is the sub-domain of onlyfy.jobs.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). The id is the
        last path segment of the job link, posted is always "", and the description
        repeats the location.
    """
    import html as _html

    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    url = (f"{base}/candidate/job/ajax_list"
           f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    request = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(request, timeout=30) as resp:
        fragment = resp.read().decode("utf-8", "replace")

    anchors = re.findall(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', fragment, re.S)
    places = [
        _html.unescape(raw).strip()
        for raw in re.findall(r'icon-map-marker[^>]*></i>\s*([^<]+)', fragment)
    ]
    return [
        {
            "id": href.rsplit("/", 1)[-1],
            # Anchor content may hold nested markup; drop tags, then decode entities.
            "title": _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip(),
            "location": place,
            "url": base + href,
            "posted": "",
            "description": place,
        }
        for (href, raw_title), place in zip(anchors, places)
    ]
def fetch_lever(args):
    """Fetch postings from a Lever public board (used for Palantir and QuantCo).

    Args:
        args: Adapter config; "slug" is the board name in the postings API URL.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). The location
        joins categories.location with every categories.allLocations entry using " | ".
        posted is createdAt (epoch milliseconds) as a UTC YYYY-MM-DD date, or "" when it
        is not numeric; the plain-text description is capped at 2500 characters.
    """
    slug = args["slug"]
    postings = http_get_json(f"https://api.lever.co/v0/postings/{slug}?mode=json")

    def to_job(posting):
        categories = posting.get("categories") or {}
        candidates = [categories.get("location") or ""]
        candidates += [str(place) for place in (categories.get("allLocations") or [])]
        created_ms = posting.get("createdAt")
        if isinstance(created_ms, (int, float)):
            posted = datetime.fromtimestamp(created_ms / 1000, tz=timezone.utc).strftime("%Y-%m-%d")
        else:
            posted = ""
        return {
            "id": posting.get("id"),
            "title": posting.get("text", ""),
            "location": " | ".join(filter(None, candidates)),
            "url": posting.get("hostedUrl"),
            "posted": posted,
            "description": (posting.get("descriptionPlain") or "")[:2500],
        }

    return [to_job(posting) for posting in postings]
def fetch_json(args):
    """Read jobs from a generic JSON endpoint whose field names are configurable.

    Verified use: Swissgrid's Magnolia CMS endpoint. The payload may be a dict holding the
    job list under jobs_key, or a bare list.

    Args:
        args: Adapter config:
            url: endpoint to fetch (required).
            jobs_key: key of the job list inside a dict payload (default "jobs").
            field_title / field_location / field_url / field_date: per-job field names
                (defaults "title" / "location" / "url" / "date").
            url_prefix: joined onto job URLs that don't start with "http".
            loc_suffix: appended to every location, e.g. " Switzerland", so bare city
                names still pass the Swiss location filter.
            desc_keys: extra per-job fields folded into the description for scoring.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). The id falls
        back to the URL when the job has no "id"; the description is capped at 500
        characters.
    """
    payload = http_get_json(args["url"])
    if isinstance(payload, dict):
        records = payload.get(args.get("jobs_key", "jobs"), [])
    else:
        records = payload or []

    title_key = args.get("field_title", "title")
    location_key = args.get("field_location", "location")
    url_key = args.get("field_url", "url")
    date_key = args.get("field_date", "date")
    url_prefix = args.get("url_prefix", "")
    loc_suffix = args.get("loc_suffix", "")
    extra_keys = args.get("desc_keys", [])

    def to_job(record):
        link = record.get(url_key, "") or ""
        if link and not link.startswith("http") and url_prefix:
            link = url_prefix.rstrip("/") + "/" + link.lstrip("/")
        place = (record.get(location_key, "") or "").strip() + loc_suffix
        extras = [str(record.get(key)) for key in extra_keys if record.get(key)]
        return {
            "id": str(record.get("id") or link),
            "title": record.get(title_key, ""),
            "location": place,
            "url": link,
            "posted": record.get(date_key, "") or "",
            "description": " ".join(extras)[:500],
        }

    return [to_job(record) for record in records]
def fetch_sbb(args):
    """Fetch SBB jobs from the AEM job-filter JSON and keep one topic/region slice.

    The board is served as one flat JSON list; each job carries numbered `attributes`
    ('20' = Berufsfeld/topic, '110' = region, '100' = city) and the public URL under
    links.directlink. A job is kept only when both the topic and the region substring
    occur somewhere in its attribute values. Titles are German/generic, so this company
    is paired with a _score_floor in COMPANIES to keep the pre-filtered results visible.

    Args:
        args: Adapter config with optional "url" (defaults to the company.sbb.ch
            jobfilter.results.json endpoint), "topic" (default "IT / Telekommunikation")
            and "region" (default "Bern Mittelland"). An empty topic/region disables that
            filter.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). The location
        is "<city> <region> Schweiz"; the description is the topic plus the job text,
        capped at 400 characters.
    """
    url = args.get("url", ("https://company.sbb.ch/content/internet/corporate/de/"
                           "jobs-karriere/jobs/job-suche/jcr:content/parmain/"
                           "jobfilter.results.json"))
    topic = args.get("topic", "IT / Telekommunikation")
    region = args.get("region", "Bern Mittelland")
    data = http_get_json(url)
    arr = data if isinstance(data, list) else (data.get("results") or data.get("jobs") or [])

    def joined(attrs, key):
        # Attribute values are usually lists, but the filter blob below already allows
        # scalars. Normalise here too, so a bare string is not split into characters and
        # a non-string item does not raise.
        raw = attrs.get(key) or []
        items = raw if isinstance(raw, list) else [raw]
        return " ".join(str(item) for item in items)

    jobs = []
    for j in arr:
        a = j.get("attributes", {}) or {}
        blob = " ".join(str(x) for v in a.values() for x in (v if isinstance(v, list) else [v]))
        if topic and topic not in blob:
            continue
        if region and region not in blob:
            continue
        region_v = joined(a, "110")
        city_v = joined(a, "100")
        field_v = joined(a, "20")
        jobs.append({
            "id": str(j.get("id") or j.get("viewkey") or ""),
            "title": j.get("title", ""),
            "location": f"{city_v} {region_v} Schweiz".strip(),
            "url": (j.get("links") or {}).get("directlink", ""),
            "posted": j.get("start_date", "") or "",
            "description": (field_v + " " + (j.get("text", "") or ""))[:400],
        })
    return jobs
def fetch_bkw(args):
    """Fetch BKW Group jobs from the jobs.bkw.com structured-data API, filtered by category.

    The group-wide board is large and dominated by trades, so only jobs whose Berufsfeld
    category titles contain one of the allowed terms (case-insensitive) are kept. Titles
    are German/generic, so this company is paired with a _score_floor in COMPANIES.

    Args:
        args: Adapter config with optional "url" (defaults to the de-ch structureddata
            endpoint) and "categories" (default Informatik / Trading / Finanzen). An empty
            categories list disables the category filter.

    Returns:
        A list of job dicts (id, title, location, url, posted, description). The location
        is the first location's city and country, or "Schweiz" when neither is present.
        posted is always ""; the description is the category titles plus the subtitle,
        capped at 300 characters.
    """
    url = args.get("url", ("https://jobs.bkw.com/_api/v1/structureddata?"
                           "configFromContentElement=82381&language=de-ch"))
    allow = [term.lower() for term in args.get("categories", ["Informatik", "Trading", "Finanzen"])]
    data = http_get_json(url)

    if isinstance(data, list):
        records = data
    elif isinstance(data, dict):
        # Dict payload: take the first value that looks like a list of job records.
        records = next(
            (
                value for value in data.values()
                if isinstance(value, list) and value
                and isinstance(value[0], dict) and "title" in value[0]
            ),
            [],
        )
    else:
        records = []

    def is_wanted(categories):
        if not allow:
            return True
        return any(term in cat.lower() for cat in categories for term in allow)

    jobs = []
    for record in records:
        kind = record.get("type")
        if kind and kind != "jobs":
            continue
        relations = record.get("relations", {}) or {}
        cats = [entry.get("title", "") for entry in relations.get("Berufsfeld", []) or []]
        if not is_wanted(cats):
            continue
        places = record.get("locations") or []
        first_is_dict = bool(places) and isinstance(places[0], dict)
        addr = (places[0].get("address") if first_is_dict else {}) or {}
        where = " ".join(filter(None, [addr.get("city", ""), addr.get("country", "")])) or "Schweiz"
        jobs.append({
            "id": str(record.get("id") or record.get("url") or ""),
            "title": record.get("title", ""),
            "location": where,
            "url": record.get("url", ""),
            "posted": "",
            "description": " ".join(cats + [record.get("subtitle", "") or ""])[:300],
        })
    return jobs
# JavaScript init script added to each browser context by fetch_playwright via
# ctx.add_init_script(), to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# It overrides navigator.webdriver, navigator.plugins and navigator.languages, defines a
# stub window.chrome object, and answers the 'notifications' permission query from
# Notification.permission.
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _q = navigator.permissions && navigator.permissions.query;
if (_q) {
navigator.permissions.query = (p) => p && p.name === 'notifications'
? Promise.resolve({state: Notification.permission}) : _q(p);
}
"""
_playwright_singleton = {"pw": None, "browser": None}
def _get_browser():
    """Return the shared headless Chromium browser, launching it on first use.

    The browser and its Playwright driver handle are cached in ``_playwright_singleton`` so
    every playwright-backed company reuses one launch (saves ~3s per company).

    Returns:
        The cached (or newly launched) Playwright browser object.

    Raises:
        RuntimeError: if the ``playwright`` package is not installed.
        Exception: whatever ``chromium.launch`` raises; the driver started for that attempt
            is stopped first so a failed launch leaves nothing running behind it.
    """
    cached = _playwright_singleton["browser"]
    if cached is not None:
        return cached
    # Imported lazily so the API-only adapters work without playwright installed.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
    pw = sync_playwright().start()
    try:
        browser = pw.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
    except Exception:
        # Nothing was cached yet, so _close_browser() would never see this driver handle:
        # stop it here (best-effort) and let the launch error propagate unchanged.
        try:
            pw.stop()
        except Exception:
            pass
        raise
    # Only publish the handles once both steps succeeded.
    _playwright_singleton["pw"] = pw
    _playwright_singleton["browser"] = browser
    return browser
def _absolutize(href, prefix):
    """Turn a relative href into a full URL by gluing it onto `prefix`.

    Empty hrefs and hrefs that already start with "http" come back untouched, as does any
    href when no prefix is configured. Otherwise every leading "." and "/" character is
    dropped from the href and the remainder is appended to the prefix with a single "/".
    """
    already_final = not href or href.startswith("http")
    if already_final or not prefix:
        return href
    # lstrip takes a character set, so this removes "./", "../" and "/" prefixes alike.
    tail = href.lstrip("./")
    return f"{prefix.rstrip('/')}/{tail}"
def _close_browser():
    """Shut down the shared browser and the Playwright driver, if they were started.

    Safe to call when nothing was launched and safe to call more than once. The cached
    handles are cleared, so a later ``_get_browser()`` launches a fresh browser instead of
    handing out the closed one. Shutdown is best-effort: errors from ``close()``/``stop()``
    are ignored because there is nothing useful to do about them at teardown.
    """
    browser = _playwright_singleton["browser"]
    pw = _playwright_singleton["pw"]
    # Drop the cached handles before closing so they are cleared even if closing fails.
    _playwright_singleton["browser"] = None
    _playwright_singleton["pw"] = None
    if browser:
        try:
            browser.close()
        except Exception:
            pass
    if pw:
        try:
            pw.stop()
        except Exception:
            pass
def fetch_playwright(args):
"""Generic headless-browser scraper. See COMPANIES entries for selector args.

Loads args["url"] in a fresh context of the shared browser, reads one job per element
matching args["card"], and returns a list of job dicts with the keys
id / title / location / url / posted / description ("posted" is always "").

Keys read from `args`:
url, card              - required: page to open and the selector for one job card
wait_for               - selector to wait for before reading cards
scroll_count           - mouse-wheel scrolls per page (default 3)
max_cards              - cap on cards read per page (default 150)
title_attr             - "text" for the card's first text line, else an attribute name
title_sel              - child selector holding the title
title_sel_attr         - attribute to read from the title_sel element instead of its text
title_strip_prefix     - prefix removed from the title when present
default_location       - location used when none is found
location_sel           - child selector holding the location
link_sel, link_attr    - child selector / attribute for the link (default attribute "href")
url_prefix             - prefix passed to _absolutize for relative links
use_inner_text_as_blob - use the card's full text as description (and location fallback)
cookie_accept          - selectors of cookie-banner buttons to click once
page_param, max_pages  - query-parameter pagination (default max_pages 8)
"""
browser = _get_browser()
# One fresh context per call: desktop Chrome user agent, en-US locale, 1366x768 viewport.
ctx = browser.new_context(
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
locale="en-US",
viewport={"width": 1366, "height": 768},
)
ctx.add_init_script(STEALTH_JS)
page = ctx.new_page()
jobs = []
# Ids already emitted, so a card that shows up again (e.g. on a later page) is added once.
seen_ids = set()
def scrape_current():
"""Extract cards from the currently-loaded page; append new ones to `jobs`.
Returns the count of newly-added (not-yet-seen) cards so a pagination loop can
stop once a page contributes nothing new."""
wait_for = args.get("wait_for")
if wait_for:
try:
page.wait_for_selector(wait_for, timeout=15000)
except Exception:
# Selector did not show up within 15s: pause 4s and carry on instead of failing.
page.wait_for_timeout(4000)
# Scroll a few times to trigger any lazy-loaded results
for _ in range(args.get("scroll_count", 3)):
try:
page.mouse.wheel(0, 4000)
page.wait_for_timeout(700)
except Exception:
break
cards = page.locator(args["card"])
n = min(cards.count(), args.get("max_cards", 150))
added = 0
for i in range(n):
card = cards.nth(i)
try:
# Title source, first configured option wins: title_attr == "text" (first line of the
# card text, max 200 chars), title_attr (attribute on the card), title_sel (child element).
title = ""
if args.get("title_attr") == "text":
title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
elif args.get("title_attr"):
title = (card.get_attribute(args["title_attr"]) or "").strip()
elif args.get("title_sel"):
t = card.locator(args["title_sel"]).first
if t.count():
# Read either an attribute (e.g. aria-label) or the inner text
if args.get("title_sel_attr"):
title = (t.get_attribute(args["title_sel_attr"]) or "").strip()
else:
title = (t.inner_text() or "").strip()
if args.get("title_strip_prefix") and title.startswith(args["title_strip_prefix"]):
title = title[len(args["title_strip_prefix"]):].strip()
# Fallback when no title was found: first line of the card text, max 200 chars.
if not title:
title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
location = args.get("default_location", "")
if args.get("location_sel"):
lsel = card.locator(args["location_sel"]).first
if lsel.count():
location = (lsel.inner_text() or location).strip()
# The link is read from the card itself unless link_sel names a child element.
link_el = card if not args.get("link_sel") else card.locator(args["link_sel"]).first
href = (link_el.get_attribute(args.get("link_attr", "href")) or "") if link_el.count() else ""
href = _absolutize(href, args.get("url_prefix", ""))
if not title:
continue
# Cards without a link get a synthetic id built from the page URL and the card index.
jid = href or f"{page.url}#{i}"
if jid in seen_ids:
continue
seen_ids.add(jid)
added += 1
description = ""
if args.get("use_inner_text_as_blob"):
# Use the full card text as both location source and description
full = (card.inner_text() or "")
description = full[:2000]
if not location:
location = full[:300]
jobs.append({
"id": jid,
"title": title,
"location": location,
"url": href or page.url,
"posted": "",
"description": description,
})
except Exception:
# Best-effort: a card that raises while being read is skipped, not fatal.
continue
return added
try:
page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")
# Optional cookie banner acceptance (once, on the first page)
for sel in args.get("cookie_accept", []) or []:
try:
btn = page.locator(sel).first
if btn.is_visible(timeout=2000):
btn.click()
page.wait_for_timeout(500)
except Exception:
pass
# Optional query-param pagination (e.g. Drupal "?page=N", 0-indexed). The base URL is
# page 0 (already loaded); fetch successive pages until one adds no new cards.
page_param = args.get("page_param")
if page_param:
base = args["url"]
# Append with "&" when the base URL already carries a query string, else start one.
joiner = "&" if "?" in base else "?"
for p in range(args.get("max_pages", 8)):
if p > 0:
page.goto(f"{base}{joiner}{page_param}={p}", timeout=45000,
wait_until="domcontentloaded")
added = scrape_current()
if p > 0 and added == 0:
break
else:
scrape_current()
finally:
# Release the context whatever happened; the shared browser itself is not closed here.
ctx.close()
return jobs
# Adapter registry: the adapter-type string of a COMPANIES entry -> the fetch function that
# main() calls with that entry's args dict.
ADAPTERS = {
    "workday": fetch_workday, "ashby": fetch_ashby, "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx, "wp_ajax": fetch_wp_ajax, "smartrecruiters": fetch_smartrecruiters,
    "rss": fetch_rss, "getro": fetch_getro, "onlyfy": fetch_onlyfy,
    "lever": fetch_lever, "json": fetch_json, "sbb": fetch_sbb,
    "bkw": fetch_bkw, "playwright": fetch_playwright,
}
def location_matches(loc_text):
    """Classify a location string; returns the pair ``(in_ch, is_remote)``.

    ``in_ch`` is True when the lower-cased text contains any CH_LOCATION_KEYWORDS entry.
    ``is_remote`` is True when the text carries an EU/global hint (EU_HINT_KEYWORDS), is not
    a US-only listing, and either mentions a remote keyword or is a pan-European posting
    (literally "europe"/"emea"). Pan-European boards are reachable from the DACH region even
    without the word "remote"; city-specific EU roles still need a remote keyword.
    An empty or missing location yields ``(False, False)``.
    """
    if not loc_text:
        return False, False
    haystack = loc_text.lower()

    def mentions(needles):
        # Plain substring test against the lower-cased location text.
        return any(needle in haystack for needle in needles)

    in_ch = mentions(CH_LOCATION_KEYWORDS)
    remote_word = mentions(REMOTE_KEYWORDS)
    # A US-only pattern is overridden by an explicit Swiss location.
    us_only = mentions(US_ONLY_PATTERNS) and not in_ch
    eu_hint = mentions(EU_HINT_KEYWORDS)
    pan_european = mentions(("europe", "emea"))
    if us_only or not eu_hint:
        return in_ch, False
    return in_ch, remote_word or pan_european
@lru_cache(maxsize=512)
def _kw_pattern(kw):
    """Compile (and cache) the whole-keyword regex for `kw`.

    Substring matching gave false hits ('rag' in 'storage', 'intern' in 'internal'), so the
    keyword must not touch a lowercase letter or digit on either side. A side is only
    guarded when the keyword itself starts/ends with an alphanumeric character, which keeps
    keywords such as 'c#', '.net' and 'c++' matchable. Surrounding whitespace in `kw` is
    ignored.
    """
    word = kw.strip()
    pieces = []
    if word[:1].isalnum():
        pieces.append(r"(?<![a-z0-9])")
    pieces.append(re.escape(word))
    if word[-1:].isalnum():
        pieces.append(r"(?![a-z0-9])")
    return re.compile("".join(pieces))
def _kw_in(kw, text):
    """Return True when `kw` occurs in `text` under the boundary rule of _kw_pattern."""
    return _kw_pattern(kw).search(text) is not None
def score_job(job, title_only=False):
    """Score a job against the profile keywords; returns ``(score, pos, neg)``.

    The title is the reliable signal, while descriptions are full of company boilerplate.
    A positive keyword found in the title therefore counts at its full weight; one found
    only in the description counts at half weight (at least 1). Negative keywords count at
    full weight wherever they appear, since a disqualifier in the body still disqualifies.
    With ``title_only=True`` (title-filtered boards) the description is ignored entirely.
    ``pos`` and ``neg`` list the matched keywords in dictionary order.
    """
    title = (job.get("title") or "").lower()
    body = "" if title_only else (job.get("description") or "").lower()
    total = 0
    positives, negatives = [], []
    for keyword, weight in POSITIVE_KEYWORDS.items():
        if _kw_in(keyword, title):
            gain = weight
        elif body and _kw_in(keyword, body):
            gain = max(1, weight // 2)
        else:
            continue
        total += gain
        positives.append(keyword)
    for keyword, weight in NEGATIVE_KEYWORDS.items():
        if not (_kw_in(keyword, title) or (body and _kw_in(keyword, body))):
            continue
        total += weight
        negatives.append(keyword)
    return total, positives, negatives
def load_seen():
    """Read the seen-jobs state from STATE_FILE; an absent file yields an empty dict."""
    if not STATE_FILE.exists():
        return {}
    raw = STATE_FILE.read_text(encoding="utf-8")
    return json.loads(raw)
def save_seen(seen):
    """Write the seen-jobs state to STATE_FILE as indented UTF-8 JSON, creating its folder."""
    folder = STATE_FILE.parent
    folder.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(seen, indent=2, ensure_ascii=False)
    STATE_FILE.write_text(payload, encoding="utf-8")
def load_decisions():
    """Read the decision log from DECISIONS_FILE; an absent file yields an empty dict.

    The log is keyed by job URL: ``{url: {company, title, decision, note, date}}``. It
    persists across runs so roles that were already judged are not evaluated again. The
    decision value is free text (shortlist / skip / applied / paused / rejected / ...) and
    is not validated.
    """
    if not DECISIONS_FILE.exists():
        return {}
    raw = DECISIONS_FILE.read_text(encoding="utf-8")
    return json.loads(raw)
def save_decisions(decisions):
    """Write the decision log to DECISIONS_FILE as indented UTF-8 JSON, creating its folder."""
    folder = DECISIONS_FILE.parent
    folder.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(decisions, indent=2, ensure_ascii=False)
    DECISIONS_FILE.write_text(payload, encoding="utf-8")
def _parse_posted(s):
    """Best-effort conversion of an adapter's `posted` value into a ``date``.

    Tries, in order: ISO 8601 (a trailing "Z" is accepted), the first ten characters as
    YYYY-MM-DD / DD.MM.YYYY / YYYY/MM/DD / DD/MM/YYYY, and finally any YYYY-MM-DD found
    anywhere in the text. Returns None for empty or non-string input and for values that
    match none of these (e.g. the relative "Posted 5 Days Ago").
    """
    if not s or not isinstance(s, str):
        return None
    text = s.strip()
    try:
        return datetime.fromisoformat(text.replace("Z", "+00:00")).date()
    except ValueError:
        pass
    # Date-only layouts: the date is expected in the leading ten characters.
    head = text[:10]
    for layout in ("%Y-%m-%d", "%d.%m.%Y", "%Y/%m/%d", "%d/%m/%Y"):
        try:
            parsed = datetime.strptime(head, layout)
        except ValueError:
            continue
        return parsed.date()
    # Last resort: an ISO date embedded somewhere in a longer string.
    embedded = re.search(r"\d{4}-\d{2}-\d{2}", text)
    if embedded is None:
        return None
    try:
        return datetime.strptime(embedded.group(0), "%Y-%m-%d").date()
    except ValueError:
        return None
def write_stats_table(stats, total_secs):
    """Build the "Scan stats" markdown table as a list of lines.

    One row per company (flagged with a warning sign when its fetch errored), followed by a
    bold totals row that sums the counts, shows the most recent posting date across all
    companies and the overall run time, and a closing empty line. Missing dates render
    as a dash.
    """
    def as_day(value):
        return value.isoformat() if value else "—"

    table = [
        "## Scan stats\n",
        "| Company | Scraped | CH/Remote | Match ≥2 | Newest posting | Time (s) |",
        "|---|--:|--:|--:|:--|--:|",
    ]
    for row in stats:
        label = row["company"]
        if row.get("error"):
            label += " ⚠️"
        table.append(
            f"| {label} | {row['scraped']:,} | {row['eligible']:,} | "
            f"{row['match']:,} | {as_day(row['newest'])} | {row['secs']:.1f} |"
        )
    scraped_total = sum(row["scraped"] for row in stats)
    eligible_total = sum(row["eligible"] for row in stats)
    match_total = sum(row["match"] for row in stats)
    dated = [row["newest"] for row in stats if row["newest"]]
    latest = max(dated) if dated else None
    table.append(
        f"| **Total ({len(stats)})** | **{scraped_total:,}** | **{eligible_total:,}** | "
        f"**{match_total:,}** | **{as_day(latest)}** | **{total_secs:.1f}** |"
    )
    table.append("")
    return table
def write_report(path, results, errors, new_only, include_weak, stats=None, total_secs=0.0,
                 decisions=None, hide_decided=False):
    """Render the scan results as a markdown report and write it to `path`.

    Args:
        path: pathlib-style target; written via ``write_text`` as UTF-8.
        results: role dicts with the keys company, title, location, url, posted, score,
            pos, neg, in_ch, remote and is_new, already sorted by the caller.
        errors: ``(company, message)`` pairs listed in an "Errors" section.
        new_only: only adds "(new only)" to the heading; the filtering itself is done by
            the caller.
        include_weak: also list roles scoring below 2 (otherwise only their count is shown).
        stats: optional per-company stats for write_stats_table.
        total_secs: overall run time shown in the stats table.
        decisions: decision log keyed by job URL; decided roles are tagged inline.
        hide_decided: drop decided roles from the listing instead of tagging them.

    Layout: heading and totals, scan-stats table, errors, then the strong (score >= 6) and
    medium (2-5) buckets (plus the weak bucket on request), and finally the MANUAL_CHECK
    checklist when that list is non-empty.
    """
    decisions = decisions or {}
    # UTC, like the date main() uses for the report file name, so the heading and the
    # file name cannot disagree around midnight.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    n_match = sum(1 for r in results if r["score"] >= 2)
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Eligible (CH/remote): **{len(results)}** · interest matches (score ≥ 2): "
        f"**{n_match}** · **{n_new}** new since last run\n",
    ]
    if stats:
        lines += write_stats_table(stats, total_secs)
    if errors:
        lines.append("## Errors\n")
        for company, err in errors:
            lines.append(f"- **{company}**: {err}")
        lines.append("")

    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak = [r for r in results if r["score"] < 2]
    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")
    n_decided = sum(1 for r in results if r["url"] in decisions)
    if n_decided:
        mode = "hidden" if hide_decided else "tagged inline"
        lines.append(f"_{n_decided} role(s) already in the decision log ({mode}; "
                     f"see state/decisions.json)._")

    buckets = [("Strong fit (score >= 6)", strong),
               ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))
    for bucket_name, bucket in buckets:
        shown = [r for r in bucket if not (hide_decided and r["url"] in decisions)]
        if not shown:
            continue
        lines.append(f"\n## {bucket_name} - {len(shown)} role(s)\n")
        for r in shown:
            d = decisions.get(r["url"])
            # The log is a hand-editable JSON file: read the fields defensively so one
            # incomplete entry cannot abort the whole report.
            decision = str(d.get("decision", "")) if d else ""
            new_tag = " [NEW]" if r["is_new"] else ""
            decided_tag = f" — 🗂 {decision.upper()}" if d else ""
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}{decided_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if d:
                note = f" — {d['note']}" if d.get("note") else ""
                lines.append(f"- 🗂 Decision: **{decision}**{note} ({d.get('date','')})")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    if MANUAL_CHECK:
        lines.append("\n## Manual check (companies without scrapable APIs)\n")
        lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
        lines.append("Open each link, scan for new postings since your last quarterly review:\n")
        for name, note, url in MANUAL_CHECK:
            lines.append(f"- [ ] **{name}** — {note}: <{url}>")
        lines.append("")
    path.write_text("\n".join(lines), encoding="utf-8")
def main():
    """Command-line entry point: record a decision, or scan the boards and write a report.

    With ``--decide "<url>" <status> [note...]`` the decision is stored in the decision log
    and the function returns without scanning. Otherwise every COMPANIES entry (or only the
    one named by ``--only=<id>``) is fetched through its adapter, filtered to CH/remote
    locations, de-duplicated by title, scored, and written to ``REPORTS_DIR/<UTC date>.md``.
    A failing adapter is recorded as an error row and the run continues. Progress and the
    closing summary are printed to stderr.

    Flags: ``--new-only``, ``--include-weak``, ``--hide-decided``, ``--only=<id>``.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Record a decision and exit: --decide "<url>" <status> [note words...]
    if "--decide" in sys.argv:
        rest = sys.argv[sys.argv.index("--decide") + 1:]
        if len(rest) < 2:
            print('Usage: --decide "<url>" <status> [note...]', file=sys.stderr)
            return
        url, status, note = rest[0], rest[1], " ".join(rest[2:])
        decisions = load_decisions()
        # Re-deciding a URL keeps the company/title already stored for it.
        prev = decisions.get(url, {})
        decisions[url] = {"company": prev.get("company", ""), "title": prev.get("title", ""),
                          "decision": status, "note": note, "date": today}
        save_decisions(decisions)
        # The separator keeps the status and the URL from running together.
        print(f"Recorded: {status} - {url}", file=sys.stderr)
        return

    only, new_only, include_weak, hide_decided = None, False, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg == "--hide-decided":
            hide_decided = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]

    seen = load_seen()
    decisions = load_decisions()
    all_results, errors, stats = [], [], []
    run_start = time.perf_counter()
    try:
        for cid, display, adapter, args in COMPANIES:
            if only and cid != only:
                continue
            print(f"Fetching {display}...", file=sys.stderr)
            t0 = time.perf_counter()
            try:
                jobs = ADAPTERS[adapter](args)
            except Exception as e:
                # Network and parse failures are expected now and then; anything else is
                # labelled "unexpected". Either way the company gets an error row and the
                # run moves on to the next one.
                expected = isinstance(e, (urllib.error.URLError, ValueError))
                errors.append((display, repr(e) if expected else f"unexpected: {e!r}"))
                stats.append({"company": display, "scraped": 0, "eligible": 0,
                              "match": 0, "newest": None, "secs": time.perf_counter() - t0,
                              "error": True})
                continue
            scraped = len(jobs)
            # Optional per-company title prefilter for high-volume boards
            title_filter = args.get("_title_filter")
            if title_filter:
                jobs = [j for j in jobs
                        if any(_kw_in(k, (j.get("title") or "").lower()) for k in title_filter)]
            # Newest posting on the board (board freshness), across parseable dates.
            dates = [d for j in jobs if (d := _parse_posted(j.get("posted")))]
            newest = max(dates) if dates else None
            company_seen = seen.setdefault(cid, {})
            title_seen = set()
            eligible = match = 0
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                in_ch, is_remote = location_matches(j.get("location", ""))
                if not (in_ch or is_remote):
                    continue
                # Collapse the same role posted once per remote country (title differs only
                # by a "| Country | Remote" suffix): dedupe on the title before the first "|".
                norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
                if norm_title in title_seen:
                    continue
                title_seen.add(norm_title)
                eligible += 1
                is_new = jid not in company_seen
                score, pos, neg = score_job(j, title_only=bool(title_filter))
                # Pre-filtered boards (e.g. SBB, already narrowed to IT+Bern by the adapter)
                # carry German/generic titles the profile scorer can't read; a _score_floor
                # keeps their already-relevant results out of the hidden weak bucket.
                floor = args.get("_score_floor")
                if floor is not None and score < floor:
                    score = floor
                if score >= 2:
                    match += 1
                all_results.append({
                    "company": display, "company_id": cid,
                    "title": j["title"], "location": j["location"],
                    "url": j["url"], "posted": j.get("posted", ""),
                    "score": score, "pos": pos, "neg": neg,
                    "in_ch": in_ch, "remote": is_remote, "is_new": is_new,
                })
                # Refresh the stored title but keep the date the job was first seen;
                # only a job that is new in this run is stamped with today's date.
                first_seen = today
                if not is_new and isinstance(company_seen[jid], dict):
                    first_seen = company_seen[jid].get("first_seen") or today
                company_seen[jid] = {"title": j["title"], "first_seen": first_seen}
            stats.append({"company": display, "scraped": scraped, "eligible": eligible,
                          "match": match, "newest": newest,
                          "secs": time.perf_counter() - t0, "error": False})
        save_seen(seen)
    finally:
        # Shut the shared headless browser down even when the scan is interrupted or fails.
        _close_browser()
    total_secs = time.perf_counter() - run_start

    if new_only:
        all_results = [r for r in all_results if r["is_new"]]
    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))
    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak,
                 stats=stats, total_secs=total_secs,
                 decisions=decisions, hide_decided=hide_decided)
    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new) | "
          f"scanned {len(stats)} companies in {total_secs:.1f}s", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)
# === Adapter coverage (refreshed 2026-06-01) ==================================
# 25 companies automated across 13 adapter types; MANUAL_CHECK is empty.
#
# Automated (COMPANIES above):
# workday nvidia, novartis
# ashby kraken, openai, confluent
# greenhouse anthropic, gitlab, grafana
# pcsx microsoft (Eightfold position-search endpoint)
# smartrecruiters metgroup, ldc
# rss bis (vacancies.rss — RSS 1.0/RDF)
# getro coinbase_ventures (web3 portfolio network, collection 1625)
# onlyfy bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment)
# lever palantir, quantco (api.lever.co; QuantCo slug is "quantco-")
# json swissgrid (Magnolia /.rest/cloud/component-data)
# sbb sbb (company.sbb.ch AEM jobfilter.results.json)
# bkw bkw (jobs.bkw.com PMS structureddata API)
# playwright google, apple, meta, roche, cisco, ruag (headless browser, 3-15s each)
#
# 2026-06-01 list review (verified live):
# - Palantir (lever): 221 postings, US/London-heavy so Swiss/Schwyz roles are rare but
# self-surface (FDSE/Deployment-Strategist titles map to his FDE drafts).
# - Swissgrid (json): Magnolia CMS endpoint; placeOfWork is bare city, so loc_suffix tags
# it Switzerland for the CH filter. ~13 roles incl. Data Scientist / Applied-ML.
# - RUAG (playwright + page_param): Drupal portal, 20 jobs/page, paginated ?page=N. Page 0
# is apprenticeship-heavy; eng roles (DevOps/Data/Software) are on later pages, so we
# page through (max_pages). ENG_TITLE_FILTER cuts the Lehrstelle bulk. ⚠️ DE-citizen
# limits on RUAG classified roles — verify per-role.
# - SBB (sbb): correct host is company.sbb.ch (not company-jobs.sbb.ch). Flat JSON list;
# fetch_sbb replicates the user's IT + Bern-region filter. German/generic titles, so a
# _score_floor keeps the pre-filtered results visible. ⚠️ DE-citizen limits possible.
# - BKW (bkw): real host is jobs.bkw.com (PMS structureddata API), ~600 group-wide roles;
# fetch_bkw keeps Berufsfeld categories Informatik/Trading/Finanzen (IT/data + energy
# trading: Quant Risk, Solution Architect Energiehandel, ...). _score_floor as above.
# - QuantCo (lever, slug "quantco-"): ~16 roles, most tagged "Europe" (hybrid; Zürich is
# QuantCo's continental hub), surfaced via the EU-wide rule in location_matches. Strong:
# AI Engineer; medium: Cloud Engineer, AI Applied Scientist, Data Scientist, Quant
# Researcher, Software Engineer. Interns/frontend suppressed by NEGATIVE_KEYWORDS.
# The Bern/Thun tier intentionally relaxes the comp bar (see user_comp_bar memory).
#
# MANUAL_CHECK is empty — every target company is automated. Dropped 2026-06-01: BFH
# (academic FH pay below the relaxed Bern/Thun floor, research-leaning, 403s anyway) and
# Dialectic (~50-person crypto VC, 0 open roles; crypto already covered by Kraken / Bitcoin
# Suisse / Coinbase Ventures).
#
# Earlier history: Google/Apple/Meta/Roche/Cisco automated via playwright; Microsoft via
# pcsx; BIS via rss; Coinbase Ventures via getro; Bitcoin Suisse via onlyfy. Dropped:
# ClickHouse, Vitol, Sygnum (Glassdoor/comp red flags), IBM Research + Sonova (low fit),
# Coinbase-the-employer (hiring freeze), AMINA (poor Glassdoor), Canonical (pay+culture).
# The Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase itself.
# ==============================================================================
# Script entry point: start the scout when this file is run directly.
if __name__ == "__main__":
main()