8a5955c0a8
Track per-job decisions across runs so we don't re-evaluate roles. - state/decisions.json (keyed by URL: company/title/decision/note/date), now git-tracked while seen_jobs.json stays local - --decide "<url>" <status> [note] records a decision; --hide-decided gives an undecided-only view; report tags each role inline with its decision - usage docstring updated - seed 18 decisions (9 shortlist, 7 skip, 1 paused, 1 maybe); flags Google Staff FDE GenAI as the paused prior session Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1299 lines
60 KiB
Python
1299 lines
60 KiB
Python
"""Job scout for Dennis's quarterly target companies.
|
|
|
|
Pulls latest openings from companies via public ATS APIs (Workday/Ashby/Greenhouse/
|
|
SmartRecruiters/Lever/Eightfold/RSS) and, for JS-rendered careers sites, a headless-browser
|
|
(playwright) adapter. Filters by Swiss location or remote eligibility, scores fit against
|
|
profile keywords, tracks which job IDs we've already seen, writes a markdown report.
|
|
|
|
Usage:
|
|
py scout.py # Pull all configured companies (strong + medium only)
|
|
py scout.py --only=nvidia # Pull a single company by id
|
|
py scout.py --new-only # Report only jobs not seen before
|
|
py scout.py --include-weak # Include weak/noise bucket (default hidden)
|
|
py scout.py --hide-decided # Drop roles already in the decision log (undecided-only view)
|
|
py scout.py --decide "<url>" <status> [note...] # Record a decision and exit
|
|
# status is free-text: shortlist | skip | applied | paused | ...
|
|
|
|
State : state/seen_jobs.json (job IDs seen) · state/decisions.json (per-URL decisions)
|
|
Output: reports/YYYY-MM-DD.md (scan-stats table + scored roles, decisions tagged inline)
|
|
|
|
To add a company: append to COMPANIES with one of the existing adapter types. A few sites
|
|
resist scraping even headless and stay in MANUAL_CHECK (surfaced as a report checklist).
|
|
See the adapter-coverage notes at the bottom for the current automated/manual split.
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from functools import lru_cache
|
|
import urllib.error
|
|
import urllib.parse
|
|
import urllib.request
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
ROOT = Path(__file__).parent
|
|
STATE_FILE = ROOT / "state" / "seen_jobs.json"
|
|
DECISIONS_FILE = ROOT / "state" / "decisions.json"
|
|
REPORTS_DIR = ROOT / "reports"
|
|
USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)"
|
|
|
|
CH_LOCATION_KEYWORDS = [
|
|
"switzerland", "zurich", "zürich", "basel", "bern", "geneva", "genf",
|
|
"lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
|
|
]
|
|
|
|
REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]
|
|
|
|
US_ONLY_PATTERNS = [
|
|
"remote - us", "remote, us", "remote-us", "us remote", "us-remote",
|
|
"remote-friendly us", "remote (us)", "united states - remote",
|
|
"remote, united states",
|
|
]
|
|
|
|
EU_HINT_KEYWORDS = [
|
|
"germany", "france", "spain", "portugal", "ireland", "netherlands",
|
|
"sweden", "norway", "finland", "denmark", "poland", "czech",
|
|
"romania", "italy", "austria", "belgium", "uk", "united kingdom",
|
|
"europe", "emea", "global", "worldwide",
|
|
] + CH_LOCATION_KEYWORDS
|
|
|
|
POSITIVE_KEYWORDS = {
|
|
"genai": 3, "generative ai": 3, "llm": 3, "large language model": 3,
|
|
"applied ai": 3, "applied ml": 3, "ai engineer": 3, "ml engineer": 3,
|
|
"mlops": 3, "ai platform": 3, "ml platform": 3,
|
|
"python": 2, "java": 2, "data engineer": 2, "data engineering": 2,
|
|
# "data scientist" scored modestly (medium, not strong) — secondary to his data-eng/
|
|
# platform thesis, but the targeted band at boutiques like QuantCo (see target memory).
|
|
"data scientist": 2, "data science": 2,
|
|
"solutions architect": 2, "platform engineer": 2,
|
|
"ai infrastructure": 2, "inference": 2, "rag": 2, "agentic": 2,
|
|
"kubernetes": 1, "docker": 1, "etl": 1, "pipeline": 1,
|
|
# Core CV lane — DevOps / data-platform / cloud (was scoring 0; surfaced only via "senior")
|
|
"data platform": 3, "platform engineering": 2, "devops": 2,
|
|
"sre": 2, "site reliability": 2, "cloud engineer": 2, "cloud": 1,
|
|
"software engineer": 1,
|
|
# Technical-architect pivot targets (cloud/data/platform = build on his stack; rank above
|
|
# bare "solutions architect" pre-sales). Generic "architect" catches the long tail.
|
|
"cloud architect": 3, "data architect": 3, "platform architect": 3,
|
|
"enterprise architect": 2, "architect": 1,
|
|
"crypto": 2, "blockchain": 2, "web3": 2, "solidity": 3,
|
|
# Trading / quant-finance — explicit user interest (energy/finance/crypto trading)
|
|
"trading": 2, "trader": 2, "quant": 2, "quantitative": 2,
|
|
"market data": 2, "low latency": 2, "low-latency": 2, "fix protocol": 2,
|
|
"brokerage": 2, "commodity": 1, "execution": 1,
|
|
# "solutions architect" (plural) already scored above; add singular + adjacent stack
|
|
"solution architect": 2, "c#": 1, ".net": 1,
|
|
"senior": 1, "staff": 1, "lead": 1, "principal": 1,
|
|
}
|
|
|
|
NEGATIVE_KEYWORDS = {
|
|
"cuda": -3, "kernel driver": -3, "gpu programming": -3,
|
|
"compiler engineer": -3, "pytorch internals": -3, "jax internals": -3,
|
|
"rdma": -2, "infiniband": -2, "nccl": -3, "hpc cluster": -2,
|
|
"frontend": -3, "front-end": -3, "react native": -3,
|
|
"ios engineer": -3, "android engineer": -3, "mobile engineer": -3,
|
|
"ui engineer": -2, "ux engineer": -2,
|
|
"verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
|
|
"physical design": -3, "silicon": -2,
|
|
"expert c++": -2, "5+ years c++": -2, "deep c++": -2,
|
|
"intern": -5, "internship": -5, "graduate program": -3, "junior": -3,
|
|
}
|
|
|
|
# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
|
|
# post mostly non-tech roles). Only keep titles containing one of these specific role
|
|
# phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel"
|
|
# don't leak in. Matched as case-insensitive substrings against the title only.
|
|
ENG_TITLE_FILTER = [
|
|
"data engineer", "data engineering", "data platform", "platform engineer",
|
|
"data infrastructure", "data architect", "analytics engineer",
|
|
"mlops", "ml engineer", "ml platform", "machine learning engineer",
|
|
"site reliability", "sre", "backend engineer", "back-end engineer",
|
|
"devops engineer", "cloud engineer", "software engineer", "infrastructure engineer",
|
|
"kafka", "streaming", "big data", "quantitative developer", "quant developer",
|
|
]
|
|
|
|
# id, display, adapter, adapter_args
|
|
COMPANIES = [
|
|
("nvidia", "NVIDIA", "workday", {
|
|
"host": "nvidia.wd5.myworkdayjobs.com",
|
|
"tenant": "nvidia",
|
|
"site": "NVIDIAExternalCareerSite",
|
|
"search_text": "Switzerland",
|
|
}),
|
|
("kraken", "Kraken", "ashby", {"slug": "kraken.com"}),
|
|
("openai", "OpenAI", "ashby", {"slug": "openai"}),
|
|
("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
|
|
("novartis", "Novartis", "workday", {
|
|
"host": "novartis.wd3.myworkdayjobs.com",
|
|
"tenant": "novartis",
|
|
"site": "Novartis_Careers",
|
|
"search_text": "Switzerland",
|
|
}),
|
|
# PCSX (Eightfold) — Microsoft has a public position search endpoint
|
|
("microsoft", "Microsoft", "pcsx", {
|
|
"domain": "microsoft.com",
|
|
"location": "Switzerland",
|
|
}),
|
|
# --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
|
|
# Dropped: ClickHouse (Glassdoor 3.3, 36% recommend, toxic-culture flag — 2026-05).
|
|
("confluent", "Confluent", "ashby", {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
|
|
("gitlab", "GitLab", "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
|
|
("grafana", "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
|
|
# --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
|
|
# Dropped: Vitol (Glassdoor 3.5, 55% recommend, grueling-hours/toxic flag — 2026-05).
|
|
# Dropped: Sygnum (Glassdoor 3.4, 51% recommend, comp 2.3/5 — below 180k bar — 2026-05).
|
|
("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
|
|
("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
|
|
# International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
|
|
# Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
|
|
("bis", "BIS (Basel)","rss", {
|
|
"url": "https://www.bis.org/doclist/vacancies.rss",
|
|
"default_location": "Basel, Switzerland",
|
|
}),
|
|
# Coinbase Ventures web3 talent network (Getro collection 1625). Aggregates roles
|
|
# across portfolio companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself —
|
|
# see fetch_getro. CH-filtered + eng title-filtered to stay relevant.
|
|
("coinbase_ventures", "Coinbase Ventures (web3)", "getro", {
|
|
"collection": 1625,
|
|
"locations": ["Switzerland"],
|
|
"job_functions": ["Software Engineering", "IT", "Data Science"],
|
|
"_title_filter": ENG_TITLE_FILTER,
|
|
}),
|
|
# Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
|
|
# firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
|
|
("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
|
|
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
|
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
|
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
|
# aria-label link; location lives in the card text (captured via blob mode).
|
|
("google", "Google", "playwright", {
|
|
"url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
|
|
"wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
|
|
"card": "li:has(a[aria-label*='Learn more about'])",
|
|
"title_sel": "a[aria-label*='Learn more about']",
|
|
"title_sel_attr": "aria-label",
|
|
"title_strip_prefix": "Learn more about ",
|
|
"link_sel": "a[href*='jobs/results/']",
|
|
"link_attr": "href",
|
|
"url_prefix": "https://www.google.com/about/careers/applications/",
|
|
"default_location": "",
|
|
"scroll_count": 5,
|
|
"use_inner_text_as_blob": True,
|
|
"cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
|
|
}),
|
|
("apple", "Apple", "playwright", {
|
|
"url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
|
|
"wait_for": "a[href*='/en-us/details/']",
|
|
"card": "a[href*='/en-us/details/']",
|
|
"title_attr": "text",
|
|
"link_attr": "href",
|
|
"url_prefix": "https://jobs.apple.com",
|
|
"default_location": "Switzerland",
|
|
}),
|
|
# Meta job links are /profile/job_details/<id>; title + location are in the link text.
|
|
("meta", "Meta", "playwright", {
|
|
"url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
|
|
"wait_for": "a[href*='/profile/job_details/']",
|
|
"card": "a[href*='/profile/job_details/']",
|
|
"title_attr": "text",
|
|
"link_attr": "href",
|
|
"url_prefix": "https://www.metacareers.com",
|
|
"default_location": "Zurich, Switzerland",
|
|
"scroll_count": 5,
|
|
"use_inner_text_as_blob": True,
|
|
}),
|
|
# PhenomPeople pattern (Roche) uses li.jobs-list-item.
|
|
# Card inner text is structured like: "<title> | Location | <city, country> | Category | ..."
|
|
# We extract title from first line, full text becomes the "description" so our location
|
|
# filter still sees Switzerland mentions.
|
|
("roche", "Roche", "playwright", {
|
|
"url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
|
|
"wait_for": "li.jobs-list-item, a.au-target",
|
|
"card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
|
|
"title_attr": "text",
|
|
"link_sel": "a[href]",
|
|
"link_attr": "href",
|
|
"url_prefix": "https://careers.roche.com",
|
|
"default_location": "",
|
|
"cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
|
|
"scroll_count": 6,
|
|
"use_inner_text_as_blob": True,
|
|
}),
|
|
# Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
|
|
("cisco", "Cisco", "playwright", {
|
|
"url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
|
|
"wait_for": "a[href*='/job/'], div[role='listitem']",
|
|
"card": "div[role='listitem']:has(a[href*='/job/'])",
|
|
"title_sel": "a[href*='/job/']",
|
|
"link_sel": "a[href*='/job/']",
|
|
"link_attr": "href",
|
|
"url_prefix": "https://careers.cisco.com",
|
|
"default_location": "Switzerland",
|
|
"cookie_accept": ["#onetrust-accept-btn-handler"],
|
|
"scroll_count": 5,
|
|
"use_inner_text_as_blob": True,
|
|
}),
|
|
# --- Zürich/Zug high-comp additions (2026-05-31 list review) ---
|
|
# Palantir (Lever). Verified: 221 postings on the public board. It's US/London-heavy, so
|
|
# Swiss/Schwyz roles are rare but self-surface when posted (the location filter drops the
|
|
# US/London bulk). No title filter: his target titles (Forward Deployed Software Engineer,
|
|
# Deployment Strategist) aren't in ENG_TITLE_FILTER, so filtering would hide them.
|
|
("palantir", "Palantir", "lever", {"slug": "palantir"}),
|
|
# QuantCo (Lever — note the trailing-hyphen slug "quantco-"). ~16 roles, most tagged
|
|
# "Europe" (hybrid); QuantCo's continental hub is Zürich, so the EU-wide rule in
|
|
# location_matches surfaces them. No title filter: the target band is DS/Quant/AI/Cloud
|
|
# (see comp analysis), which ENG_TITLE_FILTER would drop; interns/frontend are caught by
|
|
# NEGATIVE_KEYWORDS instead.
|
|
("quantco", "QuantCo", "lever", {"slug": "quantco-"}),
|
|
# --- Bern/Thun local tier — WLB & proximity exception (comp bar relaxed; 2026-06-01) ---
|
|
# Wired after live endpoint discovery. ⚠️ German citizen: RUAG classified work may require
|
|
# Swiss citizenship — verify per-role before tailoring (see project_target_companies).
|
|
# Swissgrid (Aarau): Magnolia CMS JSON endpoint (verified). placeOfWork is a bare city
|
|
# (Aarau/Prilly/...), so loc_suffix tags it Switzerland for the CH filter. No title filter
|
|
# (small board ~13 roles; lets Data Scientist / Applied-ML roles surface).
|
|
("swissgrid", "Swissgrid (Aarau)", "json", {
|
|
"url": "https://www.swissgrid.ch/.rest/cloud/component-data?path=%2Fswissgrid%2Fen%2Fhome%2Fcareer%2Fjobs%2Fmain%2Fjoblist_transferred_11",
|
|
"jobs_key": "jobs",
|
|
"field_title": "title", "field_location": "placeOfWork",
|
|
"field_url": "descriptionUrl", "field_date": "onlineSince",
|
|
"loc_suffix": " Switzerland",
|
|
"desc_keys": ["department", "typeOfEmployment", "entryLevel"],
|
|
}),
|
|
# RUAG (Thun/Bern/Emmen). Jobs render on the portal as anchors to jobs.ruag.ch; the first
|
|
# line of each anchor is the title. All sites are Swiss, so default_location=Switzerland
|
|
# passes the CH filter. ENG_TITLE_FILTER cuts the apprenticeship/Lehrstelle bulk.
|
|
# Drupal portal: 20 jobs/page, server-rendered, paginated via ?page=N (0-indexed). The
|
|
# first page is apprenticeship-heavy; eng roles (DevOps/Data/Cloud) are on later pages,
|
|
# so we page through until a page adds nothing new (~5-6 pages).
|
|
("ruag", "RUAG (Thun/Bern)", "playwright", {
|
|
"url": "https://www.ruag.ch/en/working-us/job-portal",
|
|
"wait_for": "a[href*='/offene-stellen/']",
|
|
"card": "a[href*='/offene-stellen/']",
|
|
"title_attr": "text",
|
|
"link_attr": "href",
|
|
"default_location": "Switzerland",
|
|
"scroll_count": 1,
|
|
"page_param": "page",
|
|
"max_pages": 10,
|
|
"_title_filter": ENG_TITLE_FILTER,
|
|
}),
|
|
# SBB (company.sbb.ch — the correct host; company-jobs.sbb.ch was wrong). AEM job filter
|
|
# served as a flat JSON list; the fetch_sbb adapter replicates the user's IT + Bern-region
|
|
# filter. German/generic titles, so _score_floor keeps the pre-filtered results visible.
|
|
# ⚠️ DE-citizen limits may apply to some SBB security/critical-infra roles.
|
|
("sbb", "SBB", "sbb", {
|
|
"topic": "IT / Telekommunikation",
|
|
"region": "Bern Mittelland",
|
|
"_score_floor": 2,
|
|
}),
|
|
# BKW Group (jobs.bkw.com — the real ATS host). PMS structured-data API; ~600 roles
|
|
# group-wide, so fetch_bkw keeps only Berufsfeld categories Informatik/Trading/Finanzen
|
|
# (IT/data + energy-trading, incl. the flagged Energiehandel roles). German/generic
|
|
# titles, so _score_floor keeps the pre-filtered set visible.
|
|
("bkw", "BKW (Bern)", "bkw", {"_score_floor": 2}),
|
|
]
|
|
|
|
# Companies where adapter probing did not yield a reliable scrape stay here, with the
# reason noted per entry. They surface as a clickable checklist in the report so they're
# not forgotten. Currently empty — every target company is automated.
# (Dropped 2026-06-01: BFH — academic FH pay below even the relaxed Bern/Thun floor,
# research-leaning, 403s anyway; Dialectic — ~50-person crypto VC, 0 open roles, crypto
# angle already covered by Kraken/Bitcoin Suisse/Coinbase Ventures.)
|
|
MANUAL_CHECK = []
|
|
|
|
|
|
def http_get_json(url, headers=None, data=None, method="GET"):
    """Perform one HTTP request and return the response body decoded as JSON.

    Args:
        url: URL to request.
        headers: Optional mapping of request headers. It is copied before defaults are
            added, so the caller's mapping is left untouched. ``User-Agent``
            (USER_AGENT) and ``Accept: application/json`` are set only when the caller
            did not supply them.
        data: Optional request body. A ``dict`` is serialised to UTF-8 JSON and
            ``Content-Type: application/json`` is defaulted; any other value is passed
            to ``urllib.request.Request`` unchanged.
        method: HTTP method name, ``"GET"`` by default.

    Returns:
        The parsed JSON document (whatever ``json.loads`` yields).

    Raises:
        urllib.error.URLError: (or its subclass ``HTTPError``) if the request fails.
        ValueError: if the body is not valid UTF-8 or not valid JSON.
    """
    # Work on a copy: the earlier in-place setdefault() calls leaked the default
    # headers into whatever dict the caller passed in.
    headers = dict(headers or {})
    headers.setdefault("User-Agent", USER_AGENT)
    headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    # Blocking call with a 30-second timeout.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))
|
|
|
|
|
|
def fetch_workday(args):
    """Fetch all postings from a Workday CXS job board, 20 per request.

    Args:
        args: Adapter config with ``host``, ``tenant`` and ``site`` (required) and an
            optional ``search_text`` that is sent as the board's ``searchText``.

    Returns:
        A list of job dicts with keys ``id``, ``title``, ``location``, ``url``,
        ``posted`` and ``description`` (always empty for this adapter). ``location``
        is ``locationsText`` followed by the posting's ``externalPath``.

    Raises:
        KeyError: if a required config key is missing.
        Whatever ``http_get_json`` raises on a failed request.
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    jobs, offset, total = [], 0, 0
    while True:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {}, "limit": 20, "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath", "")
            bullets = p.get("bulletFields") or []
            jobs.append({
                # First bullet field when present, otherwise the external path.
                "id": bullets[0] if bullets else ext,
                "title": p.get("title", ""),
                "location": (p.get("locationsText") or "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # Remember the largest total seen instead of trusting the current page.
        # NOTE(review): some Workday boards reportedly send the real total only with
        # the first page (0 afterwards); with the old per-page check that ended the
        # scan after page two. An empty page still terminates the loop.
        total = max(total, data.get("total") or 0)
        offset += len(postings)
        if not postings or offset >= total:
            break
    return jobs
|
|
|
|
|
|
def fetch_ashby(args):
    """Fetch the postings of an Ashby-hosted job board.

    Args:
        args: Adapter config; ``slug`` (required) selects the board.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``, ``department``). ``location`` is the primary location and all
        secondary locations joined with ``" | "``; ``description`` is the plain-text
        description cut to 2500 characters.
    """
    slug = args["slug"]
    board = http_get_json(
        f"https://api.ashbyhq.com/posting-api/job-board/{slug}?includeCompensation=true"
    )

    def to_job(posting):
        # Primary location first, then each secondary one (dict entries contribute
        # their "location" value, anything else its string form).
        place_names = [posting.get("location", "") or ""]
        for extra in posting.get("secondaryLocations", []) or []:
            if isinstance(extra, dict):
                place_names.append(extra.get("location", ""))
            else:
                place_names.append(str(extra))
        return {
            "id": posting.get("id"),
            "title": posting.get("title", ""),
            "location": " | ".join(place_names),
            "url": posting.get("jobUrl"),
            "posted": posting.get("publishedAt", ""),
            "description": (posting.get("descriptionPlain") or "")[:2500],
            "department": posting.get("department", ""),
        }

    return [to_job(posting) for posting in board.get("jobs", [])]
|
|
|
|
|
|
def fetch_greenhouse(args):
    """Fetch the postings of a Greenhouse job board, including descriptions.

    Args:
        args: Adapter config; ``board`` (required) is the Greenhouse board token.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``). ``location`` combines the posting's location name with its
        office names; ``description`` is the posting content reduced to plain text
        and cut to 2500 characters.
    """
    import html as _html

    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs", []):
        loc = (j.get("location") or {}).get("name", "")
        offices = j.get("offices") or []
        office_names = " | ".join(
            (o.get("name") or "") for o in offices if isinstance(o, dict)
        )
        loc_blob = " ".join(x for x in [loc, office_names] if x)
        # Greenhouse delivers `content` HTML-escaped ("&lt;p&gt;..."), so the markup
        # has to be unescaped before the tag-stripping regex can match anything.
        # Previously the escaped tags survived and used up the 2500-char budget.
        desc = _html.unescape(j.get("content", "") or "")
        desc = re.sub(r"<[^>]+>", " ", desc)
        # Second pass turns entities that were part of the HTML itself
        # (e.g. "&amp;nbsp;" -> "&nbsp;" -> non-breaking space) into characters.
        desc = _html.unescape(desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            "description": desc[:2500],
        })
    return jobs
|
|
|
|
|
|
def fetch_pcsx(args):
    """Query the Eightfold PCSX position-search API on apply.careers.microsoft.com.

    The host is fixed to Microsoft's careers site; per the original note, other
    PCS-hosted boards use the same endpoint pattern.

    Args:
        args: Adapter config with ``domain`` (required) and an optional ``location``
            search string.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``). Results are requested 50 at a time; paging stops at the
        first short page or once 500 positions have been requested.
    """
    domain = args["domain"]
    quoted_location = urllib.parse.quote(args.get("location", ""))
    base = "https://apply.careers.microsoft.com/api/pcsx/search"
    referer = f"https://apply.careers.microsoft.com/careers?location={quoted_location}"

    jobs = []
    start = 0
    while start < 500:
        payload = http_get_json(
            f"{base}?domain={domain}&query=&location={quoted_location}&start={start}&num=50",
            headers={"Referer": referer},
        )
        positions = (payload.get("data") or {}).get("positions", []) or []
        for position in positions:
            job_ref = position.get('displayJobId') or position.get('id')
            jobs.append({
                "id": str(position.get("id")),
                "title": position.get("name", ""),
                "location": " | ".join(position.get("locations") or []),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{job_ref}",
                "posted": position.get("postedTs", ""),
                "description": (position.get("description") or "")[:2000],
            })
        # A page shorter than the requested 50 (including an empty one) is the last.
        if len(positions) < 50:
            break
        start += len(positions)
    return jobs
|
|
|
|
|
|
def fetch_smartrecruiters(args):
    """Fetch postings from the SmartRecruiters public postings API.

    Args:
        args: Adapter config; ``company`` (required) is the SmartRecruiters company id.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``). ``location`` is the full location (or city) plus
        ``Remote``/``Hybrid`` markers; ``description`` holds the department and
        function labels. Pages of 100 are read until the reported total or 300
        postings are reached.
    """
    company = args["company"]
    endpoint = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"

    def label_of(posting, key):
        # Only dict-shaped entries carry a label; anything else counts as empty.
        node = posting.get(key)
        return node.get("label", "") if isinstance(node, dict) else ""

    jobs = []
    offset = 0
    while True:
        page = http_get_json(f"{endpoint}?limit=100&offset={offset}")
        content = page.get("content", []) or []
        for posting in content:
            where = posting.get("location") or {}
            pieces = [
                where.get("fullLocation") or where.get("city") or "",
                "Remote" if where.get("remote") else "",
                "Hybrid" if where.get("hybrid") else "",
            ]
            labels = [label_of(posting, "department"), label_of(posting, "function")]
            jobs.append({
                "id": str(posting.get("id")),
                "title": posting.get("name", ""),
                "location": " ".join(piece for piece in pieces if piece),
                "url": f"https://jobs.smartrecruiters.com/{company}/{posting.get('id')}",
                "posted": posting.get("releasedDate", ""),
                "description": " ".join(filter(None, labels)),
            })
        total = page.get("totalFound", 0)
        offset += len(content)
        more_to_read = content and offset < total and offset < 300
        if not more_to_read:
            break
    return jobs
|
|
|
|
|
|
def fetch_rss(args):
    """Parse a vacancy feed in RSS 1.0 (RDF) or plain RSS 2.0 form.

    BIS publishes vacancies as RSS 1.0, whose <item> elements live in the
    http://purl.org/rss/1.0/ namespace; un-namespaced RSS 2.0 <item> elements are the
    fallback. The feed carries no location, so ``default_location`` supplies it.

    Args:
        args: Adapter config with ``url`` (required) and ``default_location``
            (optional, used verbatim as every job's location).

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``). ``id`` is the item link, or the title when there is no
        link; ``description`` has tags stripped and is cut to 1500 characters.

    Raises:
        urllib.error.URLError: if the feed cannot be fetched.
        xml.etree.ElementTree.ParseError: if the body is not well-formed XML.
    """
    import xml.etree.ElementTree as ET

    req = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        # NOTE(review): ElementTree is not hardened against malicious XML; this
        # assumes feed URLs are trusted, hand-configured sources.
        root = ET.fromstring(resp.read())
    ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"}
    items = root.findall(".//rss1:item", ns) or root.findall(".//item")

    def field(item, tag):
        # Prefer the RSS 1.0 namespaced child, then the bare (RSS 2.0) one.
        el = item.find(f"rss1:{tag}", ns)
        if el is None:
            el = item.find(tag)
        return el.text.strip() if el is not None and el.text else ""

    jobs = []
    for it in items:
        link = field(it, "link")
        title = field(it, "title")
        # Dublin Core date first (RSS 1.0). RSS 2.0 names its date element
        # <pubDate>, which the previous <date>-only fallback never found.
        posted = (
            it.findtext("dc:date", default="", namespaces=ns)
            or field(it, "date")
            or field(it, "pubDate")
        )
        jobs.append({
            "id": link or title,
            "title": title,
            "location": args.get("default_location", ""),
            "url": link,
            "posted": posted,
            "description": re.sub(r"<[^>]+>", " ", field(it, "description"))[:1500],
        })
    return jobs
|
|
|
|
|
|
def fetch_wp_ajax(args):
    """Read a WordPress admin-ajax style endpoint that returns a JSON list of jobs.

    The original note names Sygnum as a user of this pattern.

    Args:
        args: Adapter config; ``url`` (required) is the endpoint, and also the
            fallback job link when an entry has no ``application_url``.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``), or an empty list when the response is not a JSON list.
        ``id`` is ``"<title>|<location>"`` cut to 120 characters because the payload
        is not read for any other identifier.
    """
    endpoint = args["url"]
    payload = http_get_json(endpoint)
    if not isinstance(payload, list):
        return []

    def join_present(*values):
        # Space-join the non-empty values.
        return " ".join(filter(None, values))

    return [
        {
            "id": (row.get("title", "") + "|" + row.get("location", ""))[:120],
            "title": row.get("title", ""),
            "location": join_present(row.get("location", ""), row.get("work_type", "")),
            "url": row.get("application_url") or endpoint,
            "posted": "",
            "description": join_present(row.get("department", ""), row.get("role_type", "")),
        }
        for row in payload
    ]
|
|
|
|
|
|
def fetch_getro(args):
    """Search a Getro network job board (POST JSON API).

    Getro powers VC portfolio talent networks; here it is the Coinbase Ventures web3
    network (collection 1625). The board lists roles across all portfolio companies
    (Notion, Ashby, VALR, World, ...), not Coinbase itself, so the organisation name
    is folded into each title.

    Args:
        args: Adapter config with ``collection`` (required) plus optional
            ``locations`` and ``job_functions`` lists, sent as the server-side
            filters ``searchable_locations`` and ``job_functions``.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``). At most 10 pages of 100 hits are read.
    """
    collection = args["collection"]
    url = f"https://api.getro.com/api/v2/collections/{collection}/search/jobs"
    # Only filters that are actually configured are sent.
    filters = {
        api_name: args[arg_name]
        for api_name, arg_name in (
            ("searchable_locations", "locations"),
            ("job_functions", "job_functions"),
        )
        if args.get(arg_name)
    }

    def created_day(stamp):
        # created_at is treated as epoch seconds; non-numeric values give "".
        if not isinstance(stamp, (int, float)):
            return ""
        return datetime.fromtimestamp(stamp, tz=timezone.utc).strftime("%Y-%m-%d")

    jobs = []
    for page in range(10):
        payload = http_get_json(url, method="POST", data={
            "hitsPerPage": 100, "page": page, "query": "", "filters": filters,
        })
        results = payload.get("results", {}) or {}
        batch = results.get("jobs", []) or []
        for hit in batch:
            org = (hit.get("organization") or {}).get("name", "")
            places = hit.get("searchable_locations") or hit.get("locations") or []
            if isinstance(places, list):
                location = " | ".join(places)
            else:
                location = str(places)
            posted = created_day(hit.get("created_at"))
            title = hit.get("title", "")
            if org:
                title = f"{title} @ {org}"
            jobs.append({
                "id": str(hit.get("id")),
                "title": title,
                "location": location,
                "url": hit.get("url", ""),
                "posted": posted,
                "description": " ".join(filter(None, [org] + (hit.get("skills") or []))),
            })
        total = results.get("count", 0)
        if not batch or len(jobs) >= total:
            break
    return jobs
|
|
|
|
|
|
def fetch_onlyfy(args):
    """Scrape an onlyfy.jobs board (XING E-Recruiting / ex-Prinzip), e.g. Bitcoin Suisse.

    The candidate/job/ajax_list endpoint answers with an HTML fragment listing the
    postings. Each card has an ``<a href="/job/ID">title</a>`` link and a location
    cell marked by an icon-map-marker; both are collected in document order and
    paired positionally. No JSON API and no headless browser are needed.

    Args:
        args: Adapter config; ``slug`` (required) is the board's subdomain.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``); ``description`` repeats the location text.
    """
    import html as _html

    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    listing_url = (
        f"{base}/candidate/job/ajax_list"
        "?display_length=100&page=1&sort=date&sort_dir=DESC&search="
    )
    request = urllib.request.Request(listing_url, headers={
        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(request, timeout=30) as resp:
        fragment = resp.read().decode("utf-8", "replace")

    title_links = re.findall(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', fragment, re.S)
    location_cells = re.findall(r'icon-map-marker[^>]*></i>\s*([^<]+)', fragment)

    def build(link, raw_location):
        href, raw_title = link
        place = _html.unescape(raw_location).strip()
        return {
            "id": href.rsplit("/", 1)[-1],
            # Drop any markup nested inside the anchor before unescaping entities.
            "title": _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip(),
            "location": place,
            "url": base + href,
            "posted": "",
            "description": place,
        }

    # zip() pairs the n-th link with the n-th location and stops at the shorter list.
    return [build(link, cell) for link, cell in zip(title_links, location_cells)]
|
|
|
|
|
|
def fetch_lever(args):
    """Fetch postings from the Lever public postings API (used for Palantir).

    The Palantir board is US/London-heavy; Swiss roles are rare but surface when
    posted, and downstream location filtering drops the rest.

    Args:
        args: Adapter config; ``slug`` (required) is the Lever board name.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``). ``location`` joins ``categories.location`` with every
        ``categories.allLocations`` entry so multi-location postings are covered;
        ``posted`` is ``createdAt`` (epoch milliseconds) as a UTC ``YYYY-MM-DD`` date.
    """
    slug = args["slug"]
    postings = http_get_json(f"https://api.lever.co/v0/postings/{slug}?mode=json")

    def created_day(stamp_ms):
        if not isinstance(stamp_ms, (int, float)):
            return ""
        moment = datetime.fromtimestamp(stamp_ms / 1000, tz=timezone.utc)
        return moment.strftime("%Y-%m-%d")

    jobs = []
    for posting in postings:
        categories = posting.get("categories") or {}
        place_names = [categories.get("location") or ""]
        place_names.extend(str(extra) for extra in categories.get("allLocations") or [])
        posted = created_day(posting.get("createdAt"))
        jobs.append({
            "id": posting.get("id"),
            "title": posting.get("text", ""),
            "location": " | ".join(name for name in place_names if name),
            "url": posting.get("hostedUrl"),
            "posted": posted,
            "description": (posting.get("descriptionPlain") or "")[:2500],
        })
    return jobs
|
|
|
|
|
|
def fetch_json(args):
    """Read a generic JSON jobs endpoint whose field names are configurable.

    Meant for employer sites exposing a clean public endpoint. Verified use:
    Swissgrid (Magnolia CMS /.rest/cloud/component-data, shaped
    {config, jobs:[...], filters}).

    Args:
        args: Adapter config:
            url         -- endpoint to fetch (required).
            jobs_key    -- key holding the job list when the response is an object
                           (default "jobs"); a top-level list is used directly.
            field_title / field_location / field_url / field_date
                        -- per-job field names (defaults title/location/url/date).
            url_prefix  -- prepended to job URLs that do not start with "http".
            loc_suffix  -- appended to every location, e.g. ' Switzerland', so the
                           CH location filter matches city-only values such as
                           "Aarau"/"Prilly".
            desc_keys   -- extra fields folded into the description for keyword
                           scoring.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``); ``description`` is cut to 500 characters and ``id`` falls
        back to the job URL when the entry has no ``id``.
    """
    payload = http_get_json(args["url"])
    if isinstance(payload, dict):
        rows = payload.get(args.get("jobs_key", "jobs"), [])
    else:
        rows = payload or []

    title_key = args.get("field_title", "title")
    location_key = args.get("field_location", "location")
    url_key = args.get("field_url", "url")
    date_key = args.get("field_date", "date")
    prefix = args.get("url_prefix", "")
    suffix = args.get("loc_suffix", "")
    desc_keys = args.get("desc_keys", [])

    def absolute(link):
        # Relative links are joined onto the prefix with exactly one slash.
        if link and prefix and not link.startswith("http"):
            return prefix.rstrip("/") + "/" + link.lstrip("/")
        return link

    jobs = []
    for row in rows:
        link = absolute(row.get(url_key, "") or "")
        extras = [str(row.get(key)) for key in desc_keys if row.get(key)]
        jobs.append({
            "id": str(row.get("id") or link),
            "title": row.get(title_key, ""),
            "location": (row.get(location_key, "") or "").strip() + suffix,
            "url": link,
            "posted": row.get(date_key, "") or "",
            "description": " ".join(extras)[:500],
        })
    return jobs
|
|
|
|
|
|
def fetch_sbb(args):
    """Fetch SBB (company.sbb.ch) jobs and keep those matching a topic and region.

    The whole board is served as a flat JSON list at .../jobfilter.results.json; the
    website filters client-side on each job's numbered ``attributes``:
    '20' = Berufsfeld/topic, '110' = region, '100' = city. ``links.directlink`` is
    the jobs.sbb.ch URL. This adapter replicates the IT + Bern-region filter. Titles
    are German/generic and won't match ENG_TITLE_FILTER or the keyword scorer, so
    the company is paired with a ``_score_floor`` in COMPANIES.

    Args:
        args: Adapter config, all optional:
            url    -- results endpoint (defaults to the company.sbb.ch job filter).
            topic  -- substring that must occur in the job's attribute values
                      (default "IT / Telekommunikation"); empty disables the check.
            region -- same, for the region (default "Bern Mittelland").

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``, ``posted``,
        ``description``). ``location`` is "<city> <region> Schweiz"; ``description``
        is the topic plus the job text, cut to 400 characters.
    """
    url = args.get("url", ("https://company.sbb.ch/content/internet/corporate/de/"
                           "jobs-karriere/jobs/job-suche/jcr:content/parmain/"
                           "jobfilter.results.json"))
    topic = args.get("topic", "IT / Telekommunikation")
    region = args.get("region", "Bern Mittelland")
    data = http_get_json(url)
    arr = data if isinstance(data, list) else (data.get("results") or data.get("jobs") or [])

    def attr_text(attrs, key):
        # Attribute values are read as a list or a single scalar, like the filter
        # blob below. Joining a bare string directly would space-separate its
        # characters ("B e r n"), and non-string items would raise, so normalise.
        raw = attrs.get(key)
        if not raw:
            return ""
        items = raw if isinstance(raw, list) else [raw]
        return " ".join(str(item) for item in items if item is not None)

    jobs = []
    for j in arr:
        a = j.get("attributes", {}) or {}
        # Flatten every attribute value into one string for the substring filters.
        blob = " ".join(str(x) for v in a.values() for x in (v if isinstance(v, list) else [v]))
        if topic and topic not in blob:
            continue
        if region and region not in blob:
            continue
        region_v = attr_text(a, "110")
        city_v = attr_text(a, "100")
        field_v = attr_text(a, "20")
        jobs.append({
            "id": str(j.get("id") or j.get("viewkey") or ""),
            "title": j.get("title", ""),
            # Trailing "Schweiz" keeps the CH location filter matching any city.
            "location": f"{city_v} {region_v} Schweiz".strip(),
            "url": (j.get("links") or {}).get("directlink", ""),
            "posted": j.get("start_date", "") or "",
            "description": (field_v + " " + (j.get("text", "") or ""))[:400],
        })
    return jobs
|
|
|
|
|
|
def fetch_bkw(args):
    """Fetch BKW Group roles (jobs.bkw.com PMS structured-data API), filtered by Berufsfeld.

    The group-wide board is ~600 roles, mostly building-tech / electrical / civil-engineering
    trades. Only roles whose Berufsfeld category contains one of the allowlisted names
    (default Informatik / Trading / Finanzen) are kept, which surfaces IT/data plus the
    energy-trading roles. Location comes from the first entry of `locations[].address`
    (city + country), defaulting to "Schweiz". Results are pre-filtered and carry
    German/generic titles, so the company is paired with a _score_floor in COMPANIES.

    Args:
        args: adapter config dict; optional "url" (endpoint) and "categories" (allowlist).

    Returns:
        A list of job dicts with the keys id, title, location, url, posted, description.
    """
    default_endpoint = ("https://jobs.bkw.com/_api/v1/structureddata?"
                        "configFromContentElement=82381&language=de-ch")
    endpoint = args.get("url", default_endpoint)
    wanted = [name.lower() for name in args.get("categories", ["Informatik", "Trading", "Finanzen"])]
    payload = http_get_json(endpoint)

    # The payload is either the record list itself or a dict wrapping it; in the dict case
    # take the first list value whose first element is a dict with a "title".
    records = payload if isinstance(payload, list) else []
    if not records and isinstance(payload, dict):
        records = next(
            (value for value in payload.values()
             if isinstance(value, list) and value
             and isinstance(value[0], dict) and "title" in value[0]),
            [],
        )

    found = []
    for record in records:
        kind = record.get("type")
        if kind and kind != "jobs":
            continue

        relations = record.get("relations", {}) or {}
        categories = [entry.get("title", "") for entry in relations.get("Berufsfeld", []) or []]
        if wanted and not any(term in category.lower()
                              for category in categories for term in wanted):
            continue

        locations = record.get("locations") or []
        first = locations[0] if locations else None
        address = (first.get("address") if isinstance(first, dict) else {}) or {}
        place_parts = [address.get("city", ""), address.get("country", "")]
        place = " ".join(part for part in place_parts if part) or "Schweiz"

        subtitle = record.get("subtitle", "") or ""
        found.append({
            "id": str(record.get("id") or record.get("url") or ""),
            "title": record.get("title", ""),
            "location": place,
            "url": record.get("url", ""),
            "posted": "",
            "description": " ".join(categories + [subtitle])[:300],
        })
    return found
|
|
|
|
|
|
# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# What the script overrides:
#   - navigator.webdriver          -> getter returning undefined
#   - window.chrome                -> stub object (runtime / loadTimes / csi / app)
#   - navigator.plugins            -> getter returning a 5-element array
#   - navigator.languages          -> getter returning ['en-US', 'en', 'de']
#   - navigator.permissions.query  -> 'notifications' resolves with Notification.permission;
#     every other query is forwarded to the native method. The native method must be
#     invoked with navigator.permissions as `this` (hence .call); calling the saved
#     reference bare fails with "Illegal invocation".
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _q = navigator.permissions && navigator.permissions.query;
if (_q) {
  navigator.permissions.query = (p) => p && p.name === 'notifications'
    ? Promise.resolve({state: Notification.permission}) : _q.call(navigator.permissions, p);
}
"""
|
|
|
|
# Process-wide Playwright handles ("pw" = the started driver, "browser" = the launched
# Chromium). Both stay None until _get_browser() is first called.
_playwright_singleton = {"pw": None, "browser": None}


def _get_browser():
    """Lazy-init a single shared headless browser. Saves ~3s per company.

    Returns:
        The cached browser if one was already launched, otherwise a freshly launched
        headless Chromium (which is then cached in _playwright_singleton).

    Raises:
        RuntimeError: if the playwright package is not importable.
        Exception: whatever the browser launch raises; the Playwright driver started for
            that attempt is stopped first so it is not left running.
    """
    cached = _playwright_singleton["browser"]
    if cached is not None:
        return cached
    try:
        # Imported lazily so the API-only adapters work without playwright installed.
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
    pw = sync_playwright().start()
    try:
        browser = pw.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
    except Exception:
        # Nothing has been cached yet, so nobody else could ever stop this driver.
        try:
            pw.stop()
        except Exception:
            pass  # best effort; the launch error below is the one worth reporting
        raise
    _playwright_singleton["pw"] = pw
    _playwright_singleton["browser"] = browser
    return browser
|
|
|
|
|
|
def _absolutize(href, prefix):
    """Turn a relative href into an absolute URL by appending it to `prefix`.

    An empty/None href, an href that already starts with "http", or a missing prefix
    all return the href unchanged. Otherwise any leading "." and "/" characters are
    dropped from the href and the remainder is appended to the prefix with exactly one
    "/" between them.
    """
    needs_no_join = not href or href.startswith("http")
    if needs_no_join or not prefix:
        return href
    # lstrip("./") removes every leading "." and "/" character in one pass.
    relative = href.lstrip("./")
    return f"{prefix.rstrip('/')}/{relative}"
|
|
|
|
|
|
def _close_browser():
    """Shut down the shared browser and Playwright driver, if they were started.

    Both slots of _playwright_singleton are reset to None before the handles are closed,
    so a later lookup never hands out a closed browser and calling this function twice
    is harmless. Shutdown is best effort: errors raised by close()/stop() are ignored
    because this runs at the end of a scan, when there is nothing left to recover.
    """
    browser = _playwright_singleton["browser"]
    pw = _playwright_singleton["pw"]
    _playwright_singleton["browser"] = None
    _playwright_singleton["pw"] = None
    if browser:
        try:
            browser.close()
        except Exception:
            pass  # deliberate: still try to stop the driver below
    if pw:
        try:
            pw.stop()
        except Exception:
            pass  # deliberate best-effort cleanup
|
|
|
|
|
|
def fetch_playwright(args):
    """Generic headless-browser scraper. See COMPANIES entries for selector args.

    Opens args["url"] in a fresh browser context (with STEALTH_JS installed), optionally
    clicks cookie-banner buttons, then turns every element matching args["card"] into a
    job dict. With "page_param" set, follow-up pages "<url>?<page_param>=N" (N = 1, 2, ...)
    are loaded until a page contributes no new card or "max_pages" is reached.

    Args read from `args`:
        url, card: required - page to load and the selector of one job card.
        wait_for: selector to await before scraping (falls back to a fixed 4 s pause).
        scroll_count (default 3): mouse-wheel scrolls used to trigger lazy loading.
        max_cards (default 150): upper bound on cards read per page.
        title_attr: "text" (first line of the card text) or a card attribute name.
        title_sel / title_sel_attr: sub-element (and optional attribute) holding the title.
        title_strip_prefix: prefix removed from the title when present.
        default_location / location_sel: fallback location and location sub-selector.
        link_sel / link_attr (default "href") / url_prefix: where the job URL comes from.
        use_inner_text_as_blob: use the card text as description (and as location if none).
        cookie_accept: selectors of consent buttons to click once on the first page.
        page_param / max_pages (default 8): query-parameter pagination.

    Returns:
        A list of job dicts with the keys id, title, location, url, posted, description.
        A card without a title, or one that raises while being read, is skipped.
    """
    browser = _get_browser()
    ctx = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        locale="en-US",
        viewport={"width": 1366, "height": 768},
    )
    jobs = []
    seen_ids = set()

    try:
        # Everything after new_context() runs inside the try, so the context is closed
        # even when installing the init script or opening the page fails.
        ctx.add_init_script(STEALTH_JS)
        page = ctx.new_page()

        def scrape_current():
            """Extract cards from the currently-loaded page; append new ones to `jobs`.
            Returns the count of newly-added (not-yet-seen) cards so a pagination loop can
            stop once a page contributes nothing new."""
            wait_for = args.get("wait_for")
            if wait_for:
                try:
                    page.wait_for_selector(wait_for, timeout=15000)
                except Exception:
                    # Selector never appeared: give the page a fixed pause instead.
                    page.wait_for_timeout(4000)
            # Scroll a few times to trigger any lazy-loaded results
            for _ in range(args.get("scroll_count", 3)):
                try:
                    page.mouse.wheel(0, 4000)
                    page.wait_for_timeout(700)
                except Exception:
                    break

            cards = page.locator(args["card"])
            n = min(cards.count(), args.get("max_cards", 150))
            added = 0
            for i in range(n):
                card = cards.nth(i)
                try:
                    title = ""
                    if args.get("title_attr") == "text":
                        title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
                    elif args.get("title_attr"):
                        title = (card.get_attribute(args["title_attr"]) or "").strip()
                    elif args.get("title_sel"):
                        t = card.locator(args["title_sel"]).first
                        if t.count():
                            # Read either an attribute (e.g. aria-label) or the inner text
                            if args.get("title_sel_attr"):
                                title = (t.get_attribute(args["title_sel_attr"]) or "").strip()
                            else:
                                title = (t.inner_text() or "").strip()
                    if args.get("title_strip_prefix") and title.startswith(args["title_strip_prefix"]):
                        title = title[len(args["title_strip_prefix"]):].strip()
                    if not title:
                        # Last resort: first line of the card's own text.
                        title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]

                    location = args.get("default_location", "")
                    if args.get("location_sel"):
                        lsel = card.locator(args["location_sel"]).first
                        if lsel.count():
                            location = (lsel.inner_text() or location).strip()

                    link_el = card if not args.get("link_sel") else card.locator(args["link_sel"]).first
                    href = (link_el.get_attribute(args.get("link_attr", "href")) or "") if link_el.count() else ""
                    href = _absolutize(href, args.get("url_prefix", ""))

                    if not title:
                        continue
                    # Cards without a link get a synthetic id from page URL + position.
                    jid = href or f"{page.url}#{i}"
                    if jid in seen_ids:
                        continue
                    description = ""
                    if args.get("use_inner_text_as_blob"):
                        # Use the full card text as both location source and description
                        full = (card.inner_text() or "")
                        description = full[:2000]
                        if not location:
                            location = full[:300]
                    # Mark the card as seen/added only once every read above succeeded, so
                    # a card that fails mid-extraction is not counted as a new result.
                    seen_ids.add(jid)
                    jobs.append({
                        "id": jid,
                        "title": title,
                        "location": location,
                        "url": href or page.url,
                        "posted": "",
                        "description": description,
                    })
                    added += 1
                except Exception:
                    # One unreadable card must not abort the rest of the page.
                    continue
            return added

        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")
        # Optional cookie banner acceptance (once, on the first page)
        for sel in args.get("cookie_accept", []) or []:
            try:
                btn = page.locator(sel).first
                if btn.is_visible(timeout=2000):
                    btn.click()
                    page.wait_for_timeout(500)
            except Exception:
                pass  # a missing or unclickable banner is not an error
        # Optional query-param pagination (e.g. Drupal "?page=N", 0-indexed). The base URL is
        # page 0 (already loaded); fetch successive pages until one adds no new cards.
        page_param = args.get("page_param")
        if page_param:
            base = args["url"]
            joiner = "&" if "?" in base else "?"
            for p in range(args.get("max_pages", 8)):
                if p > 0:
                    page.goto(f"{base}{joiner}{page_param}={p}", timeout=45000,
                              wait_until="domcontentloaded")
                added = scrape_current()
                if p > 0 and added == 0:
                    break
        else:
            scrape_current()
    finally:
        ctx.close()

    return jobs
|
|
|
|
|
|
# Adapter registry: maps the adapter-type string of a COMPANIES entry to the fetch
# function that handles it. main() looks the function up by that string and calls it
# with the entry's args dict.
ADAPTERS = {
    "workday": fetch_workday,
    "ashby": fetch_ashby,
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "smartrecruiters": fetch_smartrecruiters,
    "rss": fetch_rss,
    "getro": fetch_getro,
    "onlyfy": fetch_onlyfy,
    "lever": fetch_lever,
    "json": fetch_json,
    "sbb": fetch_sbb,
    "bkw": fetch_bkw,
    "playwright": fetch_playwright,
}
|
|
|
|
|
|
def location_matches(loc_text):
    """Classify a location string; returns the pair (in_ch, is_remote).

    in_ch is True when the text contains any CH_LOCATION_KEYWORDS entry. is_remote is
    True when the text is not a US-only listing, contains an EU_HINT_KEYWORDS entry, and
    either contains a REMOTE_KEYWORDS entry or is a pan-European posting. An empty or
    missing location yields (False, False).
    """
    if not loc_text:
        return False, False
    text = loc_text.lower()

    def mentions(terms):
        return any(term in text for term in terms)

    in_ch = mentions(CH_LOCATION_KEYWORDS)
    remote_word = mentions(REMOTE_KEYWORDS)
    # A US pattern only disqualifies listings that do not also name a Swiss location.
    us_only = (not in_ch) and mentions(US_ONLY_PATTERNS)
    eu_hint = mentions(EU_HINT_KEYWORDS)
    # Pan-European postings (location literally "Europe"/"EMEA", e.g. QuantCo's Lever board)
    # are reachable for a DACH-based candidate even without an explicit "remote" keyword, so
    # they count as eligible too. City-specific EU roles (e.g. "Berlin or Munich") stay out.
    pan_european = (not us_only) and mentions(("europe", "emea"))

    # Remote/EU-eligible only if it isn't a US-only listing and carries an EU/global hint.
    if us_only or not eu_hint:
        is_remote = False
    else:
        is_remote = remote_word or pan_european
    return in_ch, is_remote
|
|
|
|
|
|
@lru_cache(maxsize=512)
def _kw_pattern(kw):
    """Compile (and cache) the whole-keyword regex for `kw`.

    Plain substring matching produced false hits ('rag' inside 'sto[rag]e'/'tet[rag]on',
    'intern' inside 'inte[rnal]'), so the keyword must not be flanked by a lowercase
    letter or digit. A side of the keyword that starts/ends on a non-alphanumeric
    character (c#, .net, c++) gets no guard, so such keywords still match. Surrounding
    whitespace in `kw` is ignored.
    """
    word = kw.strip()
    pieces = []
    if word[:1].isalnum():
        pieces.append(r"(?<![a-z0-9])")
    pieces.append(re.escape(word))
    if word[-1:].isalnum():
        pieces.append(r"(?![a-z0-9])")
    return re.compile("".join(pieces))
|
|
|
|
|
|
def _kw_in(kw, text):
    """Return True when the pattern built by _kw_pattern(kw) is found anywhere in `text`."""
    hit = _kw_pattern(kw).search(text)
    return hit is not None
|
|
|
|
|
|
def score_job(job, title_only=False):
    """Score a job against POSITIVE_KEYWORDS / NEGATIVE_KEYWORDS.

    The title carries the real signal; the JD body is full of company boilerplate (every
    Kraken post mentions crypto/blockchain/trading, every cloud post mentions python). So
    a positive keyword found in the title scores its full weight, while one found only in
    the body scores half (at least 1) - enough to surface a role without letting
    boilerplate inflate it. Negative keywords count fully wherever they appear (a
    disqualifier in the body still disqualifies). Title-filtered boards pass
    title_only=True, which skips the body entirely.

    Returns:
        (score, positive keywords matched, negative keywords matched).
    """
    title_text = (job.get("title") or "").lower()
    body_text = "" if title_only else (job.get("description") or "").lower()

    total = 0
    hits, flags = [], []
    for keyword, weight in POSITIVE_KEYWORDS.items():
        if _kw_in(keyword, title_text):
            gain = weight
        elif body_text and _kw_in(keyword, body_text):
            gain = max(1, weight // 2)
        else:
            continue
        total += gain
        hits.append(keyword)

    for keyword, penalty in NEGATIVE_KEYWORDS.items():
        in_title = _kw_in(keyword, title_text)
        if in_title or (body_text and _kw_in(keyword, body_text)):
            total += penalty
            flags.append(keyword)
    return total, hits, flags
|
|
|
|
|
|
def load_seen():
    """Return the parsed JSON content of STATE_FILE, or an empty dict if it does not exist."""
    if not STATE_FILE.exists():
        return {}
    raw = STATE_FILE.read_text(encoding="utf-8")
    return json.loads(raw)
|
|
|
|
|
|
def save_seen(seen):
    """Write `seen` to STATE_FILE as indented UTF-8 JSON, creating the directory if needed."""
    folder = STATE_FILE.parent
    folder.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(seen, indent=2, ensure_ascii=False)
    STATE_FILE.write_text(payload, encoding="utf-8")
|
|
|
|
|
|
def load_decisions():
    """Load the decision log from DECISIONS_FILE.

    The log is keyed by job URL: {url: {company, title, decision, note, date}}. It
    persists across runs so roles that were already judged are not re-evaluated; the
    decision value is free text (shortlist / skip / applied / paused / rejected ...) and
    is not validated. A missing file yields an empty dict.
    """
    if not DECISIONS_FILE.exists():
        return {}
    raw = DECISIONS_FILE.read_text(encoding="utf-8")
    return json.loads(raw)
|
|
|
|
|
|
def save_decisions(decisions):
    """Write `decisions` to DECISIONS_FILE as indented UTF-8 JSON, creating the directory if needed."""
    folder = DECISIONS_FILE.parent
    folder.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(decisions, indent=2, ensure_ascii=False)
    DECISIONS_FILE.write_text(payload, encoding="utf-8")
|
|
|
|
|
|
def _parse_posted(s):
    """Best-effort conversion of an adapter's `posted` value into a date.

    Tried in order: ISO 8601 (a trailing "Z" is accepted), then the first ten characters
    as YYYY-MM-DD, DD.MM.YYYY, YYYY/MM/DD or DD/MM/YYYY, and finally the first YYYY-MM-DD
    found anywhere in the text. Returns None for anything else: empty or non-string
    values, or relative phrases such as Workday's "Posted 5 Days Ago".
    """
    if not isinstance(s, str) or not s:
        return None
    text = s.strip()

    try:
        return datetime.fromisoformat(text.replace("Z", "+00:00")).date()
    except ValueError:
        pass

    # Remaining attempts as (candidate text, strptime format), in priority order.
    head = text[:10]
    attempts = [(head, fmt) for fmt in ("%Y-%m-%d", "%d.%m.%Y", "%Y/%m/%d", "%d/%m/%Y")]
    embedded = re.search(r"\d{4}-\d{2}-\d{2}", text)
    if embedded:
        attempts.append((embedded.group(0), "%Y-%m-%d"))

    for candidate, fmt in attempts:
        try:
            return datetime.strptime(candidate, fmt).date()
        except ValueError:
            continue
    return None
|
|
|
|
|
|
def write_stats_table(stats, total_secs):
    """Build the "Scan stats" markdown table: one row per company plus a totals row.

    Args:
        stats: list of dicts with company, scraped, eligible, match, newest (a date or
            None), secs, and an optional truthy "error" flag (adds a warning sign).
        total_secs: wall-clock seconds shown in the totals row.

    Returns:
        The table as a list of lines, ending with an empty string.
    """
    table = [
        "## Scan stats\n",
        "| Company | Scraped | CH/Remote | Match ≥2 | Newest posting | Time (s) |",
        "|---|--:|--:|--:|:--|--:|",
    ]
    totals = {"scraped": 0, "eligible": 0, "match": 0}
    latest = None
    for row in stats:
        label = row["company"]
        if row.get("error"):
            label += " ⚠️"
        row_newest = row["newest"]
        newest_cell = row_newest.isoformat() if row_newest else "—"
        table.append(f"| {label} | {row['scraped']:,} | {row['eligible']:,} | "
                     f"{row['match']:,} | {newest_cell} | {row['secs']:.1f} |")
        for column in totals:
            totals[column] += row[column]
        # Track the most recent posting date across all boards.
        if row_newest and (latest is None or row_newest > latest):
            latest = row_newest

    latest_cell = latest.isoformat() if latest else "—"
    table.append(f"| **Total ({len(stats)})** | **{totals['scraped']:,}** | "
                 f"**{totals['eligible']:,}** | **{totals['match']:,}** | "
                 f"**{latest_cell}** | **{total_secs:.1f}** |")
    table.append("")
    return table
|
|
|
|
|
|
def write_report(path, results, errors, new_only, include_weak, stats=None, total_secs=0.0,
                 decisions=None, hide_decided=False):
    """Render the scan results as a markdown report and write it to `path`.

    Layout: title and summary counts, optional scan-stats table, optional error list,
    then the roles grouped into Strong (score >= 6) and Medium (2-5) buckets - plus Weak
    (< 2) when include_weak is set - and finally the MANUAL_CHECK checklist.

    Args:
        path: pathlib-style target; the report is written as UTF-8 via write_text().
        results: result dicts as built by main() (company, title, location, url, posted,
            score, pos, neg, in_ch, remote, is_new).
        errors: (company, message) pairs for boards that failed to load.
        new_only: only affects the title suffix; filtering is done by the caller.
        include_weak: also list roles scoring below 2.
        stats: per-company scan stats for write_stats_table(); omitted when falsy.
        total_secs: total scan time shown in the stats table.
        decisions: decision log keyed by job URL; matching roles are tagged inline.
        hide_decided: drop roles that have an entry in `decisions` from the buckets.
    """
    decisions = decisions or {}
    # UTC, like the date main() uses for the report file name and the state files, so
    # the heading and the file name cannot disagree around midnight.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    n_match = sum(1 for r in results if r["score"] >= 2)
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Eligible (CH/remote): **{len(results)}** · interest matches (score ≥ 2): "
        f"**{n_match}** · **{n_new}** new since last run\n",
    ]
    if stats:
        lines += write_stats_table(stats, total_secs)
    if errors:
        lines.append("## Errors\n")
        for company, err in errors:
            lines.append(f"- **{company}**: {err}")
        lines.append("")

    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak = [r for r in results if r["score"] < 2]

    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")
    n_decided = sum(1 for r in results if r["url"] in decisions)
    if n_decided:
        shown = "hidden" if hide_decided else "tagged inline"
        lines.append(f"_{n_decided} role(s) already in the decision log ({shown}; "
                     f"see state/decisions.json)._")

    buckets = [("Strong fit (score >= 6)", strong),
               ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))

    for bucket_name, bucket in buckets:
        shown = [r for r in bucket if not (hide_decided and r["url"] in decisions)]
        if not shown:
            continue
        lines.append(f"\n## {bucket_name} - {len(shown)} role(s)\n")
        for r in shown:
            d = decisions.get(r["url"])
            # The decision log is a hand-editable JSON file: tolerate a bare value in
            # place of the record dict, and a record without a "decision" entry.
            if d and not isinstance(d, dict):
                d = {"decision": str(d)}
            status = str(d.get("decision") or "unknown") if d else ""
            new_tag = " [NEW]" if r["is_new"] else ""
            decided_tag = f" — 🗂 {status.upper()}" if d else ""
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}{decided_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if d:
                note = f" — {d['note']}" if d.get("note") else ""
                lines.append(f"- 🗂 Decision: **{status}**{note} ({d.get('date','')})")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    if MANUAL_CHECK:
        lines.append("\n## Manual check (companies without scrapable APIs)\n")
        lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
        lines.append("Open each link, scan for new postings since your last quarterly review:\n")
        for name, note, url in MANUAL_CHECK:
            lines.append(f"- [ ] **{name}** — {note}: <{url}>")
        lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")
|
|
|
|
|
|
def main():
    """Command-line entry point: record a decision, or scan all boards and write the report.

    With --decide "<url>" <status> [note...] the decision is stored in the decision log and
    the function returns without scanning. Otherwise every COMPANIES entry (or only the
    one named by --only=<id>) is fetched through its adapter, filtered to CH/remote-eligible
    roles, de-duplicated by title, scored, and written to REPORTS_DIR/<UTC date>.md.
    --new-only, --include-weak and --hide-decided adjust what the report shows. A board
    that fails to load is listed in the report's error section and does not stop the run.
    """
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")

    # Record a decision and exit: --decide "<url>" <status> [note words...]
    if "--decide" in sys.argv:
        rest = sys.argv[sys.argv.index("--decide") + 1:]
        if len(rest) < 2:
            print('Usage: --decide "<url>" <status> [note...]', file=sys.stderr)
            return
        url, status, note = rest[0], rest[1], " ".join(rest[2:])
        decisions = load_decisions()
        # Keep company/title from an existing entry; only status, note and date change.
        prev = decisions.get(url, {})
        decisions[url] = {"company": prev.get("company", ""), "title": prev.get("title", ""),
                          "decision": status, "note": note, "date": today}
        save_decisions(decisions)
        print(f"Recorded: {status} — {url}", file=sys.stderr)
        return

    only, new_only, include_weak, hide_decided = None, False, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg == "--hide-decided":
            hide_decided = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]

    seen = load_seen()
    decisions = load_decisions()
    all_results, errors, stats = [], [], []
    run_start = time.perf_counter()

    try:
        for cid, display, adapter, args in COMPANIES:
            if only and cid != only:
                continue
            print(f"Fetching {display}...", file=sys.stderr)
            t0 = time.perf_counter()
            try:
                jobs = ADAPTERS[adapter](args)
            except Exception as e:
                # Network/parse failures are reported as-is; anything else is labelled
                # "unexpected". Either way the board is skipped and the scan continues.
                known = isinstance(e, (urllib.error.URLError, urllib.error.HTTPError, ValueError))
                errors.append((display, repr(e) if known else f"unexpected: {e!r}"))
                stats.append({"company": display, "scraped": 0, "eligible": 0,
                              "match": 0, "newest": None, "secs": time.perf_counter() - t0,
                              "error": True})
                continue

            scraped = len(jobs)
            # Optional per-company title prefilter for high-volume boards
            title_filter = args.get("_title_filter")
            if title_filter:
                jobs = [j for j in jobs
                        if any(_kw_in(k, (j.get("title") or "").lower()) for k in title_filter)]

            # Newest posting on the board (board freshness), across parseable dates.
            dates = [d for j in jobs if (d := _parse_posted(j.get("posted")))]
            newest = max(dates) if dates else None

            company_seen = seen.setdefault(cid, {})
            title_seen = set()
            eligible = match = 0
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                in_ch, is_remote = location_matches(j.get("location", ""))
                if not (in_ch or is_remote):
                    continue
                # Collapse the same role posted once per remote country (title differs only
                # by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
                norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
                if norm_title in title_seen:
                    continue
                title_seen.add(norm_title)
                eligible += 1
                is_new = jid not in company_seen
                score, pos, neg = score_job(j, title_only=bool(title_filter))
                # Pre-filtered boards (e.g. SBB, already narrowed to IT+Bern by the adapter) carry
                # German/generic titles the profile scorer can't read; a _score_floor keeps their
                # already-relevant results out of the hidden weak bucket.
                floor = args.get("_score_floor")
                if floor is not None and score < floor:
                    score = floor
                if score >= 2:
                    match += 1
                all_results.append({
                    "company": display, "company_id": cid,
                    "title": j["title"], "location": j["location"],
                    "url": j["url"], "posted": j.get("posted", ""),
                    "score": score, "pos": pos, "neg": neg,
                    "in_ch": in_ch, "remote": is_remote, "is_new": is_new,
                })
                # Refresh the stored title but keep the date the job was first seen;
                # only a job with no earlier record is stamped with today's date.
                prior = company_seen.get(jid)
                first_seen = (prior.get("first_seen") if isinstance(prior, dict) else None) or today
                company_seen[jid] = {"title": j["title"], "first_seen": first_seen}

            stats.append({"company": display, "scraped": scraped, "eligible": eligible,
                          "match": match, "newest": newest,
                          "secs": time.perf_counter() - t0, "error": False})
    finally:
        # Shut the shared headless browser down even when the scan is interrupted.
        _close_browser()

    save_seen(seen)
    total_secs = time.perf_counter() - run_start

    if new_only:
        all_results = [r for r in all_results if r["is_new"]]

    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak,
                 stats=stats, total_secs=total_secs,
                 decisions=decisions, hide_decided=hide_decided)

    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new) | "
          f"scanned {len(stats)} companies in {total_secs:.1f}s", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)
|
|
|
|
|
|
# === Adapter coverage (refreshed 2026-06-01) ==================================
|
|
# 25 companies automated across 13 adapter types; MANUAL_CHECK is empty.
|
|
#
|
|
# Automated (COMPANIES above):
|
|
# workday nvidia, novartis
|
|
# ashby kraken, openai, confluent
|
|
# greenhouse anthropic, gitlab, grafana
|
|
# pcsx microsoft (Eightfold position-search endpoint)
|
|
# smartrecruiters metgroup, ldc
|
|
# rss bis (vacancies.rss — RSS 1.0/RDF)
|
|
# getro coinbase_ventures (web3 portfolio network, collection 1625)
|
|
# onlyfy bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment)
|
|
# lever palantir, quantco (api.lever.co; QuantCo slug is "quantco-")
|
|
# json swissgrid (Magnolia /.rest/cloud/component-data)
|
|
# sbb sbb (company.sbb.ch AEM jobfilter.results.json)
|
|
# bkw bkw (jobs.bkw.com PMS structureddata API)
|
|
# playwright google, apple, meta, roche, cisco, ruag (headless browser, 3-15s each)
|
|
#
|
|
# 2026-06-01 list review (verified live):
|
|
# - Palantir (lever): 221 postings, US/London-heavy so Swiss/Schwyz roles are rare but
|
|
# self-surface (FDSE/Deployment-Strategist titles map to his FDE drafts).
|
|
# - Swissgrid (json): Magnolia CMS endpoint; placeOfWork is bare city, so loc_suffix tags
|
|
# it Switzerland for the CH filter. ~13 roles incl. Data Scientist / Applied-ML.
|
|
# - RUAG (playwright + page_param): Drupal portal, 20 jobs/page, paginated ?page=N. Page 0
|
|
# is apprenticeship-heavy; eng roles (DevOps/Data/Software) are on later pages, so we
|
|
# page through (max_pages). ENG_TITLE_FILTER cuts the Lehrstelle bulk. ⚠️ DE-citizen
|
|
# limits on RUAG classified roles — verify per-role.
|
|
# - SBB (sbb): correct host is company.sbb.ch (not company-jobs.sbb.ch). Flat JSON list;
|
|
# fetch_sbb replicates the user's IT + Bern-region filter. German/generic titles, so a
|
|
# _score_floor keeps the pre-filtered results visible. ⚠️ DE-citizen limits possible.
|
|
# - BKW (bkw): real host is jobs.bkw.com (PMS structureddata API), ~600 group-wide roles;
|
|
# fetch_bkw keeps Berufsfeld categories Informatik/Trading/Finanzen (IT/data + energy
|
|
# trading: Quant Risk, Solution Architect Energiehandel, ...). _score_floor as above.
|
|
# - QuantCo (lever, slug "quantco-"): ~16 roles, most tagged "Europe" (hybrid; Zürich is
|
|
# QuantCo's continental hub), surfaced via the EU-wide rule in location_matches. Strong:
|
|
# AI Engineer; medium: Cloud Engineer, AI Applied Scientist, Data Scientist, Quant
|
|
# Researcher, Software Engineer. Interns/frontend suppressed by NEGATIVE_KEYWORDS.
|
|
# The Bern/Thun tier intentionally relaxes the comp bar (see user_comp_bar memory).
|
|
#
|
|
# MANUAL_CHECK is empty — every target company is automated. Dropped 2026-06-01: BFH
|
|
# (academic FH pay below the relaxed Bern/Thun floor, research-leaning, 403s anyway) and
|
|
# Dialectic (~50-person crypto VC, 0 open roles; crypto already covered by Kraken / Bitcoin
|
|
# Suisse / Coinbase Ventures).
|
|
#
|
|
# Earlier history: Google/Apple/Meta/Roche/Cisco automated via playwright; Microsoft via
|
|
# pcsx; BIS via rss; Coinbase Ventures via getro; Bitcoin Suisse via onlyfy. Dropped:
|
|
# ClickHouse, Vitol, Sygnum (Glassdoor/comp red flags), IBM Research + Sonova (low fit),
|
|
# Coinbase-the-employer (hiring freeze), AMINA (poor Glassdoor), Canonical (pay+culture).
|
|
# The Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase itself.
|
|
# ==============================================================================
|
|
|
|
|
|
# Run the scout only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|