|
|
|
@@ -0,0 +1,686 @@
|
|
|
|
|
"""Job scout for Dennis's quarterly target companies.
|
|
|
|
|
|
|
|
|
|
Pulls latest openings from companies with known public ATS APIs (Workday/Ashby/Greenhouse),
|
|
|
|
|
filters by Swiss location or remote eligibility, scores fit against profile keywords, tracks
|
|
|
|
|
which job IDs we've already seen, writes a markdown report.
|
|
|
|
|
|
|
|
|
|
Usage:
|
|
|
|
|
py scout.py # Pull all configured companies (strong + medium only)
|
|
|
|
|
py scout.py --only=nvidia # Pull a single company by id
|
|
|
|
|
py scout.py --new-only # Report only jobs not seen before
|
|
|
|
|
py scout.py --include-weak # Include weak/noise bucket (default hidden)
|
|
|
|
|
|
|
|
|
|
State : state/seen_jobs.json
|
|
|
|
|
Output: reports/YYYY-MM-DD.md
|
|
|
|
|
|
|
|
|
|
To add a company: append to COMPANIES with one of the existing adapter types.
|
|
|
|
|
For companies behind custom careers sites (Google, MS, Meta, Apple, Roche, Novartis, IBM,
|
|
|
|
|
Cisco, Sonova, Sygnum) — see MANUAL_CHECK and the adapter probe notes at the bottom.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
import json
|
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
|
import urllib.error
|
|
|
|
|
import urllib.parse
|
|
|
|
|
import urllib.request
|
|
|
|
|
from datetime import datetime, timezone
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
# Directory containing this script; state and reports are stored beneath it.
ROOT = Path(__file__).parent
# JSON file remembering which job ids were already seen, grouped by company id.
STATE_FILE = ROOT / "state" / "seen_jobs.json"
# Directory that receives the markdown report (one file per run date).
REPORTS_DIR = ROOT / "reports"
# User-Agent header sent by http_get_json unless the caller supplies its own.
USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)"
|
|
|
|
|
|
|
|
|
|
# Lowercase place names that mark a posting as located in Switzerland.
# location_matches() compares them against the lowercased location text.
CH_LOCATION_KEYWORDS = [
    "switzerland", "zurich", "zürich", "basel", "bern", "geneva", "genf",
    "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
]

# Lowercase terms that mark a posting as remote-capable.
REMOTE_KEYWORDS = ["remote"]

# Lowercase phrases that identify a remote role restricted to the United States.
# A posting that also mentions a Swiss location is not treated as US-only.
US_ONLY_PATTERNS = [
    "remote - us", "remote, us", "remote-us", "us remote", "us-remote",
    "remote-friendly us", "remote (us)", "united states - remote",
    "remote, united states",
]

# Lowercase regions that must accompany "remote" for a role to count as
# remote-eligible; the Swiss keywords are included as well.
EU_HINT_KEYWORDS = [
    "germany", "france", "spain", "portugal", "ireland", "netherlands",
    "sweden", "norway", "finland", "denmark", "poland", "czech",
    "romania", "italy", "austria", "belgium", "uk", "united kingdom",
    "europe", "emea", "global",
] + CH_LOCATION_KEYWORDS
|
|
|
|
|
|
|
|
|
|
# Keyword -> weight added to a job's score when the keyword is found in the
# lowercased title + description (see score_job). Higher means a better fit.
POSITIVE_KEYWORDS = {
    "genai": 3, "generative ai": 3, "llm": 3, "large language model": 3,
    "applied ai": 3, "applied ml": 3, "ai engineer": 3, "ml engineer": 3,
    "mlops": 3, "ai platform": 3, "ml platform": 3,
    "python": 2, "java": 2, "data engineer": 2, "data engineering": 2,
    "solutions architect": 2, "platform engineer": 2,
    "ai infrastructure": 2, "inference": 2, "rag": 2, "agentic": 2,
    "kubernetes": 1, "docker": 1, "etl": 1, "pipeline": 1,
    "crypto": 2, "blockchain": 2, "web3": 2, "solidity": 3,
    "senior": 1, "staff": 1, "lead": 1, "principal": 1,
}

# Keyword -> (negative) weight added to a job's score when the keyword is found
# in the lowercased title + description (see score_job). These push roles that
# are a poor fit (low-level GPU/hardware, frontend/mobile, entry-level) down.
NEGATIVE_KEYWORDS = {
    "cuda": -3, "kernel driver": -3, "gpu programming": -3,
    "compiler engineer": -3, "pytorch internals": -3, "jax internals": -3,
    "rdma": -2, "infiniband": -2, "nccl": -3, "hpc cluster": -2,
    "frontend": -3, "front-end": -3, "react native": -3,
    "ios engineer": -3, "android engineer": -3, "mobile engineer": -3,
    "ui engineer": -2, "ux engineer": -2,
    "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
    "physical design": -3, "silicon": -2,
    "expert c++": -2, "5+ years c++": -2, "deep c++": -2,
    "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
}
|
|
|
|
|
|
|
|
|
|
# Companies that are fetched automatically.
# Each entry is a tuple: (id, display, adapter, adapter_args)
#   id           - short key used for --only=<id> and for grouping seen-job state
#   display      - company name shown in the report
#   adapter      - key into ADAPTERS selecting the fetch function
#   adapter_args - dict handed unchanged to that fetch function
#
# Keys read by the "playwright" adapter (see fetch_playwright):
#   url                    page to open
#   wait_for               selector awaited before scraping
#   card                   selector matching one element per job
#   title_attr             "text" = first line of the card text, otherwise an attribute name on the card
#   title_sel              selector inside the card holding the title
#   title_sel_attr         attribute of the title_sel element to read instead of its text
#   title_strip_prefix     prefix removed from the extracted title
#   link_sel / link_attr   element inside the card (default: the card itself) and attribute holding the link
#   url_prefix             prefix joined onto relative links
#   default_location       location used when nothing else supplies one
#   scroll_count           number of scroll steps used to trigger lazy loading
#   use_inner_text_as_blob use the card text as description (and as location when none is set)
#   cookie_accept          selectors of cookie-banner buttons to click
COMPANIES = [
    ("nvidia", "NVIDIA", "workday", {
        "host": "nvidia.wd5.myworkdayjobs.com",
        "tenant": "nvidia",
        "site": "NVIDIAExternalCareerSite",
        "search_text": "Switzerland",
    }),
    ("kraken", "Kraken", "ashby", {"slug": "kraken.com"}),
    ("openai", "OpenAI", "ashby", {"slug": "openai"}),
    ("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
    ("novartis", "Novartis", "workday", {
        "host": "novartis.wd3.myworkdayjobs.com",
        "tenant": "novartis",
        "site": "Novartis_Careers",
        "search_text": "Switzerland",
    }),
    # PCSX (Eightfold) — Microsoft has a public position search endpoint
    ("microsoft", "Microsoft", "pcsx", {
        "domain": "microsoft.com",
        "location": "Switzerland",
    }),
    # Sygnum — WordPress AJAX endpoint returns clean JSON
    # NOTE(review): the _wpnonce value is hard-coded in the URL; WordPress nonces
    # normally expire, so confirm this endpoint keeps accepting it over time.
    ("sygnum", "Sygnum", "wp_ajax", {
        "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
    }),
    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
    # aria-label link; location lives in the card text (captured via blob mode).
    ("google", "Google", "playwright", {
        "url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
        "wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
        "card": "li:has(a[aria-label*='Learn more about'])",
        "title_sel": "a[aria-label*='Learn more about']",
        "title_sel_attr": "aria-label",
        "title_strip_prefix": "Learn more about ",
        "link_sel": "a[href*='jobs/results/']",
        "link_attr": "href",
        "url_prefix": "https://www.google.com/about/careers/applications/",
        "default_location": "",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
        "cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
    }),
    ("apple", "Apple", "playwright", {
        "url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
        "wait_for": "a[href*='/en-us/details/']",
        "card": "a[href*='/en-us/details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://jobs.apple.com",
        "default_location": "Switzerland",
    }),
    # Meta job links are /profile/job_details/<id>; title + location are in the link text.
    ("meta", "Meta", "playwright", {
        "url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
        "wait_for": "a[href*='/profile/job_details/']",
        "card": "a[href*='/profile/job_details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://www.metacareers.com",
        "default_location": "Zurich, Switzerland",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
    # PhenomPeople pattern (Roche) uses li.jobs-list-item.
    # Card inner text is structured like: "<title> | Location | <city, country> | Category | ..."
    # We extract title from first line, full text becomes the "description" so our location
    # filter still sees Switzerland mentions.
    ("roche", "Roche", "playwright", {
        "url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
        "wait_for": "li.jobs-list-item, a.au-target",
        "card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
        "title_attr": "text",
        "link_sel": "a[href]",
        "link_attr": "href",
        "url_prefix": "https://careers.roche.com",
        "default_location": "",
        "cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
        "scroll_count": 6,
        "use_inner_text_as_blob": True,
    }),
    # Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
    ("cisco", "Cisco", "playwright", {
        "url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
        "wait_for": "a[href*='/job/'], div[role='listitem']",
        "card": "div[role='listitem']:has(a[href*='/job/'])",
        "title_sel": "a[href*='/job/']",
        "link_sel": "a[href*='/job/']",
        "link_attr": "href",
        "url_prefix": "https://careers.cisco.com",
        "default_location": "Switzerland",
        "cookie_accept": ["#onetrust-accept-btn-handler"],
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
    ("ibm", "IBM Research", "playwright", {
        # IBM Research Zurich careers page is mostly a static intro with few openings.
        # Use IBM's main careers search filtered to Switzerland instead.
        "url": "https://www.ibm.com/careers/search?q=&field_keyword_05[0]=Switzerland",
        "wait_for": "a[href*='/careers/'], a[href*='ibm.com/employment']",
        "card": "li:has(a[href*='/careers/']), a[href*='/careers/']:has(h3)",
        "title_sel": "h3, h4",
        "link_sel": "a[href*='/careers/']",
        "link_attr": "href",
        "url_prefix": "https://www.ibm.com",
        "default_location": "Switzerland",
        "scroll_count": 4,
    }),
]
|
|
|
|
|
|
|
|
|
|
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
# These surface as a clickable checklist in the report so they're not forgotten.
# Each entry is a tuple: (company name, reason it is manual, URL to open by hand).
MANUAL_CHECK = [
    ("Sonova", "PhenomPeople serves empty shell to automation (body never renders); widgets API rejects requests",
     "https://careers.sonova.com/us/en/search-results?keywords=Switzerland"),
    ("Coinbase", "/careers/positions 302-redirects to landing; no job links or ATS API exposed even with stealth",
     "https://www.coinbase.com/careers"),
    ("AMINA Bank", "jobs are at /careers/ (#positions) via JS widget; only ~4 apply links, no scrapable list",
     "https://aminagroup.com/careers/#positions"),
    ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
     "https://bitcoinsuisse.com/careers#open-positions"),
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def http_get_json(url, headers=None, data=None, method="GET"):
    """Send an HTTP request and return the response body parsed as JSON.

    Args:
        url: Absolute URL to request.
        headers: Optional mapping of extra request headers. The mapping is
            copied, so the caller's object is never modified. ``User-Agent``
            and ``Accept`` are filled in when absent.
        data: Optional request body. A dict is serialised to UTF-8 JSON and a
            JSON ``Content-Type`` is added unless one was supplied; any other
            value is handed to urllib unchanged.
        method: HTTP method name.

    Returns:
        The decoded JSON document.

    Raises:
        urllib.error.URLError: On network or HTTP failures (HTTPError is a
            subclass of URLError).
        ValueError: If the body is not valid UTF-8 or not valid JSON.
    """
    # Work on a copy: setdefault() on the caller's dict would leak our default
    # headers into an object the caller may reuse for other requests.
    request_headers = dict(headers or {})
    request_headers.setdefault("User-Agent", USER_AGENT)
    request_headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        request_headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=request_headers, method=method)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_workday(args):
    """Fetch job postings from a Workday career site via its CXS jobs endpoint.

    Args:
        args: Adapter arguments. Required keys: ``host``, ``tenant``, ``site``.
            Optional keys: ``search_text`` (free-text query, default empty) and
            ``max_results`` (safety cap on how many postings are paged through,
            default 2000).

    Returns:
        A list of job dicts with the keys ``id``, ``title``, ``location``,
        ``url``, ``posted`` and ``description`` (always empty here because the
        listing response carries no description).

    Raises:
        urllib.error.URLError / ValueError: Propagated from http_get_json.
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    max_results = args.get("max_results", 2000)
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    page_size = 20
    jobs, offset, total = [], 0, 0
    while offset < max_results:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {}, "limit": page_size, "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath") or ""
            bullets = p.get("bulletFields") or []
            jobs.append({
                # First bullet field when present, otherwise the external path.
                "id": bullets[0] if bullets else ext,
                "title": p.get("title", ""),
                # The path is appended so the location filter also sees any
                # place names embedded in it.
                "location": (p.get("locationsText") or "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # Keep the last non-zero total: a later page that reports a missing or
        # zero total must not end pagination before all postings were read.
        total = data.get("total") or total
        offset += len(postings)
        if not postings or (total and offset >= total):
            break
    return jobs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_ashby(args):
    """Fetch every posting from an Ashby job board.

    Args:
        args: Adapter arguments; ``slug`` names the board.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``,
        ``posted``, ``description``, ``department``). ``location`` joins the
        primary location and all secondary locations with " | ";
        ``description`` is the plain-text description cut to 2500 characters.
    """
    board = http_get_json(
        f"https://api.ashbyhq.com/posting-api/job-board/{args['slug']}?includeCompensation=true"
    )

    def _location_blob(posting):
        # Secondary locations may be dicts (with a "location" field) or bare values.
        extras = [
            entry.get("location", "") if isinstance(entry, dict) else str(entry)
            for entry in (posting.get("secondaryLocations", []) or [])
        ]
        primary = posting.get("location", "") or ""
        return " | ".join([primary, *extras])

    return [
        {
            "id": posting.get("id"),
            "title": posting.get("title", ""),
            "location": _location_blob(posting),
            "url": posting.get("jobUrl"),
            "posted": posting.get("publishedAt", ""),
            "description": (posting.get("descriptionPlain") or "")[:2500],
            "department": posting.get("department", ""),
        }
        for posting in board.get("jobs", [])
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_greenhouse(args):
    """Fetch every posting from a Greenhouse job board.

    Args:
        args: Adapter arguments; ``board`` is the board token.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``,
        ``posted``, ``description``). ``location`` combines the job's location
        name with its office names; ``description`` is the posting content
        reduced to plain text and cut to 2500 characters.

    Raises:
        urllib.error.URLError / ValueError: Propagated from http_get_json.
    """
    import html  # local import: only this adapter needs entity decoding

    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs", []):
        loc = (j.get("location") or {}).get("name") or ""
        offices = j.get("offices") or []
        office_names = " | ".join(
            (o.get("name") or "") for o in offices if isinstance(o, dict)
        )
        loc_blob = " ".join(x for x in [loc, office_names] if x)

        # The content field arrives as entity-escaped HTML ("&lt;p&gt;..."), so
        # it must be unescaped before the tag-stripping regex can see any tags.
        desc = html.unescape(j.get("content") or "")
        desc = re.sub(r"<[^>]+>", " ", desc)
        # Second pass decodes entities that were part of the visible text
        # (e.g. "&amp;" or "&nbsp;") once the markup is gone.
        desc = html.unescape(desc)
        desc = re.sub(r"\s+", " ", desc).strip()

        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            "description": desc[:2500],
        })
    return jobs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_pcsx(args):
    """Page through the Eightfold PCSX search API on apply.careers.microsoft.com.

    Args:
        args: Adapter arguments. ``domain`` is required; ``location`` is an
            optional location filter (default empty).

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``,
        ``posted``, ``description``); descriptions are cut to 2000 characters.
        Paging stops at the first page with fewer than 50 positions or once
        500 positions have been requested.
    """
    domain = args["domain"]
    location_q = urllib.parse.quote(args.get("location", ""))
    site = "https://apply.careers.microsoft.com"

    found = []
    start = 0
    while start < 500:
        payload = http_get_json(
            f"{site}/api/pcsx/search?domain={domain}&query=&location={location_q}&start={start}&num=50",
            headers={"Referer": f"{site}/careers?location={location_q}"},
        )
        positions = (payload.get("data") or {}).get("positions", []) or []
        found.extend(
            {
                "id": str(pos.get("id")),
                "title": pos.get("name", ""),
                "location": " | ".join(pos.get("locations") or []),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{pos.get('displayJobId') or pos.get('id')}",
                "posted": pos.get("postedTs", ""),
                "description": (pos.get("description") or "")[:2000],
            }
            for pos in positions
        )
        # A short (or empty) page means there is nothing further to fetch.
        if len(positions) < 50:
            break
        start += len(positions)
    return found
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_wp_ajax(args):
    """Read jobs from a WordPress admin-ajax style JSON endpoint (used for Sygnum).

    Args:
        args: Adapter arguments; ``url`` is the endpoint to request.

    Returns:
        A list of job dicts, or an empty list when the response is not a JSON
        array. The id is "<title>|<location>" cut to 120 characters, and the
        endpoint URL stands in when an entry has no ``application_url``.
    """
    endpoint = args["url"]
    payload = http_get_json(endpoint)
    if not isinstance(payload, list):
        return []

    def _joined(*parts):
        # Space-join only the parts that are non-empty.
        return " ".join(part for part in parts if part)

    return [
        {
            "id": (row.get("title", "") + "|" + row.get("location", ""))[:120],
            "title": row.get("title", ""),
            "location": _joined(row.get("location", ""), row.get("work_type", "")),
            "url": row.get("application_url") or args["url"],
            "posted": "",
            "description": _joined(row.get("department", ""), row.get("role_type", "")),
        }
        for row in payload
    ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# The original permissions.query is bound to navigator.permissions before it is
# wrapped: calling the detached function would fail for every query other than
# 'notifications' because the method would run without its receiver.
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _q = navigator.permissions && navigator.permissions.query
    ? navigator.permissions.query.bind(navigator.permissions) : null;
if (_q) {
    navigator.permissions.query = (p) => p && p.name === 'notifications'
        ? Promise.resolve({state: Notification.permission}) : _q(p);
}
"""
|
|
|
|
|
|
|
|
|
|
# Process-wide Playwright driver ("pw") and browser handles, created on first use.
_playwright_singleton = {"pw": None, "browser": None}


def _get_browser():
    """Lazy-init a single shared headless browser. Saves ~3s per company.

    Returns:
        The shared Chromium browser, launching it on the first call.

    Raises:
        RuntimeError: If the playwright package is not installed.
        Exception: Whatever Playwright raises when the driver cannot start or
            the browser cannot launch.
    """
    if _playwright_singleton["browser"] is not None:
        return _playwright_singleton["browser"]
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e

    pw = _playwright_singleton["pw"]
    if pw is None:
        pw = sync_playwright().start()
        # Record the driver before launching: if launch() raises, the driver is
        # still known, so it can be stopped later and is reused (not started a
        # second time) by the next call.
        _playwright_singleton["pw"] = pw
    browser = pw.chromium.launch(
        headless=True,
        args=["--disable-blink-features=AutomationControlled"],
    )
    _playwright_singleton["browser"] = browser
    return browser
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _absolutize(href, prefix):
|
|
|
|
|
"""Join a possibly-relative href with the configured prefix."""
|
|
|
|
|
if not href or href.startswith("http"):
|
|
|
|
|
return href
|
|
|
|
|
cleaned = href.lstrip("./").lstrip("/")
|
|
|
|
|
if not prefix:
|
|
|
|
|
return href
|
|
|
|
|
return prefix.rstrip("/") + "/" + cleaned
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _close_browser():
    """Shut down the shared browser and Playwright driver, if they were started.

    Teardown is best effort: a failure is reported on stderr and does not
    raise. The singleton is cleared either way, so a later _get_browser() call
    launches a fresh browser instead of returning a closed one.
    """
    browser = _playwright_singleton["browser"]
    pw = _playwright_singleton["pw"]
    _playwright_singleton["browser"] = None
    _playwright_singleton["pw"] = None

    # Close the browser before stopping the driver that owns it.
    for label, resource, method_name in (
        ("browser", browser, "close"),
        ("playwright driver", pw, "stop"),
    ):
        if not resource:
            continue
        try:
            getattr(resource, method_name)()
        except Exception as e:  # best effort: never let teardown abort the run
            print(f"warning: could not shut down {label}: {e!r}", file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _pw_dismiss_cookie_banner(page, selectors):
    """Click the first cookie-banner button that becomes visible (best effort)."""
    for sel in selectors or []:
        try:
            btn = page.locator(sel).first
            # is_visible() answers immediately and ignores its timeout, so a
            # banner that renders slightly late would be missed; wait_for()
            # actually waits up to the timeout.
            btn.wait_for(state="visible", timeout=2000)
            btn.click()
            page.wait_for_timeout(500)
            return
        except Exception:
            # Banner absent or a different variant: try the next selector.
            continue


def _pw_extract_card(card, index, args):
    """Build a job dict from one result card, or return None if it has no title."""
    cached_text = []

    def card_text():
        # inner_text() is a browser round-trip; read it at most once per card.
        if not cached_text:
            cached_text.append(card.inner_text() or "")
        return cached_text[0]

    def first_line():
        return card_text().strip().split("\n", 1)[0][:200]

    title = ""
    if args.get("title_attr") == "text":
        title = first_line()
    elif args.get("title_attr"):
        title = (card.get_attribute(args["title_attr"]) or "").strip()
    elif args.get("title_sel"):
        t = card.locator(args["title_sel"]).first
        if t.count():
            # Read either an attribute (e.g. aria-label) or the inner text
            if args.get("title_sel_attr"):
                title = (t.get_attribute(args["title_sel_attr"]) or "").strip()
            else:
                title = (t.inner_text() or "").strip()
    strip_prefix = args.get("title_strip_prefix")
    if strip_prefix and title.startswith(strip_prefix):
        title = title[len(strip_prefix):].strip()
    if not title:
        title = first_line()

    location = args.get("default_location", "")
    if args.get("location_sel"):
        lsel = card.locator(args["location_sel"]).first
        if lsel.count():
            location = (lsel.inner_text() or location).strip()

    link_el = card if not args.get("link_sel") else card.locator(args["link_sel"]).first
    href = (link_el.get_attribute(args.get("link_attr", "href")) or "") if link_el.count() else ""
    href = _absolutize(href, args.get("url_prefix", ""))

    if not title:
        return None

    description = ""
    if args.get("use_inner_text_as_blob"):
        # Use the full card text as both location source and description
        full = card_text()
        description = full[:2000]
        if not location:
            location = full[:300]

    return {
        # Cards without a link fall back to a position-based id.
        "id": href or f"{args['url']}#{index}",
        "title": title,
        "location": location,
        "url": href or args["url"],
        "posted": "",
        "description": description,
    }


def fetch_playwright(args):
    """Generic headless-browser scraper. See COMPANIES entries for selector args.

    Args:
        args: Adapter arguments. ``url`` and ``card`` are required. Optional
            keys: ``cookie_accept``, ``wait_for``, ``scroll_count`` (default 3),
            ``max_cards`` (default 150), ``title_attr``, ``title_sel``,
            ``title_sel_attr``, ``title_strip_prefix``, ``location_sel``,
            ``default_location``, ``link_sel``, ``link_attr`` (default "href"),
            ``url_prefix`` and ``use_inner_text_as_blob``.

    Returns:
        A list of job dicts (``id``, ``title``, ``location``, ``url``,
        ``posted``, ``description``), de-duplicated by id with the first
        occurrence kept. Cards that cannot be read are skipped and counted in
        a warning on stderr.

    Raises:
        RuntimeError: If Playwright is not installed (from _get_browser).
        Exception: Playwright errors from opening the page or locating cards.
    """
    browser = _get_browser()
    ctx = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        locale="en-US",
        viewport={"width": 1366, "height": 768},
    )
    jobs, skipped = [], 0
    # Everything after context creation runs inside the try so the context is
    # closed even when the init script or page creation fails.
    try:
        ctx.add_init_script(STEALTH_JS)
        page = ctx.new_page()
        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")

        # Optional cookie banner acceptance
        _pw_dismiss_cookie_banner(page, args.get("cookie_accept"))

        # Wait for job content to render
        wait_for = args.get("wait_for")
        if wait_for:
            try:
                page.wait_for_selector(wait_for, timeout=15000)
            except Exception:
                # Selector never appeared: give the page a little more time and
                # scrape whatever is there.
                page.wait_for_timeout(4000)

        # Scroll a few times to trigger any lazy-loaded results
        for _ in range(args.get("scroll_count", 3)):
            try:
                page.mouse.wheel(0, 4000)
                page.wait_for_timeout(700)
            except Exception:
                break

        cards = page.locator(args["card"])
        n = min(cards.count(), args.get("max_cards", 150))
        for i in range(n):
            try:
                job = _pw_extract_card(cards.nth(i), i, args)
            except Exception:
                # One unreadable card must not discard the rest of the page.
                skipped += 1
                continue
            if job is not None:
                jobs.append(job)
    finally:
        ctx.close()

    if skipped:
        print(f"warning: skipped {skipped} unreadable card(s) at {args['url']}", file=sys.stderr)

    # Deduplicate within a single company by id
    unique = {}
    for job in jobs:
        unique.setdefault(job["id"], job)
    return list(unique.values())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Maps the adapter name used in each COMPANIES entry to the function that
# fetches that company's jobs; every function takes the entry's args dict and
# returns a list of job dicts.
ADAPTERS = {
    "workday": fetch_workday,
    "ashby": fetch_ashby,
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "playwright": fetch_playwright,
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def location_matches(loc_text):
    """Classify a location string as Swiss and/or remote-eligible.

    Args:
        loc_text: Free-form location text; may be empty or None.

    Returns:
        A tuple ``(in_ch, is_remote)`` of booleans. ``in_ch`` is True when a
        Swiss location keyword occurs in the text. ``is_remote`` is True when
        the text mentions remote work, is not a US-only remote listing, and
        contains at least one EU/global hint.
    """
    if not loc_text:
        return False, False

    lowered = loc_text.lower()

    def mentions(terms):
        return any(term in lowered for term in terms)

    in_ch = mentions(CH_LOCATION_KEYWORDS)
    if not mentions(REMOTE_KEYWORDS):
        return in_ch, False
    # A US-only pattern is overridden when a Swiss location is also named.
    if mentions(US_ONLY_PATTERNS) and not in_ch:
        return in_ch, False
    return in_ch, mentions(EU_HINT_KEYWORDS)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _score_keyword_found(blob, keyword):
    """Return True when ``keyword`` occurs in ``blob`` as a whole word or phrase."""
    words = keyword.lower().split()
    if not words:
        return False
    # Words of a phrase may be separated by any whitespace run (e.g. a newline).
    phrase = r"\s+".join(re.escape(word) for word in words)
    # A letter or digit directly before or after the hit disqualifies it, so
    # "intern" no longer fires on "international" nor "rag" on "leverage".
    # Punctuation and underscores count as separators; a plural "s" is allowed.
    return re.search(rf"(?<![^\W_]){phrase}s?(?![^\W_])", blob) is not None


def score_job(job, positive=None, negative=None):
    """Score a job against the profile keywords.

    Keywords are matched case-insensitively as whole words or phrases (with an
    optional plural "s") in the job's title and description. Surrounding
    whitespace in a keyword is ignored, so a padded entry such as " junior "
    also matches at the very start or end of the text. Compound forms are not
    matched by their stem (for example "cryptocurrency" does not count as
    "crypto") and need their own keyword entry.

    Args:
        job: Job dict; ``title`` and ``description`` are read and may be
            missing or None.
        positive: Optional keyword -> weight mapping; defaults to
            POSITIVE_KEYWORDS.
        negative: Optional keyword -> weight mapping; defaults to
            NEGATIVE_KEYWORDS.

    Returns:
        A tuple ``(score, pos, neg)``: the summed weights of all matched
        keywords, the matched positive keywords and the matched negative
        keywords, each list in table order and spelled as in the table.
    """
    positive = POSITIVE_KEYWORDS if positive is None else positive
    negative = NEGATIVE_KEYWORDS if negative is None else negative
    blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()

    score, pos, neg = 0, [], []
    for table, hits in ((positive, pos), (negative, neg)):
        for kw, w in table.items():
            if _score_keyword_found(blob, kw):
                score += w
                hits.append(kw)
    return score, pos, neg
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_seen():
    """Return the saved seen-jobs state, or an empty dict when no state file exists.

    Raises:
        ValueError: If the state file exists but does not contain valid JSON.
    """
    if not STATE_FILE.exists():
        return {}
    raw = STATE_FILE.read_text(encoding="utf-8")
    return json.loads(raw)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_seen(seen):
    """Persist the seen-jobs state to STATE_FILE as indented UTF-8 JSON.

    The data is written to a temporary file next to the state file and then
    moved into place, so an interrupted write cannot leave a truncated state
    file that the next run would fail to parse.

    Args:
        seen: JSON-serialisable state mapping.

    Raises:
        TypeError: If ``seen`` cannot be serialised to JSON.
        OSError: If the file cannot be written or replaced.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(seen, indent=2, ensure_ascii=False)
    tmp_path = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    tmp_path.write_text(payload, encoding="utf-8")
    # Path.replace overwrites an existing target.
    tmp_path.replace(STATE_FILE)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def write_report(path, results, errors, new_only, include_weak):
    """Write the markdown report for one run.

    Args:
        path: pathlib.Path of the report file to write (UTF-8).
        results: Result dicts, already sorted for display. Each provides
            ``company``, ``title``, ``location``, ``url``, ``score``, ``pos``,
            ``neg``, ``in_ch``, ``remote``, ``is_new`` and optionally
            ``posted``.
        errors: ``(company, message)`` pairs for companies that failed to fetch.
        new_only: Whether the run was restricted to unseen jobs (heading only).
        include_weak: Whether roles scoring below 2 are listed rather than
            only counted.
    """
    # UTC, matching the date main() uses for the report file name, so heading
    # and file name cannot disagree around midnight.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Total matches from automated companies: **{len(results)}** ({n_new} new since last run)\n",
    ]
    if errors:
        lines.append("## Errors\n")
        lines.extend(f"- **{company}**: {err}" for company, err in errors)
        lines.append("")

    # Score buckets: strong >= 6, medium 2-5, weak < 2.
    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak = [r for r in results if r["score"] < 2]

    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")

    buckets = [("Strong fit (score >= 6)", strong),
               ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))

    for bucket_name, bucket in buckets:
        if not bucket:
            continue
        lines.append(f"\n## {bucket_name} - {len(bucket)} role(s)\n")
        for r in bucket:
            new_tag = " [NEW]" if r["is_new"] else ""
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    lines.append("\n## Manual check (companies without scrapable APIs)\n")
    lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
    lines.append("Open each link, scan for new postings since your last quarterly review:\n")
    for name, note, url in MANUAL_CHECK:
        lines.append(f"- [ ] **{name}** — {note}: <{url}>")
    lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Run the scout: fetch, filter, score, write the report, update state.

    Command-line flags (read from sys.argv):
        --only=<id>     fetch a single company by its COMPANIES id
        --new-only      report only jobs that were not seen before
        --include-weak  also list roles scoring below 2

    Unknown flags and an unknown --only id are reported on stderr and do not
    abort the run. A company whose fetch fails is recorded in the report's
    error section and the run continues with the next one.
    """
    only, new_only, include_weak = None, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]
        else:
            print(f"warning: ignoring unknown argument {arg!r}", file=sys.stderr)

    known_ids = [cid for cid, *_ in COMPANIES]
    if only and only not in known_ids:
        print(f"warning: --only={only} matches no company; known ids: {', '.join(known_ids)}",
              file=sys.stderr)

    seen = load_seen()
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    all_results, errors = [], []

    try:
        for cid, display, adapter, args in COMPANIES:
            if only and cid != only:
                continue
            print(f"Fetching {display}...", file=sys.stderr)
            try:
                jobs = ADAPTERS[adapter](args)
            except (urllib.error.URLError, ValueError) as e:
                # HTTPError is a URLError subclass, so it is covered here too.
                errors.append((display, repr(e)))
                continue
            except Exception as e:
                # Top-level boundary: one broken adapter must not stop the run.
                errors.append((display, f"unexpected: {e!r}"))
                continue

            company_seen = seen.setdefault(cid, {})
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                in_ch, is_remote = location_matches(j.get("location", ""))
                if not (in_ch or is_remote):
                    continue
                previous = company_seen.get(jid)
                is_new = previous is None
                score, pos, neg = score_job(j)
                all_results.append({
                    "company": display, "company_id": cid,
                    "title": j["title"], "location": j["location"],
                    "url": j["url"], "posted": j.get("posted", ""),
                    "score": score, "pos": pos, "neg": neg,
                    "in_ch": in_ch, "remote": is_remote, "is_new": is_new,
                })
                # Refresh the title but keep the date of the first sighting;
                # overwriting it on every run would make it a "last seen" date.
                first_seen = previous.get("first_seen", today) if isinstance(previous, dict) else today
                company_seen[jid] = {"title": j["title"], "first_seen": first_seen}
    finally:
        # Always release the shared browser, even if the run aborts.
        _close_browser()

    if new_only:
        all_results = [r for r in all_results if r["is_new"]]

    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak)

    # Persist state only after the report exists: if writing the report fails,
    # the jobs stay unseen and are reported as new on the next run.
    save_seen(seen)

    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new)", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# === Adapter probe results (2026-05-21) =======================================
|
|
|
|
|
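# Tested all 15 target companies with plain-HTTP adapters; 5 of them worked at the time.
# The rest were manual checks then, because automating them needed a real browser
# (Playwright/Selenium). NOTE: COMPANIES and MANUAL_CHECK have changed since — several
# companies in the table below are now covered by the "pcsx", "wp_ajax" and "playwright"
# adapters — so read the table as a record of the plain-HTTP probe, not the current split.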
|
|
|
|
|
#
|
|
|
|
|
# Google careers.google.com 404 on documented API; auth-gated
|
|
|
|
|
# Microsoft gcsservices.careers.ms.com TLS handshake hangs from non-MS clients
|
|
|
|
|
# Apple jobs.apple.com/api/v1 endpoint exists, location filter codes opaque
|
|
|
|
|
# Meta metacareers.com GraphQL with auth token
|
|
|
|
|
# Roche careers.roche.com PhenomPeople/Eightfold, JS-rendered
|
|
|
|
|
# IBM Research research.ibm.com static page, no API
|
|
|
|
|
# Cisco jobs.cisco.com JS-rendered SPA
|
|
|
|
|
# Sonova careers.sonova.com PhenomPeople SaaS, no public JSON
|
|
|
|
|
# Sygnum sygnum.com/careers Cloudflare-protected
|
|
|
|
|
# AMINA aminagroup.com/career static, low volume
|
|
|
|
|
# Bitcoin Suisse bitcoinsuisse.com/careers static, low volume
|
|
|
|
|
# Coinbase coinbase.com/careers Cloudflare-protected
|
|
|
|
|
# ==============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
main()
|