# claude-resume-kit/job_scout/scout.py

"""Job scout for Dennis's quarterly target companies.

Pulls latest openings from companies via public ATS APIs (Workday/Ashby/Greenhouse/
SmartRecruiters/Eightfold/RSS) and, for JS-rendered careers sites, a headless-browser
(playwright) adapter. Filters by Swiss location or remote eligibility, scores fit against
profile keywords, tracks which job IDs we've already seen, writes a markdown report.

Usage:
    py scout.py                 # Pull all configured companies (strong + medium only)
    py scout.py --only=nvidia   # Pull a single company by id
    py scout.py --new-only      # Report only jobs not seen before
    py scout.py --include-weak  # Include weak/noise bucket (default hidden)

State : state/seen_jobs.json
Output: reports/YYYY-MM-DD.md

To add a company: append to COMPANIES with one of the existing adapter types. A few sites
resist scraping even headless and stay in MANUAL_CHECK (surfaced as a report checklist).
See the adapter-coverage notes at the bottom for the current automated/manual split.
"""

import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

# Directory containing this script; state and report paths are derived from it.
ROOT = Path(__file__).parent
# JSON state file read by load_seen() and written by save_seen().
STATE_FILE = ROOT / "state" / "seen_jobs.json"
# Directory for the markdown reports (reports/YYYY-MM-DD.md per the module docstring).
REPORTS_DIR = ROOT / "reports"
# Default User-Agent header used by http_get_json() and fetch_rss().
USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)"

# All lists below are matched by location_matches() as plain substrings of the
# lower-cased location text, so every entry must be lower-case. Substring matching
# means short entries can hit inside longer words (e.g. "uk" is contained in
# "milwaukee", "bern" in "san bernardino").

# Any hit marks a job as located in Switzerland.
CH_LOCATION_KEYWORDS = [
    "switzerland", "zurich", "zürich", "basel", "bern", "geneva", "genf",
    "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
]

# Any hit marks the location text as mentioning remote work.
REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]

# Phrases that mark a remote listing as US-only. location_matches() ignores these
# when the same text also contains a Swiss keyword.
US_ONLY_PATTERNS = [
    "remote - us", "remote, us", "remote-us", "us remote", "us-remote",
    "remote-friendly us", "remote (us)", "united states - remote",
    "remote, united states",
]

# A remote listing only counts as eligible when at least one of these appears in the
# location text. Includes every Swiss keyword.
EU_HINT_KEYWORDS = [
    "germany", "france", "spain", "portugal", "ireland", "netherlands",
    "sweden", "norway", "finland", "denmark", "poland", "czech",
    "romania", "italy", "austria", "belgium", "uk", "united kingdom",
    "europe", "emea", "global", "worldwide",
] + CH_LOCATION_KEYWORDS

# Keyword -> weight tables used by score_job(). Each keyword is tested once as a
# plain substring of the lower-cased text (title, or title + description), and the
# weight of every keyword that occurs is added to the score. Keys must be lower-case.
# Because matching is by substring, short keys also hit inside longer words
# (e.g. "rag" is contained in "storage", "java" in "javascript").
POSITIVE_KEYWORDS = {
    "genai": 3, "generative ai": 3, "llm": 3, "large language model": 3,
    "applied ai": 3, "applied ml": 3, "ai engineer": 3, "ml engineer": 3,
    "mlops": 3, "ai platform": 3, "ml platform": 3,
    "python": 2, "java": 2, "data engineer": 2, "data engineering": 2,
    "solutions architect": 2, "platform engineer": 2,
    "ai infrastructure": 2, "inference": 2, "rag": 2, "agentic": 2,
    "kubernetes": 1, "docker": 1, "etl": 1, "pipeline": 1,
    "crypto": 2, "blockchain": 2, "web3": 2, "solidity": 3,
    "senior": 1, "staff": 1, "lead": 1, "principal": 1,
}

# Negative weights. Note that "intern" is also a substring of "internal" and
# "international", and that " junior " carries literal surrounding spaces, so it only
# matches where the text has a space on both sides of the word.
NEGATIVE_KEYWORDS = {
    "cuda": -3, "kernel driver": -3, "gpu programming": -3,
    "compiler engineer": -3, "pytorch internals": -3, "jax internals": -3,
    "rdma": -2, "infiniband": -2, "nccl": -3, "hpc cluster": -2,
    "frontend": -3, "front-end": -3, "react native": -3,
    "ios engineer": -3, "android engineer": -3, "mobile engineer": -3,
    "ui engineer": -2, "ux engineer": -2,
    "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
    "physical design": -3, "silicon": -2,
    "expert c++": -2, "5+ years c++": -2, "deep c++": -2,
    "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
}

# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
# post mostly non-tech roles). Only keep titles containing one of these specific role
# phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel"
# don't leak in. Matched as case-insensitive substrings against the title only.
# Attached to a company through the "_title_filter" key of its adapter args; main()
# applies it after the fetch and, when it is set, scores that company on title only.
ENG_TITLE_FILTER = [
    "data engineer", "data engineering", "data platform", "platform engineer",
    "data infrastructure", "data architect", "analytics engineer",
    "mlops", "ml engineer", "ml platform", "machine learning engineer",
    "site reliability", "sre", "backend engineer", "back-end engineer",
    "devops engineer", "cloud engineer", "software engineer", "infrastructure engineer",
    "kafka", "streaming", "big data", "quantitative developer", "quant developer",
]

# Each entry is a 4-tuple: (id, display, adapter, adapter_args).
#   id           - key used for --only=<id> and for the per-company seen-jobs state
#   display      - name shown in progress output, errors and the report
#   adapter      - key into ADAPTERS selecting the fetch function
#   adapter_args - dict passed to that fetch function unchanged
# The optional "_title_filter" key in adapter_args is consumed by main(), not by the
# adapters: it prefilters jobs by title and switches scoring to title-only.
COMPANIES = [
    ("nvidia",    "NVIDIA",    "workday",    {
        "host": "nvidia.wd5.myworkdayjobs.com",
        "tenant": "nvidia",
        "site": "NVIDIAExternalCareerSite",
        "search_text": "Switzerland",
    }),
    ("kraken",    "Kraken",    "ashby",      {"slug": "kraken.com"}),
    ("openai",    "OpenAI",    "ashby",      {"slug": "openai"}),
    ("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
    ("novartis",  "Novartis",  "workday", {
        "host": "novartis.wd3.myworkdayjobs.com",
        "tenant": "novartis",
        "site": "Novartis_Careers",
        "search_text": "Switzerland",
    }),
    # PCSX (Eightfold) — Microsoft has a public position search endpoint.
    # fetch_pcsx hard-codes the Microsoft host; "domain" is only sent as a query parameter.
    ("microsoft", "Microsoft", "pcsx", {
        "domain": "microsoft.com",
        "location": "Switzerland",
    }),
    # Sygnum — WordPress AJAX endpoint returns clean JSON
    # NOTE(review): the URL embeds a fixed _wpnonce value; if the site rotates or
    # validates nonces this request will start failing — confirm and refresh as needed.
    ("sygnum",    "Sygnum",    "wp_ajax", {
        "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
    }),
    # --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
    ("confluent", "Confluent", "ashby",      {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
    ("gitlab",    "GitLab",    "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
    ("clickhouse","ClickHouse","greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}),
    ("grafana",   "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
    # --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
    ("metgroup",  "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
    ("vitol",     "Vitol",     "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
    ("ldc",       "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
    # International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
    # Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
    ("bis",       "BIS (Basel)","rss", {
        "url": "https://www.bis.org/doclist/vacancies.rss",
        "default_location": "Basel, Switzerland",
    }),
    # Coinbase Ventures web3 talent network (Getro collection 1625). Aggregates roles
    # across portfolio companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself —
    # see fetch_getro. CH-filtered + eng title-filtered to stay relevant.
    ("coinbase_ventures", "Coinbase Ventures (web3)", "getro", {
        "collection": 1625,
        "locations": ["Switzerland"],
        "job_functions": ["Software Engineering", "IT", "Data Science"],
        "_title_filter": ENG_TITLE_FILTER,
    }),
    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
    # aria-label link; location lives in the card text (captured via blob mode).
    # The selector keys used by these entries are interpreted by fetch_playwright.
    ("google",    "Google",    "playwright", {
        "url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
        "wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
        "card": "li:has(a[aria-label*='Learn more about'])",
        "title_sel": "a[aria-label*='Learn more about']",
        "title_sel_attr": "aria-label",
        "title_strip_prefix": "Learn more about ",
        "link_sel": "a[href*='jobs/results/']",
        "link_attr": "href",
        "url_prefix": "https://www.google.com/about/careers/applications/",
        "default_location": "",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
        "cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
    }),
    ("apple",     "Apple",     "playwright", {
        "url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
        "wait_for": "a[href*='/en-us/details/']",
        "card": "a[href*='/en-us/details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://jobs.apple.com",
        "default_location": "Switzerland",
    }),
    # Meta job links are /profile/job_details/<id>; title + location are in the link text.
    ("meta",      "Meta",      "playwright", {
        "url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
        "wait_for": "a[href*='/profile/job_details/']",
        "card": "a[href*='/profile/job_details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://www.metacareers.com",
        "default_location": "Zurich, Switzerland",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
    # PhenomPeople pattern (Roche) uses li.jobs-list-item.
    # Card inner text is structured like: "<title> | Location | <city, country> | Category | ..."
    # We extract title from first line, full text becomes the "description" so our location
    # filter still sees Switzerland mentions.
    ("roche",     "Roche",     "playwright", {
        "url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
        "wait_for": "li.jobs-list-item, a.au-target",
        "card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
        "title_attr": "text",
        "link_sel": "a[href]",
        "link_attr": "href",
        "url_prefix": "https://careers.roche.com",
        "default_location": "",
        "cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
        "scroll_count": 6,
        "use_inner_text_as_blob": True,
    }),
    # Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
    ("cisco",     "Cisco",     "playwright", {
        "url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
        "wait_for": "a[href*='/job/'], div[role='listitem']",
        "card": "div[role='listitem']:has(a[href*='/job/'])",
        "title_sel": "a[href*='/job/']",
        "link_sel": "a[href*='/job/']",
        "link_attr": "href",
        "url_prefix": "https://careers.cisco.com",
        "default_location": "Switzerland",
        "cookie_accept": ["#onetrust-accept-btn-handler"],
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
]

# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
# These surface as a clickable checklist in the report so they're not forgotten.
# Each entry is a 3-tuple (name, note, url); write_report() renders one checkbox
# line per entry.
MANUAL_CHECK = [
    ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
     "https://bitcoinsuisse.com/careers#open-positions"),
]


def http_get_json(url, headers=None, data=None, method="GET"):
    """Request ``url`` and return the response body parsed as JSON.

    Args:
        url: URL to request.
        headers: Optional extra request headers. The mapping is copied, so the
            caller's dict is never modified. ``User-Agent`` and ``Accept`` are
            filled in when absent.
        data: Optional request body. A dict is serialized to UTF-8 JSON and
            ``Content-Type: application/json`` is defaulted; any other value is
            handed to ``urllib`` unchanged.
        method: HTTP method name.

    Returns:
        The decoded JSON document.

    Raises:
        urllib.error.URLError: On connection or HTTP errors (``HTTPError`` is a
            subclass).
        ValueError: If the body is not valid UTF-8 or not valid JSON.
    """
    # Copy first: the setdefault() calls below must not leak defaults into a
    # dict the caller may reuse for other requests.
    headers = dict(headers or {})
    headers.setdefault("User-Agent", USER_AGENT)
    headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))


def fetch_workday(args):
    """Fetch every posting from a Workday career site, 20 per request.

    Args:
        args: Adapter config with required ``host``, ``tenant`` and ``site``,
            plus optional ``search_text`` (free-text query, default empty).

    Returns:
        A list of job dicts with keys ``id``, ``title``, ``location``, ``url``,
        ``posted`` and ``description``. ``description`` is always empty because
        nothing in the list response is read for it.
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    jobs, offset, total = [], 0, 0
    while True:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {}, "limit": 20, "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath") or ""
            bullets = p.get("bulletFields") or []
            jobs.append({
                # First bullet field when there is one, otherwise the posting path.
                "id": bullets[0] if bullets else ext,
                "title": p.get("title", ""),
                # The path is appended to the location text; presumably so that
                # place names embedded in the path reach the location filter.
                "location": (p.get("locationsText") or "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # Keep the largest total seen so far: a later page that omits the field
        # (or reports 0) must not cut pagination short. An empty page still
        # ends the loop.
        total = max(total, data.get("total") or 0)
        offset += len(postings)
        if not postings or offset >= total:
            break
    return jobs


def fetch_ashby(args):
    """Fetch all postings of an Ashby job board.

    ``args["slug"]`` names the board. The primary location and every secondary
    location are joined with " | " into one location string so the location
    filter sees all of them.
    """
    board_url = (
        "https://api.ashbyhq.com/posting-api/job-board/"
        f"{args['slug']}?includeCompensation=true"
    )

    def to_job(posting):
        # Primary location first, then each secondary one (dict entries carry
        # the name under "location"; anything else is stringified).
        places = [posting.get("location", "") or ""]
        for extra in posting.get("secondaryLocations", []) or []:
            if isinstance(extra, dict):
                places.append(extra.get("location", ""))
            else:
                places.append(str(extra))
        return {
            "id": posting.get("id"),
            "title": posting.get("title", ""),
            "location": " | ".join(places),
            "url": posting.get("jobUrl"),
            "posted": posting.get("publishedAt", ""),
            "description": (posting.get("descriptionPlain") or "")[:2500],
            "department": posting.get("department", ""),
        }

    board = http_get_json(board_url)
    return [to_job(posting) for posting in board.get("jobs", [])]


def fetch_greenhouse(args):
    """Fetch all postings of a Greenhouse job board, including descriptions.

    Args:
        args: Adapter config; ``board`` is the Greenhouse board token.

    Returns:
        A list of job dicts with keys ``id``, ``title``, ``location``, ``url``,
        ``posted`` and ``description`` (plain text, at most 2500 characters).
    """
    import html

    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs", []):
        loc = (j.get("location") or {}).get("name", "")
        offices = j.get("offices") or []
        office_names = " | ".join(o.get("name", "") for o in offices if isinstance(o, dict))
        loc_blob = " ".join(x for x in [loc, office_names] if x)
        # The API delivers the description with its HTML entity-escaped
        # ("&lt;p&gt;..."), so it has to be unescaped before the tag-stripping
        # regex can match anything. Unescaping already-raw HTML is harmless.
        desc = html.unescape(j.get("content") or "")
        desc = re.sub(r"<[^>]+>", " ", desc)
        # Second pass resolves entities that were part of the body text itself
        # (e.g. "&amp;nbsp;" -> "&nbsp;" -> non-breaking space).
        desc = html.unescape(desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            "description": desc[:2500],
        })
    return jobs


def fetch_pcsx(args):
    """Query the Eightfold PCSX position search hosted at apply.careers.microsoft.com.

    ``args["domain"]`` is sent as the ``domain`` query parameter and the optional
    ``args["location"]`` as the URL-quoted ``location`` parameter. Results are
    requested 50 at a time; paging ends on a short page or once the start
    offset reaches 500.
    """
    domain = args["domain"]
    quoted_location = urllib.parse.quote(args.get("location", ""))
    endpoint = "https://apply.careers.microsoft.com/api/pcsx/search"
    referer = f"https://apply.careers.microsoft.com/careers?location={quoted_location}"

    found = []
    start = 0
    while start < 500:
        query = f"{endpoint}?domain={domain}&query=&location={quoted_location}&start={start}&num=50"
        payload = http_get_json(query, headers={"Referer": referer})
        page = (payload.get("data") or {}).get("positions", []) or []
        for position in page:
            job_ref = position.get("displayJobId") or position.get("id")
            found.append({
                "id": str(position.get("id")),
                "title": position.get("name", ""),
                "location": " | ".join(position.get("locations") or []),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{job_ref}",
                "posted": position.get("postedTs", ""),
                "description": (position.get("description") or "")[:2000],
            })
        # A page shorter than the requested 50 (including an empty one) is the last.
        if len(page) < 50:
            break
        start += len(page)
    return found


def fetch_smartrecruiters(args):
    """Page through the SmartRecruiters public postings API for one company.

    ``args["company"]`` is the company identifier used in the API path and in
    the posting URLs. Postings are requested 100 at a time; paging stops on an
    empty page, when the reported ``totalFound`` is reached, or at 300 postings.
    The department and function labels stand in for a description.
    """
    company = args["company"]
    endpoint = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"

    def label_of(value):
        # Only dict-shaped fields carry a label; anything else yields "".
        return value.get("label", "") if isinstance(value, dict) else ""

    collected = []
    offset = 0
    while True:
        page = http_get_json(f"{endpoint}?limit=100&offset={offset}")
        postings = page.get("content", []) or []
        for posting in postings:
            where = posting.get("location") or {}
            bits = [where.get("fullLocation") or where.get("city") or ""]
            for flag, tag in (("remote", "Remote"), ("hybrid", "Hybrid")):
                if where.get(flag):
                    bits.append(tag)
            labels = [label_of(posting.get("department")), label_of(posting.get("function"))]
            collected.append({
                "id": str(posting.get("id")),
                "title": posting.get("name", ""),
                "location": " ".join(bit for bit in bits if bit),
                "url": f"https://jobs.smartrecruiters.com/{company}/{posting.get('id')}",
                "posted": posting.get("releasedDate", ""),
                "description": " ".join(label for label in labels if label),
            })
        reported_total = page.get("totalFound", 0)
        offset += len(postings)
        if not postings:
            break
        if offset >= reported_total or offset >= 300:
            break
    return collected


def fetch_rss(args):
    """Parse an RSS 1.0 (RDF) or RSS 2.0 feed into job dicts.

    Items are looked up in the http://purl.org/rss/1.0/ namespace first and as
    plain ``<item>`` elements when none are found there. Each field is read the
    same way: namespaced child first, un-namespaced child as fallback. The feed
    carries no location, so every job gets ``args["default_location"]``
    (empty string when not configured).
    """
    import xml.etree.ElementTree as ET

    request = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(request, timeout=30) as resp:
        feed = ET.fromstring(resp.read())
    ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"}

    def text_of(item, tag):
        # Compare against None explicitly: an Element without children is falsy.
        node = item.find(f"rss1:{tag}", ns)
        if node is None:
            node = item.find(tag)
        if node is None or not node.text:
            return ""
        return node.text.strip()

    entries = feed.findall(".//rss1:item", ns)
    if not entries:
        entries = feed.findall(".//item")

    jobs = []
    for item in entries:
        link = text_of(item, "link")
        title = text_of(item, "title")
        published = item.findtext("dc:date", default="", namespaces=ns) or text_of(item, "date")
        summary = re.sub(r"<[^>]+>", " ", text_of(item, "description"))
        jobs.append({
            "id": link or title,
            "title": title,
            "location": args.get("default_location", ""),
            "url": link,
            "posted": published,
            "description": summary[:1500],
        })
    return jobs


def fetch_wp_ajax(args):
    """Read jobs from a WordPress admin-ajax endpoint that answers with a JSON list.

    ``args["url"]`` is the full endpoint URL. Any response that is not a list
    yields no jobs. The entries carry no id, so "title|location" (truncated to
    120 characters) is used instead, and the endpoint URL stands in when an
    entry has no ``application_url``.
    """
    endpoint = args["url"]
    payload = http_get_json(endpoint)
    if not isinstance(payload, list):
        return []

    def joined(*values):
        # Space-join the non-empty values.
        return " ".join(filter(None, values))

    return [
        {
            "id": (entry.get("title", "") + "|" + entry.get("location", ""))[:120],
            "title": entry.get("title", ""),
            "location": joined(entry.get("location", ""), entry.get("work_type", "")),
            "url": entry.get("application_url") or endpoint,
            "posted": "",
            "description": joined(entry.get("department", ""), entry.get("role_type", "")),
        }
        for entry in payload
    ]


def fetch_getro(args):
    """Search a Getro network job board (POST JSON), up to 10 pages of 100 hits.

    Getro powers VC portfolio talent networks — here the Coinbase Ventures web3
    network (collection 1625). The board lists roles across ALL portfolio
    companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself, so the
    organization name is folded into each title ("<title> @ <org>").

    ``args["collection"]`` selects the board; optional ``locations`` and
    ``job_functions`` are sent as the ``searchable_locations`` and
    ``job_functions`` filters.
    """
    endpoint = f"https://api.getro.com/api/v2/collections/{args['collection']}/search/jobs"

    filters = {}
    for arg_key, filter_key in (("locations", "searchable_locations"),
                                ("job_functions", "job_functions")):
        if args.get(arg_key):
            filters[filter_key] = args[arg_key]

    def to_job(hit):
        org = (hit.get("organization") or {}).get("name", "")
        places = hit.get("searchable_locations") or hit.get("locations") or []
        created = hit.get("created_at")
        # Numeric created_at values are formatted as a UTC date; anything else is dropped.
        if isinstance(created, (int, float)):
            posted = datetime.fromtimestamp(created, tz=timezone.utc).strftime("%Y-%m-%d")
        else:
            posted = ""
        title = hit.get("title", "")
        return {
            "id": str(hit.get("id")),
            "title": f"{title} @ {org}" if org else title,
            "location": " | ".join(places) if isinstance(places, list) else str(places),
            "url": hit.get("url", ""),
            "posted": posted,
            "description": " ".join(filter(None, [org] + (hit.get("skills") or []))),
        }

    jobs = []
    for page in range(10):
        data = http_get_json(endpoint, method="POST", data={
            "hitsPerPage": 100, "page": page, "query": "", "filters": filters,
        })
        results = data.get("results", {}) or {}
        batch = results.get("jobs", []) or []
        for hit in batch:
            jobs.append(to_job(hit))
        # Stop on an empty page or once the reported hit count has been collected.
        if not batch or len(jobs) >= results.get("count", 0):
            break
    return jobs


# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# fetch_playwright() registers it on every browser context via add_init_script().
# It overrides navigator.webdriver, window.chrome, navigator.plugins,
# navigator.languages and the notifications branch of navigator.permissions.query.
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _q = navigator.permissions && navigator.permissions.query;
if (_q) {
  navigator.permissions.query = (p) => p && p.name === 'notifications'
    ? Promise.resolve({state: Notification.permission}) : _q(p);
}
"""

# Process-wide cache for the Playwright driver ("pw") and the launched browser
# ("browser"); filled by _get_browser() and read by _close_browser().
_playwright_singleton = {"pw": None, "browser": None}

def _get_browser():
    """Return the shared headless Chromium browser, launching it on first use.

    The Playwright driver and the browser are cached in ``_playwright_singleton``
    so later calls reuse them (saves ~3s per company).

    Raises:
        RuntimeError: If the ``playwright`` package cannot be imported.
        Exception: Whatever the driver start or browser launch raises; on a
            failed launch the freshly started driver is stopped first.
    """
    if _playwright_singleton["browser"] is not None:
        return _playwright_singleton["browser"]
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
    pw = sync_playwright().start()
    try:
        browser = pw.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
    except Exception:
        # The driver is only cached after a successful launch, so on failure
        # nothing else holds a reference to it: stop it here or it is leaked
        # (and every retry would start yet another one).
        try:
            pw.stop()
        except Exception:
            pass  # best effort; the launch error below is the one that matters
        raise
    _playwright_singleton["pw"] = pw
    _playwright_singleton["browser"] = browser
    return browser


def _absolutize(href, prefix):
    """Join a possibly-relative href with the configured prefix.

    Args:
        href: Link value scraped from the page; may be empty or ``None``.
        prefix: Site prefix to prepend (the ``url_prefix`` adapter arg); may be empty.

    Returns:
        ``href`` unchanged when it is empty, already an http(s) URL, or no
        prefix is configured. A protocol-relative href (``//host/path``) gets
        the prefix's scheme (``https`` if the prefix has none). Otherwise the
        prefix and the href joined by exactly one slash, after dropping leading
        ``/``, ``./`` and ``../`` segments from the href.
    """
    if not href or not prefix or href.startswith(("http://", "https://")):
        return href
    if href.startswith("//"):
        # Already names its own host; only the scheme is missing.
        scheme = urllib.parse.urlsplit(prefix).scheme or "https"
        return f"{scheme}:{href}"
    # Strip whole leading "/", "./" and "../" segments. str.lstrip("./") would
    # strip a character *set* and also eat the dot of names like ".well-known".
    cleaned = re.sub(r"^(?:\.{1,2}/|/)+", "", href)
    return prefix.rstrip("/") + "/" + cleaned


def _close_browser():
    """Close the shared browser and stop the Playwright driver, if they exist.

    Both steps are best-effort: errors during shutdown are ignored. The cached
    handles are cleared, so a later ``_get_browser()`` launches a fresh browser
    instead of returning one that is already closed.
    """
    browser = _playwright_singleton["browser"]
    pw = _playwright_singleton["pw"]
    # Clear the cache before closing so the handles are forgotten even if
    # close()/stop() misbehave.
    _playwright_singleton["browser"] = None
    _playwright_singleton["pw"] = None
    if browser:
        try:
            browser.close()
        except Exception:
            pass  # shutdown is best-effort
    if pw:
        try:
            pw.stop()
        except Exception:
            pass  # shutdown is best-effort


def fetch_playwright(args):
    """Generic headless-browser scraper driven entirely by selector args.

    Opens ``args["url"]`` in a fresh context of the shared browser, optionally
    dismisses a cookie banner, waits for the job list, scrolls, then turns each
    element matching ``args["card"]`` into a job dict.

    Recognized ``args`` keys (all optional except ``url`` and ``card``):
        url: Page to load.
        cookie_accept: Selectors tried in order; each visible match is clicked.
        wait_for: Selector awaited for up to 15s; on failure a fixed 4s pause
            is used instead.
        scroll_count: Number of mouse-wheel scrolls (default 3).
        card: Selector matching one element per job.
        max_cards: Upper bound on processed cards (default 150).
        title_attr: ``"text"`` takes the first line of the card's inner text;
            any other value names an attribute read from the card itself.
        title_sel / title_sel_attr: Sub-element holding the title, read via the
            named attribute or, without one, its inner text. Only consulted
            when ``title_attr`` is unset.
        title_strip_prefix: Prefix removed from the title when present.
        location_sel: Sub-element whose inner text replaces the default location.
        default_location: Location used when nothing else supplies one.
        link_sel / link_attr: Sub-element (default: the card) and attribute
            (default ``href``) holding the job link.
        url_prefix: Prefix passed to ``_absolutize`` for relative links.
        use_inner_text_as_blob: Use the card's full inner text as description
            (first 2000 chars) and, if the location is empty, as location
            (first 300 chars).

    Returns:
        Job dicts with keys ``id``, ``title``, ``location``, ``url``, ``posted``
        (always empty) and ``description``, de-duplicated by ``id``. A card that
        raises while being read is skipped.
    """
    browser = _get_browser()
    ctx = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        locale="en-US",
        viewport={"width": 1366, "height": 768},
    )
    ctx.add_init_script(STEALTH_JS)
    page = ctx.new_page()
    jobs = []
    try:
        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")
        # Optional cookie banner acceptance
        for sel in args.get("cookie_accept", []) or []:
            try:
                btn = page.locator(sel).first
                if btn.is_visible(timeout=2000):
                    btn.click()
                    page.wait_for_timeout(500)
            except Exception:
                # A missing or unclickable banner must not abort the scrape.
                pass
        # Wait for job content to render
        wait_for = args.get("wait_for")
        if wait_for:
            try:
                page.wait_for_selector(wait_for, timeout=15000)
            except Exception:
                # Selector never showed up: pause briefly and try the cards anyway.
                page.wait_for_timeout(4000)
        # Scroll a few times to trigger any lazy-loaded results
        for _ in range(args.get("scroll_count", 3)):
            try:
                page.mouse.wheel(0, 4000)
                page.wait_for_timeout(700)
            except Exception:
                break

        cards = page.locator(args["card"])
        n = min(cards.count(), args.get("max_cards", 150))
        for i in range(n):
            card = cards.nth(i)
            try:
                # Title: card text / card attribute / sub-element, in that priority.
                title = ""
                if args.get("title_attr") == "text":
                    title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
                elif args.get("title_attr"):
                    title = (card.get_attribute(args["title_attr"]) or "").strip()
                elif args.get("title_sel"):
                    t = card.locator(args["title_sel"]).first
                    if t.count():
                        # Read either an attribute (e.g. aria-label) or the inner text
                        if args.get("title_sel_attr"):
                            title = (t.get_attribute(args["title_sel_attr"]) or "").strip()
                        else:
                            title = (t.inner_text() or "").strip()
                if args.get("title_strip_prefix") and title.startswith(args["title_strip_prefix"]):
                    title = title[len(args["title_strip_prefix"]):].strip()
                # Last resort: first line of the card text.
                if not title:
                    title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]

                location = args.get("default_location", "")
                if args.get("location_sel"):
                    lsel = card.locator(args["location_sel"]).first
                    if lsel.count():
                        location = (lsel.inner_text() or location).strip()

                # Without link_sel the card element itself is expected to carry the link.
                link_el = card if not args.get("link_sel") else card.locator(args["link_sel"]).first
                href = (link_el.get_attribute(args.get("link_attr", "href")) or "") if link_el.count() else ""
                href = _absolutize(href, args.get("url_prefix", ""))

                if not title:
                    continue
                description = ""
                if args.get("use_inner_text_as_blob"):
                    # Use the full card text as both location source and description
                    full = (card.inner_text() or "")
                    description = full[:2000]
                    if not location:
                        location = full[:300]
                jobs.append({
                    # Cards without a link get a positional id based on the page URL.
                    "id": href or f"{args['url']}#{i}",
                    "title": title,
                    "location": location,
                    "url": href or args["url"],
                    "posted": "",
                    "description": description,
                })
            except Exception:
                # One unreadable card should not lose the rest of the page.
                continue
    finally:
        ctx.close()

    # Deduplicate within a single company by id
    seen, deduped = set(), []
    for j in jobs:
        if j["id"] in seen:
            continue
        seen.add(j["id"])
        deduped.append(j)
    return deduped


# Adapter name (third field of each COMPANIES entry) -> fetch function. main() looks
# the adapter up here and calls it with the entry's adapter_args dict; every fetch
# function returns a list of job dicts.
ADAPTERS = {
    "workday": fetch_workday,
    "ashby": fetch_ashby,
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "smartrecruiters": fetch_smartrecruiters,
    "rss": fetch_rss,
    "getro": fetch_getro,
    "playwright": fetch_playwright,
}


def location_matches(loc_text):
    """Classify a location string as Swiss and/or remote-eligible.

    Returns a pair ``(in_ch, is_remote)``. ``in_ch`` is true when any Swiss
    keyword occurs in the lower-cased text. ``is_remote`` is true only when the
    text mentions remote work, is not a US-only remote listing (a US-only
    phrase without any Swiss keyword), and carries at least one EU/global hint.
    Empty input yields ``(False, False)``.
    """
    if not loc_text:
        return False, False
    text = loc_text.lower()

    def mentions(needles):
        # Plain substring test against the lower-cased location text.
        for needle in needles:
            if needle in text:
                return True
        return False

    swiss = mentions(CH_LOCATION_KEYWORDS)
    if not mentions(REMOTE_KEYWORDS):
        return swiss, False
    # A Swiss keyword overrides a US-only phrase in the same text.
    if not swiss and mentions(US_ONLY_PATTERNS):
        return swiss, False
    return swiss, mentions(EU_HINT_KEYWORDS)


def score_job(job, title_only=False):
    """Score a job against the positive and negative keyword tables.

    Args:
        job: Job dict; ``title`` and ``description`` are read (missing or
            ``None`` values count as empty).
        title_only: Score the title alone. Used for title-filtered high-volume
            boards — the title filter already gated relevance, and scoring the
            full JD body over-inflates (every "python"/"data" mention adds
            points), flooding the medium bucket.

    Returns:
        ``(score, pos, neg)``: the summed weights and the lists of positive and
        negative keywords that matched, in table order.
    """
    text = job.get("title") or ""
    if not title_only:
        text = text + " " + (job.get("description") or "")
    # Pad with spaces so keywords that use surrounding spaces as a word
    # boundary (" junior ") also match at the very start or end of the text,
    # e.g. the title "Junior Data Engineer". Keywords without edge spaces are
    # unaffected by the padding.
    blob = f" {text.lower()} "
    score, pos, neg = 0, [], []
    for kw, w in POSITIVE_KEYWORDS.items():
        if kw in blob:
            score += w
            pos.append(kw)
    for kw, w in NEGATIVE_KEYWORDS.items():
        if kw in blob:
            score += w
            neg.append(kw)
    return score, pos, neg


def load_seen():
    """Return the seen-jobs state parsed from STATE_FILE, or {} if the file is absent."""
    if not STATE_FILE.exists():
        return {}
    raw = STATE_FILE.read_text(encoding="utf-8")
    return json.loads(raw)


def save_seen(seen):
    """Write the seen-jobs state to STATE_FILE as indented UTF-8 JSON.

    The state directory is created when missing. Non-ASCII characters are
    written as-is rather than escaped.
    """
    folder = STATE_FILE.parent
    folder.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(seen, indent=2, ensure_ascii=False)
    STATE_FILE.write_text(payload, encoding="utf-8")


def write_report(path, results, errors, new_only, include_weak):
    """Render the markdown report and write it to ``path`` (UTF-8).

    Args:
        path: ``pathlib.Path`` of the report file; its parent directory is
            created when missing.
        results: Matched jobs. Each dict provides ``company``, ``title``,
            ``score``, ``is_new``, ``in_ch``, ``remote``, ``location``, ``url``,
            ``pos``, ``neg`` and optionally ``posted``.
        errors: ``(company, message)`` pairs for companies whose fetch failed.
        new_only: Only affects the heading (adds " (new only)"); the caller is
            expected to have filtered ``results`` already.
        include_weak: Also list the weak bucket (score < 2) instead of only
            reporting how many roles were hidden.

    Buckets: strong (score >= 6), medium (2 <= score < 6), weak (score < 2).
    The report always ends with the MANUAL_CHECK checklist.
    """
    # UTC, matching the run date main() computes, so the heading cannot
    # disagree with it around midnight.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Total matches from automated companies: **{len(results)}** ({n_new} new since last run)\n",
    ]
    if errors:
        lines.append("## Errors\n")
        for company, err in errors:
            lines.append(f"- **{company}**: {err}")
        lines.append("")

    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak   = [r for r in results if r["score"] < 2]

    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")

    buckets = [("Strong fit (score >= 6)", strong),
               ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))

    for bucket_name, bucket in buckets:
        if not bucket:
            continue
        lines.append(f"\n## {bucket_name} - {len(bucket)} role(s)\n")
        for r in bucket:
            new_tag = " [NEW]" if r["is_new"] else ""
            # Swiss location wins over remote when both apply.
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    lines.append("\n## Manual check (companies without scrapable APIs)\n")
    lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
    lines.append("Open each link, scan for new postings since your last quarterly review:\n")
    for name, note, url in MANUAL_CHECK:
        lines.append(f"- [ ] **{name}** — {note}: <{url}>")
    lines.append("")

    # Same guarantee save_seen() gives for the state directory.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")


def main():
    """Run one scouting pass: fetch, filter, score, persist state, write the report.

    Command-line flags (read from ``sys.argv``):
        --new-only      Keep only jobs whose id was not in the seen-state before this run.
        --include-weak  Forwarded to ``write_report`` so the weak bucket is shown.
        --only=<id>     Restrict the run to the company whose id equals <id>.

    Adapter failures are collected per company and passed to ``write_report`` instead
    of aborting the whole run. The seen-state is saved after all companies have been
    processed, and the report is written to ``REPORTS_DIR / "<UTC date>.md"``.
    """
    only, new_only, include_weak = None, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]
        else:
            # A typo such as "--newonly" would otherwise silently run the full pull.
            print(f"Warning: ignoring unrecognized argument {arg!r}", file=sys.stderr)

    seen = load_seen()
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    all_results, errors = [], []
    matched_only = False

    # try/finally so the headless browser is released even if a fetch is interrupted
    # (Ctrl-C) or saving the state fails.
    try:
        for cid, display, adapter, args in COMPANIES:
            if only and cid != only:
                continue
            matched_only = True
            print(f"Fetching {display}...", file=sys.stderr)
            try:
                jobs = ADAPTERS[adapter](args)
            except (urllib.error.URLError, urllib.error.HTTPError, ValueError) as e:
                errors.append((display, repr(e)))
                continue
            except Exception as e:
                # Top-level boundary: one broken adapter must not sink the other companies.
                errors.append((display, f"unexpected: {e!r}"))
                continue

            # Optional per-company title prefilter for high-volume boards
            title_filter = args.get("_title_filter")
            if title_filter:
                jobs = [j for j in jobs
                        if any(k in (j.get("title") or "").lower() for k in title_filter)]

            company_seen = seen.setdefault(cid, {})
            title_seen = set()
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                in_ch, is_remote = location_matches(j.get("location", ""))
                if not (in_ch or is_remote):
                    continue
                # Collapse the same role posted once per remote country (title differs only
                # by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
                norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
                if norm_title in title_seen:
                    continue
                title_seen.add(norm_title)
                is_new = jid not in company_seen
                score, pos, neg = score_job(j, title_only=bool(title_filter))
                all_results.append({
                    "company": display, "company_id": cid,
                    "title": j["title"], "location": j["location"],
                    "url": j["url"], "posted": j.get("posted", ""),
                    "score": score, "pos": pos, "neg": neg,
                    "in_ch": in_ch, "remote": is_remote, "is_new": is_new,
                })
                # Record the entry only the first time the id is observed; re-stamping
                # it on every run would overwrite the original "first_seen" date.
                if is_new:
                    company_seen[jid] = {"title": j["title"], "first_seen": today}

        save_seen(seen)
    finally:
        _close_browser()

    if only and not matched_only:
        # Nothing was fetched; the report below will be empty for this reason.
        print(f"Warning: --only={only} matched no configured company id", file=sys.stderr)

    if new_only:
        all_results = [r for r in all_results if r["is_new"]]

    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak)

    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new)", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)


# === Adapter coverage (refreshed 2026-05-24) ==================================
# 21 companies automated across 9 adapter types; 1 remains in MANUAL_CHECK.
#
# Automated (COMPANIES above):
#   workday        nvidia, novartis
#   ashby          kraken, openai, confluent
#   greenhouse     anthropic, gitlab, clickhouse, grafana
#   pcsx           microsoft                    (Eightfold position-search endpoint)
#   wp_ajax        sygnum                        (WordPress admin-ajax JSON)
#   smartrecruiters metgroup, vitol, ldc
#   rss            bis                           (vacancies.rss — RSS 1.0/RDF)
#   getro          coinbase_ventures             (web3 portfolio network, collection 1625)
#   playwright     google, apple, meta, roche, cisco  (headless browser, 3-15s each)
#
# Since the 2026-05-21 probe, seven originally-manual sites moved to automated:
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the
# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and
# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
#
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
# itself. Coinbase-the-employer was dropped from MANUAL_CHECK (mass layoffs / hiring
# freeze as of 2026-05; re-add coinbase.com/careers if they reopen).
#
# Still manual (MANUAL_CHECK above) — to automate, it needs a real-browser probe:
#   Bitcoin Suisse bitcoinsuisse.com/careers  JS widget, empty at scrape time, low volume
# (AMINA Bank was dropped — poor Glassdoor rating, not worth tracking.)
# ==============================================================================


# Run the scout only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()