"""Job scout for Dennis's quarterly target companies.

Pulls latest openings from companies via public ATS APIs (Workday/Ashby/Greenhouse/
SmartRecruiters/Eightfold/RSS) and, for JS-rendered careers sites, a headless-browser
(playwright) adapter. Filters by Swiss location or remote eligibility, scores fit against
profile keywords, tracks which job IDs we've already seen, writes a markdown report.

Usage:
    py scout.py                 # Pull all configured companies (strong + medium only)
    py scout.py --only=nvidia   # Pull a single company by id
    py scout.py --new-only      # Report only jobs not seen before
    py scout.py --include-weak  # Include weak/noise bucket (default hidden)

State : state/seen_jobs.json
Output: reports/YYYY-MM-DD.md

To add a company: append to COMPANIES with one of the existing adapter types. A few sites
resist scraping even headless and stay in MANUAL_CHECK (surfaced as a report checklist).
See the adapter-coverage notes at the bottom for the current automated/manual split.
"""

import json
import re
import sys
import urllib.error
import urllib.parse
import urllib.request
from datetime import datetime, timezone
from pathlib import Path

# Directory containing this script; the state and report paths are resolved relative to it.
ROOT = Path(__file__).parent
# JSON state file read by load_seen() and written by save_seen().
STATE_FILE = ROOT / "state" / "seen_jobs.json"
# Directory for the dated markdown reports (reports/YYYY-MM-DD.md per the module docstring).
REPORTS_DIR = ROOT / "reports"
# User-Agent header sent with the urllib-based HTTP requests in this file.
USER_AGENT = "Mozilla/5.0 (compatible; job-scout/0.1)"

# Lowercase place names that mark a posting as Switzerland-based. location_matches()
# tests each one as a plain substring of the lowercased location text, so a short entry
# such as "bern" or "zug" also matches inside a longer word.
CH_LOCATION_KEYWORDS = [
    "switzerland", "zurich", "zürich", "basel", "bern", "geneva", "genf",
    "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
]

# Substrings that flag a posting as remote-capable (still subject to the US-only and
# EU-hint checks in location_matches()).
REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]

# Phrases identifying remote roles restricted to the United States. location_matches()
# ignores them when a Swiss keyword is also present.
US_ONLY_PATTERNS = [
    "remote - us", "remote, us", "remote-us", "us remote", "us-remote",
    "remote-friendly us", "remote (us)", "united states - remote",
    "remote, united states",
]

# Regions/countries (plus the Swiss keywords) of which at least one must appear for a
# remote posting to count as remote-eligible. Substring matching applies here too, so
# "uk" also matches inside words such as "Milwaukee".
EU_HINT_KEYWORDS = [
    "germany", "france", "spain", "portugal", "ireland", "netherlands",
    "sweden", "norway", "finland", "denmark", "poland", "czech",
    "romania", "italy", "austria", "belgium", "uk", "united kingdom",
    "europe", "emea", "global", "worldwide",
] + CH_LOCATION_KEYWORDS

# Keyword -> weight tables consumed by score_job(): every keyword found in the lowercased
# title (plus the description, unless title_only is set) contributes its weight once.
# Positive weights: core AI/ML role terms score 3, adjacent stack and crypto terms 2
# (solidity 3), tooling and seniority terms 1.
POSITIVE_KEYWORDS = {
    "genai": 3, "generative ai": 3, "llm": 3, "large language model": 3,
    "applied ai": 3, "applied ml": 3, "ai engineer": 3, "ml engineer": 3,
    "mlops": 3, "ai platform": 3, "ml platform": 3,
    "python": 2, "java": 2, "data engineer": 2, "data engineering": 2,
    "solutions architect": 2, "platform engineer": 2,
    "ai infrastructure": 2, "inference": 2, "rag": 2, "agentic": 2,
    "kubernetes": 1, "docker": 1, "etl": 1, "pipeline": 1,
    "crypto": 2, "blockchain": 2, "web3": 2, "solidity": 3,
    "senior": 1, "staff": 1, "lead": 1, "principal": 1,
}

# Negative weights: low-level GPU/compiler/HPC work, frontend and mobile roles, hardware
# design, C++-heavy roles, and entry-level postings (internships weigh the most at -5).
NEGATIVE_KEYWORDS = {
    "cuda": -3, "kernel driver": -3, "gpu programming": -3,
    "compiler engineer": -3, "pytorch internals": -3, "jax internals": -3,
    "rdma": -2, "infiniband": -2, "nccl": -3, "hpc cluster": -2,
    "frontend": -3, "front-end": -3, "react native": -3,
    "ios engineer": -3, "android engineer": -3, "mobile engineer": -3,
    "ui engineer": -2, "ux engineer": -2,
    "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
    "physical design": -3, "silicon": -2,
    "expert c++": -2, "5+ years c++": -2, "deep c++": -2,
    "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
}

# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
# post mostly non-tech roles). Only keep titles containing one of these specific role
# phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel"
# don't leak in. Matched as case-insensitive substrings against the title only.
# Attached to individual COMPANIES entries through the "_title_filter" adapter arg.
# NOTE(review): because matching is by substring, the short entry "sre" would also hit
# titles that merely contain those letters — confirm against the filtering code.
ENG_TITLE_FILTER = [
    "data engineer", "data engineering", "data platform", "platform engineer",
    "data infrastructure", "data architect", "analytics engineer",
    "mlops", "ml engineer", "ml platform", "machine learning engineer",
    "site reliability", "sre", "backend engineer", "back-end engineer",
    "devops engineer", "cloud engineer", "software engineer", "infrastructure engineer",
    "kafka", "streaming", "big data", "quantitative developer", "quant developer",
]

# Target companies, one tuple each: (id, display, adapter, adapter_args).
#   id           - short key; per the usage notes, `--only=<id>` selects one company by it.
#   display      - company name as shown to the reader.
#   adapter      - a key of ADAPTERS, i.e. which fetch_* function handles the company.
#   adapter_args - dict of arguments read by that fetch function.
# Several entries also carry "_title_filter": ENG_TITLE_FILTER. None of the fetch_*
# functions read that key, so it is presumably applied by the caller — TODO confirm.
COMPANIES = [
    ("nvidia",    "NVIDIA",    "workday",    {
        "host": "nvidia.wd5.myworkdayjobs.com",
        "tenant": "nvidia",
        "site": "NVIDIAExternalCareerSite",
        "search_text": "Switzerland",
    }),
    ("kraken",    "Kraken",    "ashby",      {"slug": "kraken.com"}),
    ("openai",    "OpenAI",    "ashby",      {"slug": "openai"}),
    ("anthropic", "Anthropic", "greenhouse", {"board": "anthropic"}),
    ("novartis",  "Novartis",  "workday", {
        "host": "novartis.wd3.myworkdayjobs.com",
        "tenant": "novartis",
        "site": "Novartis_Careers",
        "search_text": "Switzerland",
    }),
    # PCSX (Eightfold) — Microsoft has a public position search endpoint
    ("microsoft", "Microsoft", "pcsx", {
        "domain": "microsoft.com",
        "location": "Switzerland",
    }),
    # Sygnum — WordPress AJAX endpoint returns clean JSON
    # NOTE(review): the _wpnonce value is hard-coded; WordPress nonces are normally
    # time-limited, so confirm this endpoint keeps answering once the nonce is stale.
    ("sygnum",    "Sygnum",    "wp_ajax", {
        "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
    }),
    # --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
    ("confluent", "Confluent", "ashby",      {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
    ("gitlab",    "GitLab",    "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
    ("clickhouse","ClickHouse","greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}),
    ("grafana",   "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
    # --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
    ("metgroup",  "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
    ("vitol",     "Vitol",     "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
    ("ldc",       "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
    # International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
    # Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
    ("bis",       "BIS (Basel)","rss", {
        "url": "https://www.bis.org/doclist/vacancies.rss",
        "default_location": "Basel, Switzerland",
    }),
    # Coinbase Ventures web3 talent network (Getro collection 1625). Aggregates roles
    # across portfolio companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself —
    # see fetch_getro. CH-filtered + eng title-filtered to stay relevant.
    ("coinbase_ventures", "Coinbase Ventures (web3)", "getro", {
        "collection": 1625,
        "locations": ["Switzerland"],
        "job_functions": ["Software Engineering", "IT", "Data Science"],
        "_title_filter": ENG_TITLE_FILTER,
    }),
    # Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
    # firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
    ("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Selector args (see fetch_playwright): "card" selects one element per job;
    # "title_attr": "text" takes the first line of the card text as the title, otherwise
    # "title_sel" (+ optional "title_sel_attr") picks it; "link_sel"/"link_attr" locate
    # the href, which is joined onto "url_prefix"; "use_inner_text_as_blob" copies the
    # card text into the description (and into the location when that is empty).
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
    # aria-label link; location lives in the card text (captured via blob mode).
    ("google",    "Google",    "playwright", {
        "url": "https://www.google.com/about/careers/applications/jobs/results/?location=Switzerland",
        "wait_for": "a[href*='jobs/results/'][aria-label*='Learn more']",
        "card": "li:has(a[aria-label*='Learn more about'])",
        "title_sel": "a[aria-label*='Learn more about']",
        "title_sel_attr": "aria-label",
        "title_strip_prefix": "Learn more about ",
        "link_sel": "a[href*='jobs/results/']",
        "link_attr": "href",
        "url_prefix": "https://www.google.com/about/careers/applications/",
        "default_location": "",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
        "cookie_accept": ["button:has-text('Accept all')", "button:has-text('Reject all')"],
    }),
    ("apple",     "Apple",     "playwright", {
        "url": "https://jobs.apple.com/en-us/search?location=switzerland-CHE",
        "wait_for": "a[href*='/en-us/details/']",
        "card": "a[href*='/en-us/details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://jobs.apple.com",
        "default_location": "Switzerland",
    }),
    # Meta job links are /profile/job_details/<id>; title + location are in the link text.
    ("meta",      "Meta",      "playwright", {
        "url": "https://www.metacareers.com/jobs?offices[0]=Zurich%2C%20Switzerland",
        "wait_for": "a[href*='/profile/job_details/']",
        "card": "a[href*='/profile/job_details/']",
        "title_attr": "text",
        "link_attr": "href",
        "url_prefix": "https://www.metacareers.com",
        "default_location": "Zurich, Switzerland",
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
    # PhenomPeople pattern (Roche) uses li.jobs-list-item.
    # Card inner text is structured like: "<title> | Location | <city, country> | Category | ..."
    # We extract title from first line, full text becomes the "description" so our location
    # filter still sees Switzerland mentions.
    ("roche",     "Roche",     "playwright", {
        "url": "https://careers.roche.com/global/en/search-results?keywords=&locationsearch=Switzerland",
        "wait_for": "li.jobs-list-item, a.au-target",
        "card": "li.jobs-list-item:not(:has-text('Saved jobs'))",
        "title_attr": "text",
        "link_sel": "a[href]",
        "link_attr": "href",
        "url_prefix": "https://careers.roche.com",
        "default_location": "",
        "cookie_accept": ["#onetrust-accept-btn-handler", "button:has-text('Accept All Cookies')"],
        "scroll_count": 6,
        "use_inner_text_as_blob": True,
    }),
    # Cisco (PhenomPeople, new careers.cisco.com domain). Keyword search surfaces CH roles.
    ("cisco",     "Cisco",     "playwright", {
        "url": "https://careers.cisco.com/global/en/search-results?keywords=Switzerland",
        "wait_for": "a[href*='/job/'], div[role='listitem']",
        "card": "div[role='listitem']:has(a[href*='/job/'])",
        "title_sel": "a[href*='/job/']",
        "link_sel": "a[href*='/job/']",
        "link_attr": "href",
        "url_prefix": "https://careers.cisco.com",
        "default_location": "Switzerland",
        "cookie_accept": ["#onetrust-accept-btn-handler"],
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
]

# Companies where adapter probing did not yield a reliable scrape, so they have to be
# checked by hand. write_report() prints how many there are and, when the list is
# non-empty, adds a "Manual check" section so they're not forgotten.
# (Empty — all current target companies are automated.)
MANUAL_CHECK = []


def http_get_json(url, headers=None, data=None, method="GET"):
    """Send an HTTP request and return the response body parsed as JSON.

    Args:
        url: Absolute URL to request.
        headers: Optional mapping of extra request headers. It is copied, so the
            caller's dict is never modified. ``User-Agent`` and ``Accept`` are filled
            in only when the caller has not set them.
        data: Optional request body. A ``dict`` is serialised to UTF-8 JSON and
            ``Content-Type: application/json`` is defaulted; any other value is
            handed to urllib unchanged.
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.

    Returns:
        The decoded JSON document (dict, list, ...).

    Raises:
        urllib.error.URLError: on connection failure or an HTTP error status
            (``HTTPError`` is a subclass).
        ValueError: if the body is not valid UTF-8 or not valid JSON.
    """
    # Copy before applying defaults: the passed-in dict belongs to the caller.
    headers = dict(headers or {})
    headers.setdefault("User-Agent", USER_AGENT)
    headers.setdefault("Accept", "application/json")
    if isinstance(data, dict):
        data = json.dumps(data).encode("utf-8")
        headers.setdefault("Content-Type", "application/json")
    req = urllib.request.Request(url, data=data, headers=headers, method=method)
    with urllib.request.urlopen(req, timeout=30) as resp:
        return json.loads(resp.read().decode("utf-8"))


def fetch_workday(args):
    """Fetch every posting matching a search text from a Workday CXS job site.

    Args:
        args: Adapter args with ``host``, ``tenant`` and ``site`` (required) and an
            optional ``search_text`` forwarded as the ``searchText`` of the query.

    Returns:
        A list of job dicts with ``id``, ``title``, ``location``, ``url``, ``posted``
        and an empty ``description`` (the list endpoint is the only one queried).

    Raises:
        KeyError: if a required adapter arg is missing.
        Whatever ``http_get_json`` raises on network or JSON errors.
    """
    host, site, tenant = args["host"], args["site"], args["tenant"]
    search_text = args.get("search_text", "")
    url = f"https://{host}/wday/cxs/{tenant}/{site}/jobs"
    jobs, offset, total = [], 0, 0
    while True:
        data = http_get_json(url, method="POST", data={
            "appliedFacets": {}, "limit": 20, "offset": offset,
            "searchText": search_text,
        })
        postings = data.get("jobPostings") or []
        for p in postings:
            ext = p.get("externalPath") or ""
            # Prefer the first bullet field as the stable id; fall back to the path.
            bullet_fields = p.get("bulletFields")
            jid = bullet_fields[0] if bullet_fields else ext
            jobs.append({
                "id": jid,
                "title": p.get("title", ""),
                # The path is appended to the location text so the location filter can
                # also see whatever place name it carries. `or ""` guards against a
                # null locationsText, which would otherwise break the concatenation.
                "location": (p.get("locationsText") or "") + " " + ext,
                "url": f"https://{host}{ext}",
                "posted": p.get("postedOn", ""),
                "description": "",
            })
        # Keep the largest total seen so far. Trusting each page's own value would end
        # pagination as soon as a later page omits `total` or reports 0 (reported for
        # some Workday tenants, which send it on the first page only — TODO confirm).
        total = max(total, data.get("total") or 0)
        offset += len(postings)
        if not postings or offset >= total:
            break
    return jobs


def fetch_ashby(args):
    """Fetch all postings from an Ashby public job board.

    ``args["slug"]`` names the board. The primary location and every secondary
    location are joined with " | " into one location string; the plain-text
    description is capped at 2500 characters.
    """
    board_url = (
        "https://api.ashbyhq.com/posting-api/job-board/"
        f"{args['slug']}?includeCompensation=true"
    )
    payload = http_get_json(board_url)

    def _secondary_name(entry):
        # Secondary locations are read as dicts with a "location" key; anything else
        # is stringified as-is.
        if isinstance(entry, dict):
            return entry.get("location", "")
        return str(entry)

    results = []
    for posting in payload.get("jobs", []):
        primary = posting.get("location", "") or ""
        extras = [_secondary_name(e) for e in (posting.get("secondaryLocations", []) or [])]
        summary = posting.get("descriptionPlain") or ""
        results.append({
            "id": posting.get("id"),
            "title": posting.get("title", ""),
            "location": " | ".join([primary, *extras]),
            "url": posting.get("jobUrl"),
            "posted": posting.get("publishedAt", ""),
            "description": summary[:2500],
            "department": posting.get("department", ""),
        })
    return results


def fetch_greenhouse(args):
    """Fetch all postings (with descriptions) from a Greenhouse job board.

    Args:
        args: Adapter args; ``board`` is the Greenhouse board token.

    Returns:
        A list of job dicts. ``location`` combines the job's location name with its
        office names; ``description`` is the posting body reduced to plain text and
        capped at 2500 characters.

    Raises:
        KeyError: if ``board`` is missing.
        Whatever ``http_get_json`` raises on network or JSON errors.
    """
    import html

    board = args["board"]
    url = f"https://boards-api.greenhouse.io/v1/boards/{board}/jobs?content=true"
    data = http_get_json(url)
    jobs = []
    for j in data.get("jobs", []):
        loc = (j.get("location") or {}).get("name", "")
        offices = j.get("offices") or []
        office_names = " | ".join(
            (o.get("name") or "") for o in offices if isinstance(o, dict)
        )
        loc_blob = " ".join(x for x in [loc, office_names] if x)
        # Greenhouse delivers `content` with its markup entity-escaped
        # ("&lt;p&gt;..."), so the tags only become visible to the regex after
        # unescaping. Without this step nothing is stripped and the 2500-character
        # budget is spent on escaped markup.
        desc = html.unescape(j.get("content") or "")
        desc = re.sub(r"<[^>]+>", " ", desc)
        # Second pass decodes entities that were part of the text itself (&amp;, &nbsp;).
        desc = html.unescape(desc)
        desc = re.sub(r"\s+", " ", desc).strip()
        jobs.append({
            "id": str(j.get("id")),
            "title": j.get("title", ""),
            "location": loc_blob,
            "url": j.get("absolute_url"),
            "posted": j.get("updated_at", ""),
            "description": desc[:2500],
        })
    return jobs


def fetch_pcsx(args):
    """Eightfold PCSX search API. Microsoft uses apply.careers.microsoft.com.
    The same endpoint pattern is used by other PCS-hosted boards.

    Note that the endpoint host and the job-detail URL are fixed to Microsoft's here;
    ``domain`` only fills the ``domain`` query parameter.

    Args:
        args: Adapter args with ``domain`` (required) and an optional ``location``
            search string.

    Returns:
        A list of job dicts. Paging stops at the first short or empty page, or once
        500 results have been requested.

    Raises:
        KeyError: if ``domain`` is missing.
        Whatever ``http_get_json`` raises on network or JSON errors.
    """
    domain = args["domain"]
    location = args.get("location", "")
    base = "https://apply.careers.microsoft.com/api/pcsx/search"
    page_size, max_start = 50, 500
    # Loop-invariant, so built once rather than per page.
    referer = ("https://apply.careers.microsoft.com/careers?location="
               + urllib.parse.quote(location))
    jobs, start = [], 0
    while True:
        # urlencode escapes every parameter (the original interpolated `domain` raw).
        query = urllib.parse.urlencode(
            {"domain": domain, "query": "", "location": location,
             "start": start, "num": page_size},
            quote_via=urllib.parse.quote,
        )
        data = http_get_json(f"{base}?{query}", headers={"Referer": referer})
        positions = (data.get("data") or {}).get("positions", []) or []
        for p in positions:
            locs = p.get("locations") or []
            if isinstance(locs, str):
                # A bare string would otherwise be joined character by character.
                locs = [locs]
            jobs.append({
                "id": str(p.get("id")),
                "title": p.get("name", ""),
                # str() keeps a non-string entry from breaking the join.
                "location": " | ".join(str(x) for x in locs),
                "url": f"https://jobs.careers.microsoft.com/global/en/job/{p.get('displayJobId') or p.get('id')}",
                "posted": p.get("postedTs", ""),
                "description": (p.get("description") or "")[:2000],
            })
        if not positions or len(positions) < page_size:
            break
        start += len(positions)
        if start >= max_start:
            break
    return jobs


def fetch_smartrecruiters(args):
    """Fetch postings from the SmartRecruiters public postings API.

    ``args["company"]`` is the SmartRecruiters company identifier. Pages of 100 are
    requested until a page is empty, the reported total is reached, or 300 postings
    have been read. Department and function labels stand in for the description.
    """
    company = args["company"]
    endpoint = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"

    def _label(value):
        # Only dict-shaped fields carry a usable "label".
        return value.get("label", "") if isinstance(value, dict) else ""

    collected = []
    cursor = 0
    while True:
        page = http_get_json(f"{endpoint}?limit=100&offset={cursor}")
        entries = page.get("content", []) or []
        for posting in entries:
            where = posting.get("location") or {}
            tags = [where.get("fullLocation") or where.get("city") or ""]
            if where.get("remote"):
                tags.append("Remote")
            if where.get("hybrid"):
                tags.append("Hybrid")
            labels = (_label(posting.get("department")), _label(posting.get("function")))
            collected.append({
                "id": str(posting.get("id")),
                "title": posting.get("name", ""),
                "location": " ".join(tag for tag in tags if tag),
                "url": f"https://jobs.smartrecruiters.com/{company}/{posting.get('id')}",
                "posted": posting.get("releasedDate", ""),
                "description": " ".join(text for text in labels if text),
            })
        reported_total = page.get("totalFound", 0)
        cursor += len(entries)
        exhausted = not entries or cursor >= reported_total
        if exhausted or cursor >= 300:
            break
    return collected


def _rss_item_text(item, tag, ns):
    """Return the stripped text of child *tag* of *item*, or "" when absent or empty.

    The RSS 1.0 namespace is tried first, then the un-namespaced (RSS 2.0) tag.
    """
    el = item.find(f"rss1:{tag}", ns)
    if el is None:
        el = item.find(tag)
    return el.text.strip() if el is not None and el.text else ""


def fetch_rss(args):
    """Generic RSS/RDF feed parser. BIS publishes vacancies as RSS 1.0 (RDF), whose
    <item> elements live in the http://purl.org/rss/1.0/ namespace. Falls back to plain
    RSS 2.0 <item> elements. Location isn't in the feed, so every job gets
    ``default_location`` (empty when not configured).

    Args:
        args: Adapter args with ``url`` (required) and ``default_location``.

    Returns:
        A list of job dicts. ``id`` is the item link, or the title when there is no
        link. ``posted`` is taken from ``dc:date``, then ``date``, then the RSS 2.0
        ``pubDate``. ``description`` has tags stripped and is capped at 1500 characters.

    Raises:
        KeyError: if ``url`` is missing.
        urllib.error.URLError: on network or HTTP errors.
        xml.etree.ElementTree.ParseError: if the feed is not well-formed XML.
    """
    import xml.etree.ElementTree as ET
    req = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        root = ET.fromstring(resp.read())
    ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"}
    items = root.findall(".//rss1:item", ns) or root.findall(".//item")
    jobs = []
    for it in items:
        title = _rss_item_text(it, "title", ns)
        link = _rss_item_text(it, "link", ns)
        # RSS 2.0 carries the publication date in <pubDate>; without this last
        # fallback such feeds always yielded an empty "posted".
        posted = (it.findtext("dc:date", default="", namespaces=ns)
                  or _rss_item_text(it, "date", ns)
                  or _rss_item_text(it, "pubDate", ns))
        jobs.append({
            "id": link or title,
            "title": title,
            "location": args.get("default_location", ""),
            "url": link,
            "posted": posted,
            "description": re.sub(r"<[^>]+>", " ", _rss_item_text(it, "description", ns))[:1500],
        })
    return jobs


def fetch_wp_ajax(args):
    """Fetch postings from a WordPress admin-ajax style endpoint (Sygnum's pattern).

    The endpoint is expected to answer with a JSON list; any other payload yields an
    empty result. There is no posting id in the payload, so the id is derived from
    title and location (capped at 120 characters).
    """
    endpoint = args["url"]
    payload = http_get_json(endpoint)
    if not isinstance(payload, list):
        return []

    def _joined(*parts):
        # Space-join the non-empty parts.
        return " ".join(part for part in parts if part)

    return [
        {
            "id": (entry.get("title", "") + "|" + entry.get("location", ""))[:120],
            "title": entry.get("title", ""),
            "location": _joined(entry.get("location", ""), entry.get("work_type", "")),
            # Fall back to the listing endpoint when no application link is given.
            "url": entry.get("application_url") or args["url"],
            "posted": "",
            "description": _joined(entry.get("department", ""), entry.get("role_type", "")),
        }
        for entry in payload
    ]


def fetch_getro(args):
    """Search a Getro talent-network job board (JSON over POST).

    Getro powers VC portfolio networks; the configured one is the Coinbase Ventures
    web3 network (collection 1625), which lists roles at portfolio companies (Notion,
    Ashby, VALR, World, ...) and NOT at Coinbase itself. ``locations`` and
    ``job_functions`` are applied as server-side filters when given. Because the board
    spans many companies, the organisation name is appended to each title. At most 10
    pages of 100 results are read.
    """
    endpoint = f"https://api.getro.com/api/v2/collections/{args['collection']}/search/jobs"
    # Map adapter arg name -> API filter name, keeping only the filters that are set.
    filters = {
        api_field: args[arg_name]
        for arg_name, api_field in (
            ("locations", "searchable_locations"),
            ("job_functions", "job_functions"),
        )
        if args.get(arg_name)
    }
    collected = []
    for page in range(10):
        payload = http_get_json(endpoint, method="POST", data={
            "hitsPerPage": 100, "page": page, "query": "", "filters": filters,
        })
        results = payload.get("results", {}) or {}
        batch = results.get("jobs", []) or []
        for entry in batch:
            org = (entry.get("organization") or {}).get("name", "")
            places = entry.get("searchable_locations") or entry.get("locations") or []
            if isinstance(places, list):
                where = " | ".join(places)
            else:
                where = str(places)
            # created_at is treated as a Unix timestamp when it is numeric.
            created = entry.get("created_at")
            if isinstance(created, (int, float)):
                posted = datetime.fromtimestamp(created, tz=timezone.utc).strftime("%Y-%m-%d")
            else:
                posted = ""
            title = entry.get("title", "")
            keywords = [org] + (entry.get("skills") or [])
            collected.append({
                "id": str(entry.get("id")),
                "title": f"{title} @ {org}" if org else title,
                "location": where,
                "url": entry.get("url", ""),
                "posted": posted,
                "description": " ".join(word for word in keywords if word),
            })
        if not batch or len(collected) >= results.get("count", 0):
            break
    return collected


def fetch_onlyfy(args):
    """Scrape an onlyfy.jobs board (XING E-Recruiting / ex-Prinzip), as used by
    Bitcoin Suisse.

    The candidate/job/ajax_list endpoint answers with an HTML fragment. Each card holds
    an ``<a href="/job/ID">title</a>`` link and a location cell marked by an
    icon-map-marker. Both are collected in document order and paired positionally, on
    the assumption that every card has exactly one of each. Neither a JSON API nor a
    headless browser is involved.
    """
    from html import unescape

    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    listing_url = (f"{base}/candidate/job/ajax_list"
                   f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    request = urllib.request.Request(listing_url, headers={
        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(request, timeout=30) as resp:
        fragment = resp.read().decode("utf-8", "replace")

    links = re.findall(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', fragment, re.S)
    places = re.findall(r'icon-map-marker[^>]*></i>\s*([^<]+)', fragment)

    def _as_job(link, raw_place):
        href, raw_title = link
        place = unescape(raw_place).strip()
        return {
            "id": href.rsplit("/", 1)[-1],
            # Titles may contain inline markup; drop it before decoding entities.
            "title": unescape(re.sub(r"<[^>]+>", "", raw_title)).strip(),
            "location": place,
            "url": base + href,
            "posted": "",
            "description": place,
        }

    return [_as_job(link, raw_place) for link, raw_place in zip(links, places)]


# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
# Overrides, in order: navigator.webdriver -> undefined; a stub window.chrome object;
# non-empty navigator.plugins; fixed navigator.languages; and permissions.query, which
# answers 'notifications' from Notification.permission and delegates everything else.
# The original query function is bound to navigator.permissions before delegating:
# calling it detached fails with "Illegal invocation" for every other permission.
STEALTH_JS = """
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
window.chrome = {runtime: {}, loadTimes: () => {}, csi: () => {}, app: {}};
Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5]});
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'de']});
const _perms = navigator.permissions;
const _q = _perms && _perms.query && _perms.query.bind(_perms);
if (_q) {
  navigator.permissions.query = (p) => p && p.name === 'notifications'
    ? Promise.resolve({state: Notification.permission}) : _q(p);
}
"""

# Process-wide Playwright handles, filled lazily by _get_browser() and released by
# _close_browser(): "pw" is the started Playwright driver, "browser" the Chromium instance.
_playwright_singleton = {"pw": None, "browser": None}


def _get_browser():
    """Lazy-init a single shared headless browser. Saves ~3s per company.

    Returns:
        The shared headless Chromium browser, launching it on first use.

    Raises:
        RuntimeError: if the playwright package is not installed.
        Exception: whatever Playwright raises if the driver or browser fails to start.
    """
    if _playwright_singleton["browser"] is not None:
        return _playwright_singleton["browser"]
    try:
        # Imported lazily so the API-only adapters work without playwright installed.
        from playwright.sync_api import sync_playwright
    except ImportError as e:
        raise RuntimeError("playwright not installed - run: pip install -r requirements.txt") from e
    pw = sync_playwright().start()
    try:
        browser = pw.chromium.launch(
            headless=True,
            args=["--disable-blink-features=AutomationControlled"],
        )
    except Exception:
        # The driver is already running at this point; stop it so a failed launch
        # does not leak it, then let the caller see the original error.
        pw.stop()
        raise
    _playwright_singleton["pw"] = pw
    _playwright_singleton["browser"] = browser
    return browser


def _absolutize(href, prefix):
    """Join a possibly-relative href with the configured prefix."""
    if not href or href.startswith("http"):
        return href
    cleaned = href.lstrip("./").lstrip("/")
    if not prefix:
        return href
    return prefix.rstrip("/") + "/" + cleaned


def _close_browser():
    """Shut down the shared browser and Playwright driver, if they were started.

    Best-effort: errors raised while closing are ignored, since this runs during
    teardown. The singleton slots are cleared first, so a later ``_get_browser()``
    launches a fresh browser instead of handing back the closed one.
    """
    browser = _playwright_singleton["browser"]
    pw = _playwright_singleton["pw"]
    _playwright_singleton["browser"] = None
    _playwright_singleton["pw"] = None
    if browser:
        try:
            browser.close()
        except Exception:
            # Deliberately ignored: the browser may already be gone.
            pass
    if pw:
        try:
            pw.stop()
        except Exception:
            # Deliberately ignored: nothing useful can be done at teardown.
            pass


def _accept_cookie_banner(page, selectors):
    """Click the first consent button in *selectors* that shows up within ~2s each.

    Best-effort: a selector that never becomes visible, or whose click fails, is
    skipped; the function returns after the first successful click.
    """
    for sel in selectors or []:
        try:
            btn = page.locator(sel).first
            # Locator.is_visible() answers immediately and ignores its timeout, so a
            # banner that renders shortly after DOMContentLoaded would be missed.
            # wait_for() actually waits, and raises on timeout.
            btn.wait_for(state="visible", timeout=2000)
            btn.click()
            page.wait_for_timeout(500)
            return
        except Exception:
            continue


def fetch_playwright(args):
    """Generic headless-browser scraper. See COMPANIES entries for selector args.

    Args:
        args: Adapter args. Required: ``url`` and ``card`` (selector matching one
            element per job). Optional:
            ``cookie_accept`` - selectors of consent buttons to try, in order;
            ``wait_for`` - selector awaited (15s) before scraping;
            ``scroll_count`` - mouse-wheel scrolls to trigger lazy loading (default 3);
            ``max_cards`` - upper bound on cards read (default 150);
            ``title_attr`` - ``"text"`` for the first line of the card text, or the
            name of an attribute on the card;
            ``title_sel`` / ``title_sel_attr`` - element inside the card holding the
            title, read from that attribute or from its text;
            ``title_strip_prefix`` - prefix removed from the title;
            ``default_location`` / ``location_sel`` - fallback location and selector;
            ``link_sel`` / ``link_attr`` / ``url_prefix`` - where to read the job link
            (default: the card's own ``href``) and what to join it onto;
            ``use_inner_text_as_blob`` - use the card text as description, and as the
            location when none was found.

    Returns:
        A list of job dicts, de-duplicated by ``id`` (the job link, or
        ``<url>#<index>`` when a card has no link). Cards without a title, and cards
        that raise while being read, are skipped.

    Raises:
        RuntimeError: if playwright is not installed (from ``_get_browser``).
        Exception: Playwright errors from opening the page (e.g. navigation timeout).
    """
    browser = _get_browser()
    ctx = browser.new_context(
        user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        locale="en-US",
        viewport={"width": 1366, "height": 768},
    )
    jobs = []
    try:
        # Inside the try so the context is closed even if page setup itself fails.
        ctx.add_init_script(STEALTH_JS)
        page = ctx.new_page()
        page.goto(args["url"], timeout=45000, wait_until="domcontentloaded")
        # Optional cookie banner acceptance
        _accept_cookie_banner(page, args.get("cookie_accept"))
        # Wait for job content to render
        wait_for = args.get("wait_for")
        if wait_for:
            try:
                page.wait_for_selector(wait_for, timeout=15000)
            except Exception:
                # Selector never appeared: give the page a little longer and scrape
                # whatever is there rather than failing the company outright.
                page.wait_for_timeout(4000)
        # Scroll a few times to trigger any lazy-loaded results
        for _ in range(args.get("scroll_count", 3)):
            try:
                page.mouse.wheel(0, 4000)
                page.wait_for_timeout(700)
            except Exception:
                break

        cards = page.locator(args["card"])
        n = min(cards.count(), args.get("max_cards", 150))
        for i in range(n):
            card = cards.nth(i)
            try:
                title = ""
                if args.get("title_attr") == "text":
                    title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]
                elif args.get("title_attr"):
                    title = (card.get_attribute(args["title_attr"]) or "").strip()
                elif args.get("title_sel"):
                    t = card.locator(args["title_sel"]).first
                    if t.count():
                        # Read either an attribute (e.g. aria-label) or the inner text
                        if args.get("title_sel_attr"):
                            title = (t.get_attribute(args["title_sel_attr"]) or "").strip()
                        else:
                            title = (t.inner_text() or "").strip()
                if args.get("title_strip_prefix") and title.startswith(args["title_strip_prefix"]):
                    title = title[len(args["title_strip_prefix"]):].strip()
                if not title:
                    # Last resort: first line of the card text.
                    title = (card.inner_text() or "").strip().split("\n", 1)[0][:200]

                location = args.get("default_location", "")
                if args.get("location_sel"):
                    lsel = card.locator(args["location_sel"]).first
                    if lsel.count():
                        location = (lsel.inner_text() or location).strip()

                link_el = card if not args.get("link_sel") else card.locator(args["link_sel"]).first
                href = (link_el.get_attribute(args.get("link_attr", "href")) or "") if link_el.count() else ""
                href = _absolutize(href, args.get("url_prefix", ""))

                if not title:
                    continue
                description = ""
                if args.get("use_inner_text_as_blob"):
                    # Use the full card text as both location source and description
                    full = (card.inner_text() or "")
                    description = full[:2000]
                    if not location:
                        location = full[:300]
                jobs.append({
                    "id": href or f"{args['url']}#{i}",
                    "title": title,
                    "location": location,
                    "url": href or args["url"],
                    "posted": "",
                    "description": description,
                })
            except Exception:
                # Best-effort per card: one unreadable card must not lose the rest.
                continue
    finally:
        ctx.close()

    # Deduplicate within a single company by id
    seen, deduped = set(), []
    for j in jobs:
        if j["id"] in seen:
            continue
        seen.add(j["id"])
        deduped.append(j)
    return deduped


# Adapter name (third element of each COMPANIES tuple) -> fetch function. Every fetch
# function takes the entry's adapter_args dict and returns a list of job dicts with at
# least "id", "title", "location", "url", "posted" and "description".
ADAPTERS = {
    "workday": fetch_workday,
    "ashby": fetch_ashby,
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "smartrecruiters": fetch_smartrecruiters,
    "rss": fetch_rss,
    "getro": fetch_getro,
    "onlyfy": fetch_onlyfy,
    "playwright": fetch_playwright,
}


def location_matches(loc_text):
    """Classify a location string as Swiss and/or remote-eligible.

    Returns a pair ``(in_ch, is_remote)``. ``in_ch`` is True when any Swiss keyword
    occurs in the text. ``is_remote`` is True when a remote keyword occurs, the text is
    not a US-only remote listing (a Swiss keyword overrides that), and at least one
    EU/global hint is present. All checks are case-insensitive substring tests; empty
    input gives ``(False, False)``.
    """
    if not loc_text:
        return False, False
    text = loc_text.lower()

    def mentions(terms):
        return any(term in text for term in terms)

    in_ch = mentions(CH_LOCATION_KEYWORDS)
    us_only = mentions(US_ONLY_PATTERNS) and not in_ch
    # Remote only counts when it is not US-restricted and some EU/global hint backs it.
    remote_ok = mentions(REMOTE_KEYWORDS) and not us_only and mentions(EU_HINT_KEYWORDS)
    return in_ch, remote_ok


# Compiled keyword patterns, keyed by the keyword exactly as written in the tables.
_KEYWORD_PATTERNS = {}


def _keyword_hit(keyword, blob):
    """Return True if *keyword* occurs in lowercase *blob* as a word or phrase.

    The match must not start in the middle of a word and must end at a word end,
    optionally after a plural "s" or an "-ing" suffix ("pipelines", "ml engineering").
    A following digit is allowed ("python3"). Padding spaces in the keyword are ignored.
    """
    pattern = _KEYWORD_PATTERNS.get(keyword)
    if pattern is None:
        core = keyword.strip()
        if not core:
            return False
        pattern = re.compile(rf"(?<![a-z0-9]){re.escape(core)}(?:s|ing)?(?![a-z])")
        _KEYWORD_PATTERNS[keyword] = pattern
    return pattern.search(blob) is not None


def score_job(job, title_only=False):
    """Score a job against the positive and negative keyword tables.

    Keywords are matched as whole words/phrases rather than raw substrings. Substring
    matching let "intern" fire on "internal"/"international", "asic" on "basic", "rag"
    on "storage" and "java" on "javascript", and the space-padded " junior " could
    never match a title that starts with "Junior".

    Args:
        job: Job dict; ``title`` and ``description`` are read (missing/None is fine).
        title_only: Score the title alone, ignoring the description.

    Returns:
        ``(score, pos, neg)``: the summed weights, and the positive and negative
        keywords that matched (as spelled in the tables). Each keyword counts once.
    """
    # Title-filtered high-volume boards score on title only — the title filter already
    # gated relevance, and scoring the full JD body over-inflates the score (a long
    # description hits many generic keywords), flooding the medium bucket.
    if title_only:
        blob = (job.get("title") or "").lower()
    else:
        blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
    score, pos, neg = 0, [], []
    for kw, w in POSITIVE_KEYWORDS.items():
        if _keyword_hit(kw, blob):
            score += w
            pos.append(kw)
    for kw, w in NEGATIVE_KEYWORDS.items():
        if _keyword_hit(kw, blob):
            score += w
            neg.append(kw)
    return score, pos, neg


def load_seen():
    """Return the persisted seen-jobs state, or an empty dict on the first run.

    The state is whatever JSON document STATE_FILE holds; a missing file is not an
    error. Invalid JSON in an existing file raises.
    """
    if not STATE_FILE.exists():
        return {}
    raw = STATE_FILE.read_text(encoding="utf-8")
    return json.loads(raw)


def save_seen(seen):
    """Persist the seen-jobs state to STATE_FILE as pretty-printed UTF-8 JSON.

    The document is serialised first, written to a sibling temporary file, and then
    moved over STATE_FILE in one step. An interrupted run, or a value that cannot be
    serialised, therefore leaves the previous state intact instead of a truncated file
    that would make the next ``load_seen()`` fail.

    Args:
        seen: JSON-serialisable state to store.

    Raises:
        TypeError: if ``seen`` contains values json cannot serialise.
        OSError: if the directory or file cannot be written.
    """
    STATE_FILE.parent.mkdir(parents=True, exist_ok=True)
    payload = json.dumps(seen, indent=2, ensure_ascii=False)
    tmp_file = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    tmp_file.write_text(payload, encoding="utf-8")
    # Path.replace overwrites the destination; same directory, so it is a plain rename.
    tmp_file.replace(STATE_FILE)


def write_report(path, results, errors, new_only, include_weak):
    """Render the markdown report for one scout run and write it to *path*.

    Args:
        path: Destination file (a pathlib.Path); written as UTF-8.
        results: Result dicts as built by main(). Each carries company, title,
            location, url, posted, score, pos, neg, in_ch, remote and is_new.
        errors: (company display name, error text) pairs for failed fetches.
        new_only: When true, the heading is tagged "(new only)".
        include_weak: When true, roles scoring below 2 get their own section;
            otherwise they are only counted in a one-line note.
    """
    # UTC, matching the date main() uses for the report filename; local time could
    # put a different day in the heading than in the filename around midnight.
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    n_new = sum(1 for r in results if r["is_new"])
    lines = [
        f"# Job scout report {today}{' (new only)' if new_only else ''}\n",
        f"Automated coverage: **{len(COMPANIES)}** companies. Manual checks: **{len(MANUAL_CHECK)}**.",
        f"Total matches from automated companies: **{len(results)}** ({n_new} new since last run)\n",
    ]
    if errors:
        lines.append("## Errors\n")
        for company, err in errors:
            lines.append(f"- **{company}**: {err}")
        lines.append("")

    # Score buckets: strong >= 6, medium 2..5, weak < 2.
    strong = [r for r in results if r["score"] >= 6]
    medium = [r for r in results if 2 <= r["score"] < 6]
    weak = [r for r in results if r["score"] < 2]

    if not include_weak and weak:
        lines.append(f"\n_Hiding {len(weak)} weak/noise roles (score < 2). Use --include-weak to show._")

    buckets = [("Strong fit (score >= 6)", strong),
               ("Medium fit (score 2-5)", medium)]
    if include_weak:
        buckets.append(("Weak / noise (score < 2)", weak))

    for bucket_name, bucket in buckets:
        if not bucket:
            continue  # empty buckets get no heading at all
        lines.append(f"\n## {bucket_name} - {len(bucket)} role(s)\n")
        for r in bucket:
            new_tag = " [NEW]" if r["is_new"] else ""
            # Swiss location takes precedence over remote eligibility in the tag.
            loc_tag = "CH" if r["in_ch"] else ("Remote" if r["remote"] else "?")
            lines.append(f"### [{r['score']}] {r['company']} - {r['title']}{new_tag}")
            lines.append(f"- Location: {r['location']} *({loc_tag})*")
            if r.get("posted"):
                lines.append(f"- Posted: {r['posted']}")
            lines.append(f"- URL: {r['url']}")
            if r["pos"]:
                lines.append(f"- Positive: {', '.join(r['pos'])}")
            if r["neg"]:
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

    # Checklist of companies that still have to be reviewed by hand.
    if MANUAL_CHECK:
        lines.append("\n## Manual check (companies without scrapable APIs)\n")
        lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
        lines.append("Open each link, scan for new postings since your last quarterly review:\n")
        for name, note, url in MANUAL_CHECK:
            lines.append(f"- [ ] **{name}** — {note}: <{url}>")
        lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")


def main():
    """Run the scout: fetch, filter, score, update the seen-state, write today's report.

    Recognised command-line flags (read from sys.argv):
        --new-only       report only jobs not present in the seen-state
        --include-weak   also list the weak/noise bucket in the report
        --only=<id>      process just the company whose id equals <id>
    Any other argument is ignored.

    Fetch failures are collected per company and listed in the report instead of
    aborting the run. Progress and the final summary go to stderr.
    """
    only, new_only, include_weak = None, False, False
    for arg in sys.argv[1:]:
        if arg == "--new-only":
            new_only = True
        elif arg == "--include-weak":
            include_weak = True
        elif arg.startswith("--only="):
            only = arg.split("=", 1)[1]

    seen = load_seen()
    today = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    all_results, errors = [], []

    try:
        for cid, display, adapter, args in COMPANIES:
            if only and cid != only:
                continue
            print(f"Fetching {display}...", file=sys.stderr)
            try:
                jobs = ADAPTERS[adapter](args)
            except (urllib.error.URLError, urllib.error.HTTPError, ValueError) as e:
                errors.append((display, repr(e)))
                continue
            except Exception as e:
                # Boundary catch: one broken adapter must not abort the other companies.
                errors.append((display, f"unexpected: {e!r}"))
                continue

            # Optional per-company title prefilter for high-volume boards
            title_filter = args.get("_title_filter")
            if title_filter:
                jobs = [j for j in jobs
                        if any(k in (j.get("title") or "").lower() for k in title_filter)]

            company_seen = seen.setdefault(cid, {})
            title_seen = set()
            for j in jobs:
                jid = str(j.get("id") or j.get("url"))
                location = j.get("location", "")
                in_ch, is_remote = location_matches(location)
                if not (in_ch or is_remote):
                    continue
                # A missing/None title is normalised to "" so the report and the sort
                # key below never mix None with strings.
                title = j.get("title") or ""
                # Collapse the same role posted once per remote country (title differs only
                # by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
                norm_title = re.sub(r"\s+", " ", title.split("|")[0]).strip().lower()
                if norm_title in title_seen:
                    continue
                title_seen.add(norm_title)
                is_new = jid not in company_seen
                score, pos, neg = score_job(j, title_only=bool(title_filter))
                all_results.append({
                    "company": display, "company_id": cid,
                    "title": title, "location": location,
                    "url": j["url"], "posted": j.get("posted", ""),
                    "score": score, "pos": pos, "neg": neg,
                    "in_ch": in_ch, "remote": is_remote, "is_new": is_new,
                })
                # Record only the first sighting; rewriting the entry on every run
                # would reset first_seen to today's date for already-known jobs.
                if is_new:
                    company_seen[jid] = {"title": title, "first_seen": today}

        save_seen(seen)
    finally:
        # Always release the headless browser, even if processing raised.
        _close_browser()

    if new_only:
        all_results = [r for r in all_results if r["is_new"]]

    # Highest score first, then alphabetical by company and title.
    all_results.sort(key=lambda r: (-r["score"], r["company"], r["title"]))

    REPORTS_DIR.mkdir(parents=True, exist_ok=True)
    report_path = REPORTS_DIR / f"{today}.md"
    write_report(report_path, all_results, errors, new_only, include_weak)

    n_new = sum(1 for r in all_results if r["is_new"])
    print(f"\nReport written: {report_path}", file=sys.stderr)
    print(f"Total matches: {len(all_results)} ({n_new} new)", file=sys.stderr)
    if errors:
        print(f"Errors: {len(errors)} - see report", file=sys.stderr)


# === Adapter coverage (refreshed 2026-05-24) ==================================
# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
#
# Automated (COMPANIES above):
#   workday        nvidia, novartis
#   ashby          kraken, openai, confluent
#   greenhouse     anthropic, gitlab, clickhouse, grafana
#   pcsx           microsoft                    (Eightfold position-search endpoint)
#   wp_ajax        sygnum                        (WordPress admin-ajax JSON)
#   smartrecruiters metgroup, vitol, ldc
#   rss            bis                           (vacancies.rss — RSS 1.0/RDF)
#   getro          coinbase_ventures             (web3 portfolio network, collection 1625)
#   onlyfy         bitcoin_suisse                (onlyfy.jobs ajax_list HTML fragment)
#   playwright     google, apple, meta, roche, cisco  (headless browser, 3-15s each)
#
# Since the 2026-05-21 probe, seven originally-manual sites moved to automated:
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
#
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor).
#
# MANUAL_CHECK is now empty — every current target company is automated.
# ==============================================================================


# Run the scout only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()