2026-05-22 12:38:35 +02:00
parent eeec77b1e3
commit 3b07c4b900
3 changed files with 83 additions and 19 deletions
@@ -82,7 +82,8 @@
      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=google)",
      "Bash(job_scout/.venv/Scripts/python.exe -c ' *)",
      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=meta)",
      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)"
      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)",
      "Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=confluent)"
    ]
  }
}
 
@@ -136,6 +136,7 @@ _Update this section when starting/finishing a JD._
| Infineon AI Engineer | Critique DONE Pass 2 (78.5/100) | Submit or Tier 2 polish |
| Apple Data Engineer (ISE, Zurich) | Critique DONE Pass 1 (78.5/100) | /edit-resume for Tier 1 fixes or submit |
| Kraken AI Infrastructure | Critique DONE Pass 2 (84.5/100) — converged near max | Submit, or apply Tier 2 polish (agent orchestration / guardrails in skills) |
| Google FDE GenAI (Zurich) | PAUSED — GenAI evidence gap too large; redirecting to data-eng/MLOps roles | Likely abandon |
---
 
@@ -37,7 +37,7 @@ CH_LOCATION_KEYWORDS = [
    "lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
]
REMOTE_KEYWORDS = ["remote"]
REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]
US_ONLY_PATTERNS = [
    "remote - us", "remote, us", "remote-us", "us remote", "us-remote",
@@ -49,7 +49,7 @@ EU_HINT_KEYWORDS = [
    "germany", "france", "spain", "portugal", "ireland", "netherlands",
    "sweden", "norway", "finland", "denmark", "poland", "czech",
    "romania", "italy", "austria", "belgium", "uk", "united kingdom",
    "europe", "emea", "global",
    "europe", "emea", "global", "worldwide",
] + CH_LOCATION_KEYWORDS
POSITIVE_KEYWORDS = {
@@ -77,6 +77,19 @@ NEGATIVE_KEYWORDS = {
    "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
}
# Allow-list of role phrases used to prefilter job titles on high-volume boards
# (all-remote tech orgs and commodity traders whose postings are mostly non-tech).
# A posting is kept only when its title contains at least one of these phrases.
# Matching is a case-insensitive substring test against the title alone, so every
# entry must be lowercase. The phrases are deliberately specific so that titles
# such as "Sales Engineer", "Staff Accountant" or "Data Privacy Counsel" do not
# slip through.
ENG_TITLE_FILTER = [
    # Data engineering / data platform
    "data engineer",
    "data engineering",
    "data platform",
    "platform engineer",
    "data infrastructure",
    "data architect",
    "analytics engineer",
    # Machine learning / MLOps
    "mlops",
    "ml engineer",
    "ml platform",
    "machine learning engineer",
    # Reliability / backend
    "site reliability",
    "sre",
    "backend engineer",
    "back-end engineer",
    # Infrastructure / general software
    "devops engineer",
    "cloud engineer",
    "software engineer",
    "infrastructure engineer",
    # Streaming / big data / quant
    "kafka",
    "streaming",
    "big data",
    "quantitative developer",
    "quant developer",
]
# id, display, adapter, adapter_args
COMPANIES = [
    ("nvidia",    "NVIDIA",    "workday",    {
@@ -103,6 +116,15 @@ COMPANIES = [
    ("sygnum",    "Sygnum",    "wp_ajax", {
        "url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
    }),
    # --- Data-infra US tech (his exact stack; mostly all-remote — title-filtered to eng/data) ---
    ("confluent", "Confluent", "ashby",      {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
    ("gitlab",    "GitLab",    "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
    ("clickhouse","ClickHouse","greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}),
    ("grafana",   "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
    # --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
    ("metgroup",  "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
    ("vitol",     "Vitol",     "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
    ("ldc",       "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
@@ -174,19 +196,6 @@ COMPANIES = [
        "scroll_count": 5,
        "use_inner_text_as_blob": True,
    }),
    ("ibm",       "IBM Research", "playwright", {
        # IBM Research Zurich careers page is mostly a static intro with few openings.
        # Use IBM's main careers search filtered to Switzerland instead.
        "url": "https://www.ibm.com/careers/search?q=&field_keyword_05[0]=Switzerland",
        "wait_for": "a[href*='/careers/'], a[href*='ibm.com/employment']",
        "card": "li:has(a[href*='/careers/']), a[href*='/careers/']:has(h3)",
        "title_sel": "h3, h4",
        "link_sel": "a[href*='/careers/']",
        "link_attr": "href",
        "url_prefix": "https://www.ibm.com",
        "default_location": "Switzerland",
        "scroll_count": 4,
    }),
]
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
@@ -318,6 +327,39 @@ def fetch_pcsx(args):
    return jobs
def fetch_smartrecruiters(args):
    """Fetch postings from the SmartRecruiters public postings API.

    Used by many EU energy/commodity firms.

    Args:
        args: Adapter arguments. ``company`` (required) is the SmartRecruiters
            company identifier; it is used both in the API path and in the
            public job URL. ``max_postings`` (optional, default 300) caps how
            many postings are paged through per company.

    Returns:
        A list of job dicts with the keys ``id``, ``title``, ``location``,
        ``url``, ``posted`` and ``description``. ``description`` holds the
        department and function labels joined by a space (not the full JD).
        Postings that carry no ``id`` are skipped because no stable key or
        working link can be built for them.
    """
    company = args["company"]
    base = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"
    page_size = 100
    max_postings = args.get("max_postings", 300)

    jobs, offset = [], 0
    while True:
        data = http_get_json(f"{base}?limit={page_size}&offset={offset}")
        content = data.get("content") or []
        for p in content:
            posting_id = p.get("id")
            if posting_id is None:
                # str(None) would yield the literal "None": a bogus dedupe key
                # and a dead ".../None" link. Drop the posting instead.
                continue

            loc = p.get("location") or {}
            parts = [loc.get("fullLocation") or loc.get("city") or ""]
            if loc.get("remote"):
                parts.append("Remote")
            if loc.get("hybrid"):
                parts.append("Hybrid")
            loc_str = " ".join(x for x in parts if x)

            # Department/function are expected to be {"label": ...} objects;
            # anything else (missing, null, unexpected type) contributes nothing.
            labels = []
            for field in ("department", "function"):
                value = p.get(field)
                if isinstance(value, dict) and value.get("label"):
                    labels.append(value["label"])

            jobs.append({
                "id": str(posting_id),
                "title": p.get("name") or "",
                "location": loc_str,
                "url": f"https://jobs.smartrecruiters.com/{company}/{posting_id}",
                "posted": p.get("releasedDate") or "",
                "description": " ".join(labels),
            })

        offset += len(content)
        if not content or offset >= max_postings:
            break
        total = data.get("totalFound")
        if isinstance(total, int):
            if offset >= total:
                break
        elif len(content) < page_size:
            # No usable total reported: a short page means this was the last one.
            break
    return jobs
def fetch_wp_ajax(args):
    """WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
    url = args["url"]
@@ -502,6 +544,7 @@ ADAPTERS = {
    "greenhouse": fetch_greenhouse,
    "pcsx": fetch_pcsx,
    "wp_ajax": fetch_wp_ajax,
    "smartrecruiters": fetch_smartrecruiters,
    "playwright": fetch_playwright,
}
@@ -520,8 +563,14 @@ def location_matches(loc_text):
    return in_ch, is_remote
def score_job(job):
    blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
def score_job(job, title_only=False):
    # Title-filtered high-volume boards score on title only — the title filter already
    # gated relevance, and scoring the full JD body over-inflates (every "python"/"data"
    # mention adds points), flooding the medium bucket.
    if title_only:
        blob = (job.get("title") or "").lower()
    else:
        blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
    score, pos, neg = 0, [], []
    for kw, w in POSITIVE_KEYWORDS.items():
        if kw in blob:
@@ -626,14 +675,27 @@ def main():
            errors.append((display, f"unexpected: {e!r}"))
            continue
        # Optional per-company title prefilter for high-volume boards
        title_filter = args.get("_title_filter")
        if title_filter:
            jobs = [j for j in jobs
                    if any(k in (j.get("title") or "").lower() for k in title_filter)]
        company_seen = seen.setdefault(cid, {})
        title_seen = set()
        for j in jobs:
            jid = str(j.get("id") or j.get("url"))
            in_ch, is_remote = location_matches(j.get("location", ""))
            if not (in_ch or is_remote):
                continue
            # Collapse the same role posted once per remote country (title differs only
            # by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
            norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
            if norm_title in title_seen:
                continue
            title_seen.add(norm_title)
            is_new = jid not in company_seen
            score, pos, neg = score_job(j)
            score, pos, neg = score_job(j, title_only=bool(title_filter))
            all_results.append({
                "company": display, "company_id": cid,
                "title": j["title"], "location": j["location"],