From cb27fa3ecfa7744039372697c5c2e5c8841ef352 Mon Sep 17 00:00:00 2001 From: Dennis Thiessen Date: Sun, 24 May 2026 18:47:29 +0200 Subject: [PATCH] feat(job_scout): add getro adapter for Coinbase Ventures web3 network MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a getro adapter (POST JSON search API) and wire up the Coinbase Ventures portfolio talent network (collection 1625), CH + eng-title filtered. Note this covers portfolio companies (Ashby, Notion, VALR, World, ...), not Coinbase itself, which doesn't list on its Ventures board — Coinbase-the-employer stays in MANUAL_CHECK. Also clean up stale comments: drop Sonova (MedTech, off-thesis, dead scrape) from MANUAL_CHECK, remove the dangling BIS comment now that BIS is automated via rss, and refresh the adapter-coverage notes and module docstring to the current 21-automated / 3-manual state. Co-Authored-By: Claude Opus 4.7 --- job_scout/scout.py | 114 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 26 deletions(-) diff --git a/job_scout/scout.py b/job_scout/scout.py index 3b05323..5e8b8bc 100644 --- a/job_scout/scout.py +++ b/job_scout/scout.py @@ -1,8 +1,9 @@ """Job scout for Dennis's quarterly target companies. -Pulls latest openings from companies with known public ATS APIs (Workday/Ashby/Greenhouse), -filters by Swiss location or remote eligibility, scores fit against profile keywords, tracks -which job IDs we've already seen, writes a markdown report. +Pulls latest openings from companies via public ATS APIs (Workday/Ashby/Greenhouse/ +SmartRecruiters/Eightfold/RSS) and, for JS-rendered careers sites, a headless-browser +(playwright) adapter. Filters by Swiss location or remote eligibility, scores fit against +profile keywords, tracks which job IDs we've already seen, writes a markdown report. 
Usage: py scout.py # Pull all configured companies (strong + medium only) @@ -13,9 +14,9 @@ Usage: State : state/seen_jobs.json Output: reports/YYYY-MM-DD.md -To add a company: append to COMPANIES with one of the existing adapter types. -For companies behind custom careers sites (Google, MS, Meta, Apple, Roche, Novartis, IBM, -Cisco, Sonova, Sygnum) — see TODO_ADAPTERS at the bottom. +To add a company: append to COMPANIES with one of the existing adapter types. A few sites +resist scraping even headless and stay in MANUAL_CHECK (surfaced as a report checklist). +See the adapter-coverage notes at the bottom for the current automated/manual split. """ import json @@ -131,6 +132,15 @@ COMPANIES = [ "url": "https://www.bis.org/doclist/vacancies.rss", "default_location": "Basel, Switzerland", }), + # Coinbase Ventures web3 talent network (Getro collection 1625). Aggregates roles + # across portfolio companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself — + # see fetch_getro. CH-filtered + eng title-filtered to stay relevant. + ("coinbase_ventures", "Coinbase Ventures (web3)", "getro", { + "collection": 1625, + "locations": ["Switzerland"], + "job_functions": ["Software Engineering", "IT", "Data Science"], + "_title_filter": ENG_TITLE_FILTER, + }), # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites. # Google actively bot-detects; the STEALTH_JS init script (applied to every context) # is what makes its job list render. Cards are
  • with a "Learn more about " @@ -207,16 +217,12 @@ COMPANIES = [ # Companies where adapter probing did not yield a reliable scrape. Reasons noted. # These surface as a clickable checklist in the report so they're not forgotten. MANUAL_CHECK = [ - ("Sonova", "PhenomPeople serves empty shell to automation (body never renders); widgets API rejects requests", - "https://careers.sonova.com/us/en/search-results?keywords=Switzerland"), ("Coinbase", "/careers/positions 302-redirects to landing; no job links or ATS API exposed even with stealth", "https://www.coinbase.com/careers"), ("AMINA Bank", "jobs are at /careers/ (#positions) via JS widget; only ~4 apply links, no scrapable list", "https://aminagroup.com/careers/#positions"), ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)", "https://bitcoinsuisse.com/careers#open-positions"), - # International org — qualifies (Basel, commutable from Bern, salary net of Swiss tax), - # but uses a JS-heavy Taleo widget that doesn't render requisitions headless. Manual check. ] @@ -416,6 +422,51 @@ def fetch_wp_ajax(args): return jobs +def fetch_getro(args): + """Getro network job-board search API (POST JSON). Powers VC portfolio talent + networks — here the Coinbase Ventures web3 network (collection 1625). Returns roles + across ALL portfolio companies (Notion, Ashby, VALR, World, ...), NOT Coinbase itself; + Coinbase doesn't list its own openings on its Ventures board. Server-side filters: + searchable_locations and job_functions. 
Org name is folded into the title since this + is a multi-company board.""" + collection = args["collection"] + url = f"https://api.getro.com/api/v2/collections/{collection}/search/jobs" + filters = {} + if args.get("locations"): + filters["searchable_locations"] = args["locations"] + if args.get("job_functions"): + filters["job_functions"] = args["job_functions"] + jobs, page = [], 0 + while True: + data = http_get_json(url, method="POST", data={ + "hitsPerPage": 100, "page": page, "query": "", "filters": filters, + }) + res = data.get("results", {}) or {} + batch = res.get("jobs", []) or [] + for j in batch: + org = (j.get("organization") or {}).get("name", "") + locs = j.get("searchable_locations") or j.get("locations") or [] + loc_str = " | ".join(locs) if isinstance(locs, list) else str(locs) + ts = j.get("created_at") + posted = "" + if isinstance(ts, (int, float)): + posted = datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d") + title = j.get("title", "") + jobs.append({ + "id": str(j.get("id")), + "title": f"{title} @ {org}" if org else title, + "location": loc_str, + "url": j.get("url", ""), + "posted": posted, + "description": " ".join(filter(None, [org] + (j.get("skills") or []))), + }) + total = res.get("count", 0) + page += 1 + if not batch or len(jobs) >= total or page >= 10: + break + return jobs + + # Injected before page scripts run, to mask the most common headless-detection signals. # Required for Google; harmless for the other sites. STEALTH_JS = """ @@ -583,6 +634,7 @@ ADAPTERS = { "wp_ajax": fetch_wp_ajax, "smartrecruiters": fetch_smartrecruiters, "rss": fetch_rss, + "getro": fetch_getro, "playwright": fetch_playwright, } @@ -762,23 +814,33 @@ def main(): print(f"Errors: {len(errors)} - see report", file=sys.stderr) -# === Adapter probe results (2026-05-21) ======================================= -# Tested all 15 target companies. The 5 working adapters are in COMPANIES above. -# The remaining 10 are in MANUAL_CHECK. 
To upgrade one of those from manual to -# automated, you'd need Playwright/Selenium (real browser) — different project. +# === Adapter coverage (refreshed 2026-05-24) ================================== +# 21 companies automated across 9 adapter types; 3 remain in MANUAL_CHECK. # -# Google careers.google.com 404 on documented API; auth-gated -# Microsoft gcsservices.careers.ms.com TLS handshake hangs from non-MS clients -# Apple jobs.apple.com/api/v1 endpoint exists, location filter codes opaque -# Meta metacareers.com GraphQL with auth token -# Roche careers.roche.com PhenomPeople/Eightfold, JS-rendered -# IBM Research research.ibm.com static page, no API -# Cisco jobs.cisco.com JS-rendered SPA -# Sonova careers.sonova.com PhenomPeople SaaS, no public JSON -# Sygnum sygnum.com/careers Cloudflare-protected -# AMINA aminagroup.com/career static, low volume -# Bitcoin Suisse bitcoinsuisse.com/careers static, low volume -# Coinbase coinbase.com/careers Cloudflare-protected +# Automated (COMPANIES above): +# workday nvidia, novartis +# ashby kraken, openai, confluent +# greenhouse anthropic, gitlab, clickhouse, grafana +# pcsx microsoft (Eightfold position-search endpoint) +# wp_ajax sygnum (WordPress admin-ajax JSON) +# smartrecruiters metgroup, vitol, ldc +# rss bis (vacancies.rss — RSS 1.0/RDF) +# getro coinbase_ventures (web3 portfolio network, collection 1625) +# playwright google, apple, meta, roche, cisco (headless browser, 3-15s each) +# +# Since the 2026-05-21 probe, seven originally-manual sites moved to automated: +# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and +# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the +# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and +# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis). 
+# +# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase +# itself — Coinbase-the-employer's own careers site stays in MANUAL_CHECK below. +# +# Still manual (MANUAL_CHECK above) — to automate, each needs a real-browser probe: +# Coinbase coinbase.com/careers Cloudflare-protected, 302 to landing +# AMINA aminagroup.com/careers JS widget, ~4 apply links, low volume +# Bitcoin Suisse bitcoinsuisse.com/careers JS widget, empty at scrape time, low volume # ==============================================================================