feat(job_scout): automate Bitcoin Suisse via onlyfy adapter

Bitcoin Suisse's careers page is a JS-rendered Next.js SPA, but the underlying onlyfy.jobs ATS exposes a plain HTML job list at candidate/job/ajax_list (title + location per card). Add an onlyfy adapter that parses it — no headless browser needed. Surfaces the 3 current Zug roles correctly (CH filter + pre-dedup location filtering keep the Swiss posting over its Bratislava/Copenhagen cross-posts). MANUAL_CHECK is now empty (all 22 target companies automated), so the report's manual-check section is suppressed when the list is empty. Coverage notes updated. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 19:06:31 +02:00
parent f4d2c6c969
commit b44360f99a
1 changed file with 56 additions and 19 deletions
@@ -141,6 +141,9 @@ COMPANIES = [
        "job_functions": ["Software Engineering", "IT", "Data Science"],
        "_title_filter": ENG_TITLE_FILTER,
    }),
    # Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
    # firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
    ("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
@@ -216,10 +219,8 @@ COMPANIES = [
 # Companies where adapter probing did not yield a reliable scrape. Reasons noted.
 # These surface as a clickable checklist in the report so they're not forgotten.
-MANUAL_CHECK = [
+# (Empty — all current target companies are automated.)
-    ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
+MANUAL_CHECK = []
     "https://bitcoinsuisse.com/careers#open-positions"),
 ]
 def http_get_json(url, headers=None, data=None, method="GET"):
@@ -463,6 +464,39 @@ def fetch_getro(args):
    return jobs
 def fetch_onlyfy(args):
    """Fetch the postings of an onlyfy.jobs board (XING E-Recruiting / ex-Prinzip).

    Used by Bitcoin Suisse. The candidate/job/ajax_list endpoint returns an HTML
    fragment listing the postings; each card carries a <a href="/job/ID">title</a>
    and a location cell flagged by an icon-map-marker. No JSON API and no headless
    browser needed.

    Args:
        args: adapter config dict; only ``args["slug"]`` (the board's subdomain,
            e.g. "bitcoin-suisse") is read.

    Returns:
        A list of job dicts with keys ``id``, ``title``, ``location``, ``url``,
        ``posted`` (always "") and ``description`` (a copy of the location text).
        A single request with display_length=100 is made; no further pages are
        fetched.

    Raises:
        KeyError: if ``args`` has no "slug".
        urllib.error.URLError: (incl. HTTPError) if the request fails.
    """
    import html as _html
    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    url = (f"{base}/candidate/job/ajax_list"
           f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    # X-Requested-With mimics the site's own AJAX call for the list fragment.
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        page = resp.read().decode("utf-8", "replace")

    # Keep match objects (not just the captured text) so document positions are
    # available for the fallback pairing below.
    anchors = list(re.finditer(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', page, re.S))
    markers = list(re.finditer(r'icon-map-marker[^>]*></i>\s*([^<]+)', page))

    if len(anchors) == len(markers):
        # Normal case: one title and one location per card, both in document
        # order, so pairing by index is exact.
        raw_locs = [m.group(1) for m in markers]
    else:
        # A card without a location (or a stray marker) would shift every later
        # index-based pair and attach the wrong location to a posting. Pair by
        # position instead: a title owns the first marker found between its own
        # link and the next title link; a title with none gets "".
        # NOTE(review): assumes the marker follows the title link inside a card —
        # confirm against the live markup.
        raw_locs = []
        for idx, anchor in enumerate(anchors):
            limit = anchors[idx + 1].start() if idx + 1 < len(anchors) else len(page)
            raw_locs.append(next(
                (m.group(1) for m in markers if anchor.end() <= m.start() < limit),
                "",
            ))

    jobs = []
    for anchor, raw_loc in zip(anchors, raw_locs):
        href, raw_title = anchor.group(1), anchor.group(2)
        # Titles may wrap inline markup; drop tags first, then decode entities.
        title = _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
        loc = _html.unescape(raw_loc).strip()
        jobs.append({
            "id": href.rsplit("/", 1)[-1],
            "title": title,
            "location": loc,
            "url": base + href,
            "posted": "",
            # No description is extracted from the list fragment; reuse the location.
            "description": loc,
        })
    return jobs
 # Injected before page scripts run, to mask the most common headless-detection signals.
 # Required for Google; harmless for the other sites.
 STEALTH_JS = """
@@ -631,6 +665,7 @@ ADAPTERS = {
    "smartrecruiters": fetch_smartrecruiters,
    "rss": fetch_rss,
    "getro": fetch_getro,
    "onlyfy": fetch_onlyfy,
    "playwright": fetch_playwright,
 }
@@ -724,6 +759,7 @@ def write_report(path, results, errors, new_only, include_weak):
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")
    if MANUAL_CHECK:
        lines.append("\n## Manual check (companies without scrapable APIs)\n")
        lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
        lines.append("Open each link, scan for new postings since your last quarterly review:\n")
@@ -811,7 +847,7 @@ def main():
 # === Adapter coverage (refreshed 2026-05-24) ==================================
-# 21 companies automated across 9 adapter types; 1 remains in MANUAL_CHECK.
+# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
 #
 # Automated (COMPANIES above):
 #   workday        nvidia, novartis
@@ -822,21 +858,22 @@ def main():
 #   smartrecruiters metgroup, vitol, ldc
 #   rss            bis                           (vacancies.rss — RSS 1.0/RDF)
 #   getro          coinbase_ventures             (web3 portfolio network, collection 1625)
 #   onlyfy         bitcoin_suisse                (onlyfy.jobs ajax_list HTML fragment)
 #   playwright     google, apple, meta, roche, cisco  (headless browser, 3-15s each)
 #
 # Since the 2026-05-21 probe, six originally-manual sites moved to automated:
 # Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
-# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the
+# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
-# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and
+# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
-# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
+# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
 # onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
 # dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
 #
 # Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
-# itself. Coinbase-the-employer was dropped from MANUAL_CHECK (mass layoffs / hiring
+# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
-# freeze as of 2026-05; re-add coinbase.com/careers if they reopen).
+# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor).
 #
-# Still manual (MANUAL_CHECK above) — to automate, it needs a real-browser probe:
+# MANUAL_CHECK is now empty — every current target company is automated.
 #   Bitcoin Suisse bitcoinsuisse.com/careers  JS widget, empty at scrape time, low volume
 # (AMINA Bank was dropped — poor Glassdoor rating, not worth tracking.)
 # ==============================================================================