feat(job_scout): automate Bitcoin Suisse via onlyfy adapter

Bitcoin Suisse's careers page is a JS-rendered Next.js SPA, but the underlying onlyfy.jobs ATS exposes a plain HTML job list at candidate/job/ajax_list (title + location per card). Add an onlyfy adapter that parses it — no headless browser needed. Surfaces the 3 current Zug roles correctly (CH filter + pre-dedup location filtering keep the Swiss posting over its Bratislava/Copenhagen cross-posts). MANUAL_CHECK is now empty (all 22 target companies automated), so the report's manual-check section is suppressed when the list is empty. Coverage notes updated. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 19:06:31 +02:00
parent f4d2c6c969
commit b44360f99a
1 changed files with 56 additions and 19 deletions
@@ -141,6 +141,9 @@ COMPANIES = [
        "job_functions": ["Software Engineering", "IT", "Data Science"],
        "_title_filter": ENG_TITLE_FILTER,
    }),
+    # Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
+    # firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
+    ("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
    # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
    # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
    # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
@@ -216,10 +219,8 @@ COMPANIES = [

 # Companies where adapter probing did not yield a reliable scrape. Reasons noted.
 # These surface as a clickable checklist in the report so they're not forgotten.
-MANUAL_CHECK = [
-    ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
-     "https://bitcoinsuisse.com/careers#open-positions"),
-]
+# (Empty — all current target companies are automated.)
+MANUAL_CHECK = []


 def http_get_json(url, headers=None, data=None, method="GET"):
@@ -463,6 +464,39 @@ def fetch_getro(args):
    return jobs


+def fetch_onlyfy(args):
+    """Fetch the postings listed on an onlyfy.jobs board (used by Bitcoin Suisse).
+
+    onlyfy.jobs is the XING E-Recruiting ATS (the original note calls it "ex-Prinzip";
+    NOTE(review): the predecessor product was probably Prescreen - confirm). Its
+    candidate/job/ajax_list endpoint returns an HTML fragment listing the postings; each
+    card carries a <a href="/job/ID">title</a> link and a location cell flagged by an
+    icon-map-marker. No JSON API and no headless browser needed.
+
+    When the fragment holds exactly one location per title, the two lists are paired in
+    document order. Otherwise a blind zip() would shift every later location onto the
+    wrong job (and drop trailing postings), so each title is instead given the first
+    location found between it and the next title, or "" when there is none.
+
+    Args:
+        args: adapter config dict; args["slug"] is the board's onlyfy.jobs subdomain.
+
+    Returns:
+        A list of dicts with the keys id, title, location, url, posted and description.
+        "posted" is always "" and "description" repeats the location, because the
+        fragment is only parsed for title and location.
+
+    Raises:
+        KeyError: if args has no "slug" entry.
+        urllib.error.URLError: if the request fails (propagated from urlopen).
+    """
+    import html as _html
+    slug = args["slug"]
+    base = f"https://{slug}.onlyfy.jobs"
+    # display_length=100&page=1: presumably a single page of up to 100 postings, so a
+    # larger board would need paging - TODO confirm against the endpoint.
+    url = (f"{base}/candidate/job/ajax_list"
+           f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
+    req = urllib.request.Request(url, headers={
+        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
+    })
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        page = resp.read().decode("utf-8", "replace")
+    # Keep match objects (not just the captured text) so positions are available for
+    # the fallback pairing below.
+    titles = list(re.finditer(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', page, re.S))
+    locs = list(re.finditer(r'icon-map-marker[^>]*></i>\s*([^<]+)', page))
+    if len(titles) == len(locs):
+        # One location per card: pair in document order.
+        raw_locs = [m.group(1) for m in locs]
+    else:
+        # A card is missing (or repeats) its location cell. Pair by position instead;
+        # this assumes the location cell follows the title link inside a card.
+        raw_locs = []
+        for i, anchor in enumerate(titles):
+            nxt = titles[i + 1].start() if i + 1 < len(titles) else len(page)
+            raw_locs.append(next(
+                (m.group(1) for m in locs if anchor.end() <= m.start() < nxt), ""))
+    jobs = []
+    for anchor, raw_loc in zip(titles, raw_locs):
+        href, raw_title = anchor.groups()
+        # Titles may contain inline markup; strip tags before unescaping entities.
+        title = _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
+        loc = _html.unescape(raw_loc).strip()
+        jobs.append({
+            "id": href.rsplit("/", 1)[-1],
+            "title": title,
+            "location": loc,
+            "url": base + href,
+            "posted": "",
+            "description": loc,
+        })
+    return jobs
+
+
 # Injected before page scripts run, to mask the most common headless-detection signals.
 # Required for Google; harmless for the other sites.
 STEALTH_JS = """
@@ -631,6 +665,7 @@ ADAPTERS = {
    "smartrecruiters": fetch_smartrecruiters,
    "rss": fetch_rss,
    "getro": fetch_getro,
+    "onlyfy": fetch_onlyfy,
    "playwright": fetch_playwright,
 }

@@ -724,12 +759,13 @@ def write_report(path, results, errors, new_only, include_weak):
                lines.append(f"- Negative: {', '.join(r['neg'])}")
            lines.append("")

-    lines.append("\n## Manual check (companies without scrapable APIs)\n")
-    lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
-    lines.append("Open each link, scan for new postings since your last quarterly review:\n")
-    for name, note, url in MANUAL_CHECK:
-        lines.append(f"- [ ] **{name}** — {note}: <{url}>")
-    lines.append("")
+    if MANUAL_CHECK:
+        lines.append("\n## Manual check (companies without scrapable APIs)\n")
+        lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
+        lines.append("Open each link, scan for new postings since your last quarterly review:\n")
+        for name, note, url in MANUAL_CHECK:
+            lines.append(f"- [ ] **{name}** — {note}: <{url}>")
+        lines.append("")

    path.write_text("\n".join(lines), encoding="utf-8")

@@ -811,7 +847,7 @@ def main():


 # === Adapter coverage (refreshed 2026-05-24) ==================================
-# 21 companies automated across 9 adapter types; 1 remains in MANUAL_CHECK.
+# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
 #
 # Automated (COMPANIES above):
 #   workday        nvidia, novartis
@@ -822,21 +858,22 @@ def main():
 #   smartrecruiters metgroup, vitol, ldc
 #   rss            bis                           (vacancies.rss — RSS 1.0/RDF)
 #   getro          coinbase_ventures             (web3 portfolio network, collection 1625)
+#   onlyfy         bitcoin_suisse                (onlyfy.jobs ajax_list HTML fragment)
 #   playwright     google, apple, meta, roche, cisco  (headless browser, 3-15s each)
 #
 # Since the 2026-05-21 probe, six originally-manual sites moved to automated:
 # Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
-# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the
-# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and
-# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
+# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
+# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
+# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
+# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
+# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
 #
 # Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
-# itself. Coinbase-the-employer was dropped from MANUAL_CHECK (mass layoffs / hiring
-# freeze as of 2026-05; re-add coinbase.com/careers if they reopen).
+# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
+# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor rating).
 #
-# Still manual (MANUAL_CHECK above) — to automate, it needs a real-browser probe:
-#   Bitcoin Suisse bitcoinsuisse.com/careers  JS widget, empty at scrape time, low volume
-# (AMINA Bank was dropped — poor Glassdoor rating, not worth tracking.)
+# MANUAL_CHECK is now empty — every current target company is automated.
 # ==============================================================================