feat(job_scout): automate Bitcoin Suisse via onlyfy adapter

Bitcoin Suisse's careers page is a JS-rendered Next.js SPA, but the
underlying onlyfy.jobs ATS exposes a plain HTML job list at
candidate/job/ajax_list (title + location per card). Add an onlyfy
adapter that parses it — no headless browser needed. Surfaces the 3
current Zug roles correctly (CH filter + pre-dedup location filtering
keep the Swiss posting over its Bratislava/Copenhagen cross-posts).

MANUAL_CHECK is now empty (all 22 target companies automated), so the
report's manual-check section is suppressed when the list is empty.
Coverage notes updated.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-24 19:06:31 +02:00
parent f4d2c6c969
commit b44360f99a
+50 -13
View File
@@ -141,6 +141,9 @@ COMPANIES = [
"job_functions": ["Software Engineering", "IT", "Data Science"], "job_functions": ["Software Engineering", "IT", "Data Science"],
"_title_filter": ENG_TITLE_FILTER, "_title_filter": ENG_TITLE_FILTER,
}), }),
# Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
# firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites. # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
# Google actively bot-detects; the STEALTH_JS init script (applied to every context) # Google actively bot-detects; the STEALTH_JS init script (applied to every context)
# is what makes its job list render. Cards are <li> with a "Learn more about <title>" # is what makes its job list render. Cards are <li> with a "Learn more about <title>"
@@ -216,10 +219,8 @@ COMPANIES = [
# Companies where adapter probing did not yield a reliable scrape. Reasons noted. # Companies where adapter probing did not yield a reliable scrape. Reasons noted.
# These surface as a clickable checklist in the report so they're not forgotten. # These surface as a clickable checklist in the report so they're not forgotten.
MANUAL_CHECK = [ # (Empty — all current target companies are automated.)
("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)", MANUAL_CHECK = []
"https://bitcoinsuisse.com/careers#open-positions"),
]
def http_get_json(url, headers=None, data=None, method="GET"): def http_get_json(url, headers=None, data=None, method="GET"):
@@ -463,6 +464,39 @@ def fetch_getro(args):
return jobs return jobs
def fetch_onlyfy(args):
    """Fetch postings from an onlyfy.jobs board (XING E-Recruiting), used by Bitcoin Suisse.

    The candidate/job/ajax_list endpoint returns an HTML fragment listing every
    posting; each card carries a <a href="/job/ID">title</a> and a location cell
    flagged by an icon-map-marker. No JSON API and no headless browser needed.

    Args:
        args: adapter config dict; ``args["slug"]`` is the board's subdomain
            (``https://<slug>.onlyfy.jobs``).

    Returns:
        A list of job dicts with keys ``id``, ``title``, ``location``, ``url``,
        ``posted`` (always "") and ``description`` (the location text, since the
        list view exposes nothing else).

    Raises:
        Whatever ``urllib.request.urlopen`` raises on network/HTTP failure.
    """
    import html as _html

    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    url = (f"{base}/candidate/job/ajax_list"
           f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        page = resp.read().decode("utf-8", "replace")

    loc_pattern = r'icon-map-marker[^>]*></i>\s*([^<]+)'
    cards = list(re.finditer(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', page, re.S))
    locs = re.findall(loc_pattern, page)

    if len(locs) != len(cards):
        # Positional pairing is only valid when every card has exactly one location.
        # On a mismatch, a blind zip would shift every later job onto its neighbour's
        # location, so instead look for each location between its own title anchor
        # and the next one. NOTE(review): this fallback assumes the title anchor
        # precedes the location cell inside a card — confirm against live markup.
        segment_ends = [m.start() for m in cards[1:]] + [len(page)]
        locs = []
        for card, end in zip(cards, segment_ends):
            found = re.search(loc_pattern, page[card.end():end])
            locs.append(found.group(1) if found else "")

    jobs = []
    for card, raw_loc in zip(cards, locs):
        href, raw_title = card.group(1), card.group(2)
        # Titles may wrap inline markup (e.g. <span>); strip tags before unescaping.
        title = _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
        loc = _html.unescape(raw_loc).strip()
        jobs.append({
            "id": href.rsplit("/", 1)[-1],
            "title": title,
            "location": loc,
            "url": base + href,
            "posted": "",
            "description": loc,
        })
    return jobs
# Injected before page scripts run, to mask the most common headless-detection signals. # Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites. # Required for Google; harmless for the other sites.
STEALTH_JS = """ STEALTH_JS = """
@@ -631,6 +665,7 @@ ADAPTERS = {
"smartrecruiters": fetch_smartrecruiters, "smartrecruiters": fetch_smartrecruiters,
"rss": fetch_rss, "rss": fetch_rss,
"getro": fetch_getro, "getro": fetch_getro,
"onlyfy": fetch_onlyfy,
"playwright": fetch_playwright, "playwright": fetch_playwright,
} }
@@ -724,6 +759,7 @@ def write_report(path, results, errors, new_only, include_weak):
lines.append(f"- Negative: {', '.join(r['neg'])}") lines.append(f"- Negative: {', '.join(r['neg'])}")
lines.append("") lines.append("")
if MANUAL_CHECK:
lines.append("\n## Manual check (companies without scrapable APIs)\n") lines.append("\n## Manual check (companies without scrapable APIs)\n")
lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.") lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
lines.append("Open each link, scan for new postings since your last quarterly review:\n") lines.append("Open each link, scan for new postings since your last quarterly review:\n")
@@ -811,7 +847,7 @@ def main():
# === Adapter coverage (refreshed 2026-05-24) ================================== # === Adapter coverage (refreshed 2026-05-24) ==================================
# 21 companies automated across 9 adapter types; 1 remains in MANUAL_CHECK. # 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
# #
# Automated (COMPANIES above): # Automated (COMPANIES above):
# workday nvidia, novartis # workday nvidia, novartis
@@ -822,21 +858,22 @@ def main():
# smartrecruiters metgroup, vitol, ldc # smartrecruiters metgroup, vitol, ldc
# rss bis (vacancies.rss — RSS 1.0/RDF) # rss bis (vacancies.rss — RSS 1.0/RDF)
# getro coinbase_ventures (web3 portfolio network, collection 1625) # getro coinbase_ventures (web3 portfolio network, collection 1625)
# onlyfy bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment)
# playwright google, apple, meta, roche, cisco (headless browser, 3-15s each) # playwright google, apple, meta, roche, cisco (headless browser, 3-15s each)
# #
# Since the 2026-05-21 probe, seven originally-manual sites moved to automated: # Since the 2026-05-21 probe, seven originally-manual sites moved to automated:
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and # Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the # Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and # Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis). # via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
# #
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase # Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
# itself. Coinbase-the-employer was dropped from MANUAL_CHECK (mass layoffs / hiring # itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
# freeze as of 2026-05; re-add coinbase.com/careers if they reopen). # re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor).
# #
# Still manual (MANUAL_CHECK above) — to automate, it needs a real-browser probe: # MANUAL_CHECK is now empty — every current target company is automated.
# Bitcoin Suisse bitcoinsuisse.com/careers JS widget, empty at scrape time, low volume
# (AMINA Bank was dropped — poor Glassdoor rating, not worth tracking.)
# ============================================================================== # ==============================================================================