feat(job_scout): automate Bitcoin Suisse via onlyfy adapter

Bitcoin Suisse's careers page is a JS-rendered Next.js SPA, but the
underlying onlyfy.jobs ATS exposes a plain HTML job list at
candidate/job/ajax_list (title + location per card). Add an onlyfy
adapter that parses it — no headless browser needed. Surfaces the 3
current Zug roles correctly (CH filter + pre-dedup location filtering
keep the Swiss posting over its Bratislava/Copenhagen cross-posts).

MANUAL_CHECK is now empty (all 22 target companies automated), so the
report's manual-check section is suppressed when the list is empty.
Coverage notes updated.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-24 19:06:31 +02:00
parent f4d2c6c969
commit b44360f99a
+56 -19
View File
@@ -141,6 +141,9 @@ COMPANIES = [
"job_functions": ["Software Engineering", "IT", "Data Science"],
"_title_filter": ENG_TITLE_FILTER,
}),
# Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
# firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
@@ -216,10 +219,8 @@ COMPANIES = [
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
# These surface as a clickable checklist in the report so they're not forgotten.
MANUAL_CHECK = [
("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
"https://bitcoinsuisse.com/careers#open-positions"),
]
# (Empty — all current target companies are automated.)
MANUAL_CHECK = []
def http_get_json(url, headers=None, data=None, method="GET"):
@@ -463,6 +464,39 @@ def fetch_getro(args):
return jobs
def fetch_onlyfy(args):
    """Fetch postings from an onlyfy.jobs board (XING E-Recruiting / ex-Prinzip).

    Used by Bitcoin Suisse. The ``candidate/job/ajax_list`` endpoint returns an HTML
    fragment listing every posting; each card carries an ``<a href="/job/ID">title</a>``
    link and a location cell flagged by an ``icon-map-marker`` icon. No JSON API and
    no headless browser are needed.

    Args:
        args: adapter config dict; ``args["slug"]`` is the board's subdomain, i.e.
            the board lives at ``https://<slug>.onlyfy.jobs``.

    Returns:
        A list of job dicts with the keys ``id``, ``title``, ``location``, ``url``,
        ``posted`` (always an empty string) and ``description`` (a copy of the
        location text).

    Raises:
        ValueError: if the fragment does not contain exactly one location cell per
            title link (see ``_parse_onlyfy_listing``).
        Network/HTTP errors raised by ``urllib.request.urlopen`` propagate unchanged.
    """
    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    # Single request for page 1 with display_length=100.
    # NOTE(review): presumably a board with more than 100 postings would need paging.
    url = (f"{base}/candidate/job/ajax_list"
           f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    req = urllib.request.Request(url, headers={
        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        page = resp.read().decode("utf-8", "replace")
    return _parse_onlyfy_listing(page, base)


def _parse_onlyfy_listing(page, base):
    """Turn an onlyfy ``ajax_list`` HTML fragment into job dicts (pure, no I/O).

    Title links and location cells are collected as two lists in document order and
    paired by index. That pairing is only valid when there is exactly one of each per
    card, so a count mismatch raises ``ValueError`` instead of letting ``zip``
    truncate silently and attach the wrong location to every later posting.
    """
    import html as _html

    titles = re.findall(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', page, re.S)
    locs = re.findall(r'icon-map-marker[^>]*></i>\s*([^<]+)', page)
    if len(titles) != len(locs):
        raise ValueError(
            f"onlyfy list at {base}: found {len(titles)} title link(s) but "
            f"{len(locs)} location cell(s); cannot pair them by position"
        )

    jobs = []
    for (href, raw_title), raw_loc in zip(titles, locs):
        # Titles may contain nested markup; strip tags before decoding entities.
        title = _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
        loc = _html.unescape(raw_loc).strip()
        jobs.append({
            "id": href.rsplit("/", 1)[-1],
            "title": title,
            "location": loc,
            "url": base + href,
            "posted": "",
            # The list view exposes no description text, so the location is reused.
            # NOTE(review): presumably so description-based matching still sees it.
            "description": loc,
        })
    return jobs
# Injected before page scripts run, to mask the most common headless-detection signals.
# Required for Google; harmless for the other sites.
STEALTH_JS = """
@@ -631,6 +665,7 @@ ADAPTERS = {
"smartrecruiters": fetch_smartrecruiters,
"rss": fetch_rss,
"getro": fetch_getro,
"onlyfy": fetch_onlyfy,
"playwright": fetch_playwright,
}
@@ -724,12 +759,13 @@ def write_report(path, results, errors, new_only, include_weak):
lines.append(f"- Negative: {', '.join(r['neg'])}")
lines.append("")
lines.append("\n## Manual check (companies without scrapable APIs)\n")
lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
lines.append("Open each link, scan for new postings since your last quarterly review:\n")
for name, note, url in MANUAL_CHECK:
lines.append(f"- [ ] **{name}** — {note}: <{url}>")
lines.append("")
if MANUAL_CHECK:
lines.append("\n## Manual check (companies without scrapable APIs)\n")
lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
lines.append("Open each link, scan for new postings since your last quarterly review:\n")
for name, note, url in MANUAL_CHECK:
lines.append(f"- [ ] **{name}** — {note}: <{url}>")
lines.append("")
path.write_text("\n".join(lines), encoding="utf-8")
@@ -811,7 +847,7 @@ def main():
# === Adapter coverage (refreshed 2026-05-24) ==================================
# 21 companies automated across 9 adapter types; 1 remains in MANUAL_CHECK.
# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
#
# Automated (COMPANIES above):
# workday nvidia, novartis
@@ -822,21 +858,22 @@ def main():
# smartrecruiters metgroup, vitol, ldc
# rss bis (vacancies.rss — RSS 1.0/RDF)
# getro coinbase_ventures (web3 portfolio network, collection 1625)
# onlyfy bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment)
# playwright google, apple, meta, roche, cisco (headless browser, 3-15s each)
#
# Since the 2026-05-21 probe, six originally-manual sites moved to automated:
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the
# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and
# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
#
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
# itself. Coinbase-the-employer was dropped from MANUAL_CHECK (mass layoffs / hiring
# freeze as of 2026-05; re-add coinbase.com/careers if they reopen).
# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor).
#
# Still manual (MANUAL_CHECK above) — to automate, it needs a real-browser probe:
# Bitcoin Suisse bitcoinsuisse.com/careers JS widget, empty at scrape time, low volume
# (AMINA Bank was dropped — poor Glassdoor rating, not worth tracking.)
# MANUAL_CHECK is now empty — every current target company is automated.
# ==============================================================================