feat(job_scout): automate Bitcoin Suisse via onlyfy adapter
Bitcoin Suisse's careers page is a JS-rendered Next.js SPA, but the underlying onlyfy.jobs ATS exposes a plain HTML job list at candidate/job/ajax_list (title + location per card). Add an onlyfy adapter that parses it — no headless browser needed. Surfaces the 3 current Zug roles correctly (CH filter + pre-dedup location filtering keep the Swiss posting over its Bratislava/Copenhagen cross-posts). MANUAL_CHECK is now empty (all 22 target companies automated), so the report's manual-check section is suppressed when the list is empty. Coverage notes updated. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+56
-19
@@ -141,6 +141,9 @@ COMPANIES = [
|
||||
"job_functions": ["Software Engineering", "IT", "Data Science"],
|
||||
"_title_filter": ENG_TITLE_FILTER,
|
||||
}),
|
||||
# Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
|
||||
# firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
|
||||
("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
|
||||
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
||||
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
||||
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
||||
@@ -216,10 +219,8 @@ COMPANIES = [
|
||||
|
||||
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
|
||||
# These surface as a clickable checklist in the report so they're not forgotten.
|
||||
MANUAL_CHECK = [
|
||||
("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
|
||||
"https://bitcoinsuisse.com/careers#open-positions"),
|
||||
]
|
||||
# (Empty — all current target companies are automated.)
|
||||
MANUAL_CHECK = []
|
||||
|
||||
|
||||
def http_get_json(url, headers=None, data=None, method="GET"):
|
||||
@@ -463,6 +464,39 @@ def fetch_getro(args):
|
||||
return jobs
|
||||
|
||||
|
||||
def fetch_onlyfy(args):
    """Fetch postings from an onlyfy.jobs board (XING E-Recruiting / ex-Prinzip).

    Used for Bitcoin Suisse. The board's ``candidate/job/ajax_list`` endpoint answers
    with an HTML fragment rather than JSON, so the postings are pulled out with two
    regexes: one for the ``<a href="/job/ID">title</a>`` anchors and one for the text
    that follows each ``icon-map-marker`` icon (the location). Both result lists are in
    document order and are paired positionally; pairing stops at the shorter list.

    Args:
        args: adapter config dict; only ``args["slug"]`` is read, and it becomes the
            subdomain of ``onlyfy.jobs``.

    Returns:
        A list of job dicts with keys ``id``, ``title``, ``location``, ``url``,
        ``posted`` (always empty) and ``description`` (a copy of the location — the
        list fragment is not parsed for any other text).
    """
    import html as _html

    board = args["slug"]
    base = f"https://{board}.onlyfy.jobs"
    list_url = (f"{base}/candidate/job/ajax_list"
                f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    request_headers = {
        "User-Agent": USER_AGENT,
        # Sent so the request looks like the page's own XHR call.
        "X-Requested-With": "XMLHttpRequest",
    }
    request = urllib.request.Request(list_url, headers=request_headers)
    with urllib.request.urlopen(request, timeout=30) as resp:
        fragment = resp.read().decode("utf-8", "replace")

    # (href, inner-html) per job anchor; re.S lets a title span several lines.
    anchors = re.findall(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', fragment, re.S)
    # Plain text directly after each map-marker icon.
    places = re.findall(r'icon-map-marker[^>]*></i>\s*([^<]+)', fragment)

    def _to_job(anchor, raw_place):
        """Build one job record from a matched anchor and its location text."""
        href, inner = anchor
        # Drop any nested markup inside the anchor before decoding entities.
        heading = _html.unescape(re.sub(r"<[^>]+>", "", inner)).strip()
        place = _html.unescape(raw_place).strip()
        return {
            "id": href.rpartition("/")[2],
            "title": heading,
            "location": place,
            "url": base + href,
            "posted": "",
            "description": place,
        }

    return [_to_job(anchor, raw_place) for anchor, raw_place in zip(anchors, places)]
|
||||
|
||||
|
||||
# Injected before page scripts run, to mask the most common headless-detection signals.
|
||||
# Required for Google; harmless for the other sites.
|
||||
STEALTH_JS = """
|
||||
@@ -631,6 +665,7 @@ ADAPTERS = {
|
||||
"smartrecruiters": fetch_smartrecruiters,
|
||||
"rss": fetch_rss,
|
||||
"getro": fetch_getro,
|
||||
"onlyfy": fetch_onlyfy,
|
||||
"playwright": fetch_playwright,
|
||||
}
|
||||
|
||||
@@ -724,12 +759,13 @@ def write_report(path, results, errors, new_only, include_weak):
|
||||
lines.append(f"- Negative: {', '.join(r['neg'])}")
|
||||
lines.append("")
|
||||
|
||||
lines.append("\n## Manual check (companies without scrapable APIs)\n")
|
||||
lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
|
||||
lines.append("Open each link, scan for new postings since your last quarterly review:\n")
|
||||
for name, note, url in MANUAL_CHECK:
|
||||
lines.append(f"- [ ] **{name}** — {note}: <{url}>")
|
||||
lines.append("")
|
||||
if MANUAL_CHECK:
|
||||
lines.append("\n## Manual check (companies without scrapable APIs)\n")
|
||||
lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
|
||||
lines.append("Open each link, scan for new postings since your last quarterly review:\n")
|
||||
for name, note, url in MANUAL_CHECK:
|
||||
lines.append(f"- [ ] **{name}** — {note}: <{url}>")
|
||||
lines.append("")
|
||||
|
||||
path.write_text("\n".join(lines), encoding="utf-8")
|
||||
|
||||
@@ -811,7 +847,7 @@ def main():
|
||||
|
||||
|
||||
# === Adapter coverage (refreshed 2026-05-24) ==================================
|
||||
# 21 companies automated across 9 adapter types; 1 remains in MANUAL_CHECK.
|
||||
# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
|
||||
#
|
||||
# Automated (COMPANIES above):
|
||||
# workday nvidia, novartis
|
||||
@@ -822,21 +858,22 @@ def main():
|
||||
# smartrecruiters metgroup, vitol, ldc
|
||||
# rss bis (vacancies.rss — RSS 1.0/RDF)
|
||||
# getro coinbase_ventures (web3 portfolio network, collection 1625)
|
||||
# onlyfy bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment)
|
||||
# playwright google, apple, meta, roche, cisco (headless browser, 3-15s each)
|
||||
#
|
||||
# Since the 2026-05-21 probe, six originally-manual sites moved to automated:
|
||||
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
|
||||
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the
|
||||
# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and
|
||||
# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
|
||||
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
|
||||
# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
|
||||
# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
|
||||
# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
|
||||
# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
|
||||
#
|
||||
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
|
||||
# itself. Coinbase-the-employer was dropped from MANUAL_CHECK (mass layoffs / hiring
|
||||
# freeze as of 2026-05; re-add coinbase.com/careers if they reopen).
|
||||
# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
|
||||
# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor).
|
||||
#
|
||||
# Still manual (MANUAL_CHECK above) — to automate, it needs a real-browser probe:
|
||||
# Bitcoin Suisse bitcoinsuisse.com/careers JS widget, empty at scrape time, low volume
|
||||
# (AMINA Bank was dropped — poor Glassdoor rating, not worth tracking.)
|
||||
# MANUAL_CHECK is now empty — every current target company is automated.
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user