feat(job_scout): automate Bitcoin Suisse via onlyfy adapter
Bitcoin Suisse's careers page is a JS-rendered Next.js SPA, but the underlying onlyfy.jobs ATS exposes a plain HTML job list at candidate/job/ajax_list (title + location per card). Add an onlyfy adapter that parses it — no headless browser needed. Surfaces the 3 current Zug roles correctly (CH filter + pre-dedup location filtering keep the Swiss posting over its Bratislava/Copenhagen cross-posts). MANUAL_CHECK is now empty (all 22 target companies automated), so the report's manual-check section is suppressed when the list is empty. Coverage notes updated. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+56
-19
@@ -141,6 +141,9 @@ COMPANIES = [
|
|||||||
"job_functions": ["Software Engineering", "IT", "Data Science"],
|
"job_functions": ["Software Engineering", "IT", "Data Science"],
|
||||||
"_title_filter": ENG_TITLE_FILTER,
|
"_title_filter": ENG_TITLE_FILTER,
|
||||||
}),
|
}),
|
||||||
|
# Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto
|
||||||
|
# firm, only a handful of CH roles; let scoring rank them (CH filter does the rest).
|
||||||
|
("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}),
|
||||||
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
||||||
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
||||||
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
||||||
@@ -216,10 +219,8 @@ COMPANIES = [
|
|||||||
|
|
||||||
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
|
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
|
||||||
# These surface as a clickable checklist in the report so they're not forgotten.
|
# These surface as a clickable checklist in the report so they're not forgotten.
|
||||||
MANUAL_CHECK = [
|
# (Empty — all current target companies are automated.)
|
||||||
("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
|
MANUAL_CHECK = []
|
||||||
"https://bitcoinsuisse.com/careers#open-positions"),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def http_get_json(url, headers=None, data=None, method="GET"):
|
def http_get_json(url, headers=None, data=None, method="GET"):
|
||||||
@@ -463,6 +464,39 @@ def fetch_getro(args):
|
|||||||
return jobs
|
return jobs
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_onlyfy_list(page, base):
    """Convert an onlyfy ajax_list HTML fragment into a list of job dicts.

    Args:
        page: HTML fragment text as returned by candidate/job/ajax_list.
        base: Board origin (e.g. "https://<slug>.onlyfy.jobs"), prefixed to each
            relative /job/ID link to build the posting URL.

    Returns:
        One dict per title anchor, in document order, with the keys
        id, title, location, url, posted and description. "posted" is always ""
        (no date is parsed from the list) and "description" mirrors the location.
    """
    import html as _html

    title_hits = list(re.finditer(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', page, re.S))
    loc_hits = list(re.finditer(r'icon-map-marker[^>]*></i>\s*([^<]+)', page))

    if len(title_hits) == len(loc_hits):
        # Normal case: exactly one location marker per card, so pairing the two
        # lists in document order is unambiguous.
        raw_locs = [hit.group(1) for hit in loc_hits]
    else:
        # Counts differ (a card without a location, or a stray marker). A plain zip
        # would shift every later job onto the wrong location and drop trailing
        # jobs, so instead pair each title with the first marker located between it
        # and the next title; a card with no marker gets an empty location.
        # NOTE(review): assumes the location cell follows the title anchor inside a
        # card — confirm against a live response.
        raw_locs = []
        for idx, hit in enumerate(title_hits):
            seg_start = hit.end()
            if idx + 1 < len(title_hits):
                seg_end = title_hits[idx + 1].start()
            else:
                seg_end = len(page)
            raw_locs.append(next(
                (m.group(1) for m in loc_hits if seg_start <= m.start() < seg_end),
                "",
            ))

    jobs = []
    for hit, raw_loc in zip(title_hits, raw_locs):
        href, raw_title = hit.group(1), hit.group(2)
        # The anchor body may contain nested markup; strip tags, then decode entities.
        title = _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
        loc = _html.unescape(raw_loc).strip()
        jobs.append({
            "id": href.rsplit("/", 1)[-1],
            "title": title,
            "location": loc,
            "url": base + href,
            "posted": "",
            "description": loc,
        })
    return jobs


def fetch_onlyfy(args):
    """Fetch postings from an onlyfy.jobs board (XING E-Recruiting / ex-Prinzip).

    Used by Bitcoin Suisse. The candidate/job/ajax_list endpoint returns an HTML
    fragment listing the postings; each card carries a <a href="/job/ID">title</a>
    and a location cell flagged by an icon-map-marker. No JSON API and no headless
    browser needed.

    Args:
        args: Adapter config dict; "slug" is the board's subdomain on onlyfy.jobs.

    Returns:
        A list of job dicts (id, title, location, url, posted, description).

    Raises:
        KeyError: if args has no "slug".
        urllib.error.URLError: network/HTTP failures from urlopen propagate unchanged.
    """
    slug = args["slug"]
    base = f"https://{slug}.onlyfy.jobs"
    # A single request for up to 100 rows; no pagination is followed.
    url = (f"{base}/candidate/job/ajax_list"
           "?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
    req = urllib.request.Request(url, headers={
        # Presumably the endpoint expects an AJAX-style request — TODO confirm.
        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
    })
    with urllib.request.urlopen(req, timeout=30) as resp:
        # Undecodable bytes are replaced rather than failing the whole scrape.
        page = resp.read().decode("utf-8", "replace")
    return _parse_onlyfy_list(page, base)
|
||||||
|
|
||||||
|
|
||||||
# Injected before page scripts run, to mask the most common headless-detection signals.
|
# Injected before page scripts run, to mask the most common headless-detection signals.
|
||||||
# Required for Google; harmless for the other sites.
|
# Required for Google; harmless for the other sites.
|
||||||
STEALTH_JS = """
|
STEALTH_JS = """
|
||||||
@@ -631,6 +665,7 @@ ADAPTERS = {
|
|||||||
"smartrecruiters": fetch_smartrecruiters,
|
"smartrecruiters": fetch_smartrecruiters,
|
||||||
"rss": fetch_rss,
|
"rss": fetch_rss,
|
||||||
"getro": fetch_getro,
|
"getro": fetch_getro,
|
||||||
|
"onlyfy": fetch_onlyfy,
|
||||||
"playwright": fetch_playwright,
|
"playwright": fetch_playwright,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -724,12 +759,13 @@ def write_report(path, results, errors, new_only, include_weak):
|
|||||||
lines.append(f"- Negative: {', '.join(r['neg'])}")
|
lines.append(f"- Negative: {', '.join(r['neg'])}")
|
||||||
lines.append("")
|
lines.append("")
|
||||||
|
|
||||||
lines.append("\n## Manual check (companies without scrapable APIs)\n")
|
if MANUAL_CHECK:
|
||||||
lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
|
lines.append("\n## Manual check (companies without scrapable APIs)\n")
|
||||||
lines.append("Open each link, scan for new postings since your last quarterly review:\n")
|
lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.")
|
||||||
for name, note, url in MANUAL_CHECK:
|
lines.append("Open each link, scan for new postings since your last quarterly review:\n")
|
||||||
lines.append(f"- [ ] **{name}** — {note}: <{url}>")
|
for name, note, url in MANUAL_CHECK:
|
||||||
lines.append("")
|
lines.append(f"- [ ] **{name}** — {note}: <{url}>")
|
||||||
|
lines.append("")
|
||||||
|
|
||||||
path.write_text("\n".join(lines), encoding="utf-8")
|
path.write_text("\n".join(lines), encoding="utf-8")
|
||||||
|
|
||||||
@@ -811,7 +847,7 @@ def main():
|
|||||||
|
|
||||||
|
|
||||||
# === Adapter coverage (refreshed 2026-05-24) ==================================
|
# === Adapter coverage (refreshed 2026-05-24) ==================================
|
||||||
# 21 companies automated across 9 adapter types; 1 remains in MANUAL_CHECK.
|
# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK.
|
||||||
#
|
#
|
||||||
# Automated (COMPANIES above):
|
# Automated (COMPANIES above):
|
||||||
# workday nvidia, novartis
|
# workday nvidia, novartis
|
||||||
@@ -822,21 +858,22 @@ def main():
|
|||||||
# smartrecruiters metgroup, vitol, ldc
|
# smartrecruiters metgroup, vitol, ldc
|
||||||
# rss bis (vacancies.rss — RSS 1.0/RDF)
|
# rss bis (vacancies.rss — RSS 1.0/RDF)
|
||||||
# getro coinbase_ventures (web3 portfolio network, collection 1625)
|
# getro coinbase_ventures (web3 portfolio network, collection 1625)
|
||||||
|
# onlyfy bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment)
|
||||||
# playwright google, apple, meta, roche, cisco (headless browser, 3-15s each)
|
# playwright google, apple, meta, roche, cisco (headless browser, 3-15s each)
|
||||||
#
|
#
|
||||||
# Since the 2026-05-21 probe, seven originally-manual sites moved to automated:
|
# Since the 2026-05-21 probe, seven originally-manual sites moved to automated:
|
||||||
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
|
# Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and
|
||||||
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the
|
# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the
|
||||||
# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and
|
# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse
|
||||||
# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
|
# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying
|
||||||
|
# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were
|
||||||
|
# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis).
|
||||||
#
|
#
|
||||||
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
|
# Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase
|
||||||
# itself. Coinbase-the-employer was dropped from MANUAL_CHECK (mass layoffs / hiring
|
# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05;
|
||||||
# freeze as of 2026-05; re-add coinbase.com/careers if they reopen).
|
# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor).
|
||||||
#
|
#
|
||||||
# Still manual (MANUAL_CHECK above) — to automate, it needs a real-browser probe:
|
# MANUAL_CHECK is now empty — every current target company is automated.
|
||||||
# Bitcoin Suisse bitcoinsuisse.com/careers JS widget, empty at scrape time, low volume
|
|
||||||
# (AMINA Bank was dropped — poor Glassdoor rating, not worth tracking.)
|
|
||||||
# ==============================================================================
|
# ==============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user