From b44360f99a0c567088b587d19f9d3c78c21c4927 Mon Sep 17 00:00:00 2001 From: Dennis Thiessen Date: Sun, 24 May 2026 19:06:31 +0200 Subject: [PATCH] feat(job_scout): automate Bitcoin Suisse via onlyfy adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bitcoin Suisse's careers page is a JS-rendered Next.js SPA, but the underlying onlyfy.jobs ATS exposes a plain HTML job list at candidate/job/ajax_list (title + location per card). Add an onlyfy adapter that parses it — no headless browser needed. Surfaces the 3 current Zug roles correctly (CH filter + pre-dedup location filtering keep the Swiss posting over its Bratislava/Copenhagen cross-posts). MANUAL_CHECK is now empty (all 22 target companies automated), so the report's manual-check section is suppressed when the list is empty. Coverage notes updated. Co-Authored-By: Claude Opus 4.7 --- job_scout/scout.py | 75 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 19 deletions(-) diff --git a/job_scout/scout.py b/job_scout/scout.py index 5c93c04..d3f4791 100644 --- a/job_scout/scout.py +++ b/job_scout/scout.py @@ -141,6 +141,9 @@ COMPANIES = [ "job_functions": ["Software Engineering", "IT", "Data Science"], "_title_filter": ENG_TITLE_FILTER, }), + # Bitcoin Suisse (Zug) uses the onlyfy.jobs ATS. No title filter — small crypto + # firm, only a handful of CH roles; let scoring rank them (CH filter does the rest). + ("bitcoin_suisse", "Bitcoin Suisse", "onlyfy", {"slug": "bitcoin-suisse"}), # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites. # Google actively bot-detects; the STEALTH_JS init script (applied to every context) # is what makes its job list render. Cards are
  • with a "Learn more about "
@@ -216,10 +219,8 @@ COMPANIES = [
 
 # Companies where adapter probing did not yield a reliable scrape. Reasons noted.
 # These surface as a clickable checklist in the report so they're not forgotten.
-MANUAL_CHECK = [
-    ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
-     "https://bitcoinsuisse.com/careers#open-positions"),
-]
+# (Empty — all current target companies are automated.)
+MANUAL_CHECK = []
 
 
 def http_get_json(url, headers=None, data=None, method="GET"):
@@ -463,6 +464,62 @@ def fetch_getro(args):
     return jobs
 
 
+def fetch_onlyfy(args):
+    """Fetch postings from an onlyfy.jobs board (XING E-Recruiting / ex-Prinzip).
+
+    Used by Bitcoin Suisse. The candidate/job/ajax_list endpoint returns an HTML
+    fragment listing every posting; each card carries a <a href="/job/ID">title</a>
+    and a location cell flagged by an icon-map-marker. No JSON API and no headless
+    browser needed.
+
+    args: adapter config dict; args["slug"] is the board's onlyfy.jobs subdomain.
+    Returns a list of job dicts (id, title, location, url, posted, description);
+    posted is always "" and description repeats the location. urllib errors
+    propagate to the caller.
+    """
+    import html as _html
+    slug = args["slug"]
+    base = f"https://{slug}.onlyfy.jobs"
+    url = (f"{base}/candidate/job/ajax_list"
+           f"?display_length=100&page=1&sort=date&sort_dir=DESC&search=")
+    req = urllib.request.Request(url, headers={
+        "User-Agent": USER_AGENT, "X-Requested-With": "XMLHttpRequest",
+    })
+    with urllib.request.urlopen(req, timeout=30) as resp:
+        page = resp.read().decode("utf-8", "replace")
+    loc_re = r'icon-map-marker[^>]*></i>\s*([^<]+)'
+    cards = list(re.finditer(r'<a href="(/job/[a-z0-9]+)">(.*?)</a>', page, re.S))
+    locs = re.findall(loc_re, page)
+    # Titles and locations normally come one of each per card, so equal counts can be
+    # paired by document order. If the counts differ (a card without a location, or a
+    # stray marker icon), pairing by position would attach wrong locations to every
+    # later job and drop the surplus, feeding wrong data to the location (CH) filter.
+    # In that case search for the marker between a title link and the next title link
+    # instead (assumes the location cell follows the title inside a card) and leave
+    # the location empty when the card has none.
+    paired = len(locs) == len(cards)
+    jobs = []
+    for i, card in enumerate(cards):
+        href, raw_title = card.groups()
+        if paired:
+            raw_loc = locs[i]
+        else:
+            stop = cards[i + 1].start() if i + 1 < len(cards) else len(page)
+            hit = re.search(loc_re, page[card.end():stop])
+            raw_loc = hit.group(1) if hit else ""
+        title = _html.unescape(re.sub(r"<[^>]+>", "", raw_title)).strip()
+        loc = _html.unescape(raw_loc).strip()
+        jobs.append({
+            "id": href.rsplit("/", 1)[-1],
+            "title": title,
+            "location": loc,
+            "url": base + href,
+            "posted": "",
+            "description": loc,
+        })
+    return jobs
+
+
 # Injected before page scripts run, to mask the most common headless-detection signals.
 # Required for Google; harmless for the other sites.
STEALTH_JS = """ @@ -631,6 +665,7 @@ ADAPTERS = { "smartrecruiters": fetch_smartrecruiters, "rss": fetch_rss, "getro": fetch_getro, + "onlyfy": fetch_onlyfy, "playwright": fetch_playwright, } @@ -724,12 +759,13 @@ def write_report(path, results, errors, new_only, include_weak): lines.append(f"- Negative: {', '.join(r['neg'])}") lines.append("") - lines.append("\n## Manual check (companies without scrapable APIs)\n") - lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.") - lines.append("Open each link, scan for new postings since your last quarterly review:\n") - for name, note, url in MANUAL_CHECK: - lines.append(f"- [ ] **{name}** — {note}: <{url}>") - lines.append("") + if MANUAL_CHECK: + lines.append("\n## Manual check (companies without scrapable APIs)\n") + lines.append("These use Cloudflare-protected sites, custom GraphQL APIs, or JS-rendered SPAs.") + lines.append("Open each link, scan for new postings since your last quarterly review:\n") + for name, note, url in MANUAL_CHECK: + lines.append(f"- [ ] **{name}** — {note}: <{url}>") + lines.append("") path.write_text("\n".join(lines), encoding="utf-8") @@ -811,7 +847,7 @@ def main(): # === Adapter coverage (refreshed 2026-05-24) ================================== -# 21 companies automated across 9 adapter types; 1 remains in MANUAL_CHECK. +# 22 companies automated across 10 adapter types; 0 remain in MANUAL_CHECK. 
# # Automated (COMPANIES above): # workday nvidia, novartis @@ -822,21 +858,22 @@ def main(): # smartrecruiters metgroup, vitol, ldc # rss bis (vacancies.rss — RSS 1.0/RDF) # getro coinbase_ventures (web3 portfolio network, collection 1625) +# onlyfy bitcoin_suisse (onlyfy.jobs ajax_list HTML fragment) # playwright google, apple, meta, roche, cisco (headless browser, 3-15s each) # -# Since the 2026-05-21 probe, six originally-manual sites moved to automated: +# Since the 2026-05-21 probe, seven originally-manual sites moved to automated: # Google/Apple/Meta/Roche/Cisco via the playwright adapter, Microsoft via pcsx, and -# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, and the -# Coinbase Ventures web3 portfolio network via the new getro adapter. IBM Research and -# Sonova were dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis). +# Sygnum via its WordPress AJAX endpoint. BIS was added via the new rss adapter, the +# Coinbase Ventures web3 portfolio network via the new getro adapter, and Bitcoin Suisse +# via the new onlyfy adapter (its bitcoinsuisse.com page is a JS SPA, but the underlying +# onlyfy.jobs ATS serves a plain HTML list with locations). IBM Research and Sonova were +# dropped from the target list (no API / low fit; Sonova is MedTech, off-thesis). # # Note: the Coinbase Ventures board (getro) covers PORTFOLIO companies, not Coinbase -# itself. Coinbase-the-employer was dropped from MANUAL_CHECK (mass layoffs / hiring -# freeze as of 2026-05; re-add coinbase.com/careers if they reopen). +# itself. Coinbase-the-employer was dropped (mass layoffs / hiring freeze as of 2026-05; +# re-add coinbase.com/careers if they reopen). AMINA Bank was dropped (poor Glassdoor). # -# Still manual (MANUAL_CHECK above) — to automate, it needs a real-browser probe: -# Bitcoin Suisse bitcoinsuisse.com/careers JS widget, empty at scrape time, low volume -# (AMINA Bank was dropped — poor Glassdoor rating, not worth tracking.) +# MANUAL_CHECK is now empty — every current target company is automated. 
# ==============================================================================