feat(job_scout): add RSS adapter, automate BIS vacancies

- Generic RSS/RDF feed parser (handles RSS 1.0 namespaced items + RSS 2.0)
- BIS (Bank for International Settlements, Basel) promoted from manual-check
  to an automated source via its vacancies RSS feed — commutable from Bern,
  salary net of Swiss tax. Scout now 20 automated + 4 manual-check.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-24 09:55:50 +02:00
parent 3b07c4b900
commit 1331a7f1f5
+38
View File
@@ -125,6 +125,12 @@ COMPANIES = [
("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
# International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
# Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
("bis", "BIS (Basel)","rss", {
"url": "https://www.bis.org/doclist/vacancies.rss",
"default_location": "Basel, Switzerland",
}),
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
@@ -209,6 +215,8 @@ MANUAL_CHECK = [
"https://aminagroup.com/careers/#positions"),
("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
"https://bitcoinsuisse.com/careers#open-positions"),
# International org BIS (Basel, commutable from Bern, salary net of Swiss tax) qualifies, but its
# JS-heavy Taleo widget doesn't render requisitions headless. It is covered automatically by the
# "rss" entry in COMPANIES instead, so it needs no manual-check entry here.
]
@@ -360,6 +368,35 @@ def fetch_smartrecruiters(args):
return jobs
def fetch_rss(args):
    """Fetch an RSS/RDF feed and convert its items into job dicts.

    BIS publishes vacancies as RSS 1.0 (RDF), whose <item> elements live in the
    http://purl.org/rss/1.0/ namespace. When no namespaced items are found, the
    parser falls back to plain RSS 2.0 <item> elements.

    Args:
        args: Adapter config dict. "url" (required) is the feed URL.
            "default_location" (optional, defaults to "") is copied onto every
            job as its location, because the location is not read from the feed.

    Returns:
        A list of dicts with the keys "id", "title", "location", "url",
        "posted" and "description". "id" is the item link, or the title when
        the item has no link. "description" is tag-stripped, entity-decoded,
        whitespace-collapsed and capped at 1500 characters. Items that have
        neither a link nor a title are skipped.

    Raises:
        KeyError: if args has no "url".
        urllib.error.URLError: if the feed cannot be fetched.
        xml.etree.ElementTree.ParseError: if the response is not well-formed XML.
    """
    import html
    import xml.etree.ElementTree as ET

    ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"}

    def text_of(item, tag):
        # Prefer the RSS 1.0 namespaced child, then the un-namespaced (RSS 2.0) one.
        for path in (f"rss1:{tag}", tag):
            el = item.find(path, ns)
            value = (el.text or "").strip() if el is not None else ""
            if value:
                return value
        return ""

    req = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        root = ET.fromstring(resp.read())

    items = root.findall(".//rss1:item", ns) or root.findall(".//item")
    location = args.get("default_location", "")

    jobs = []
    for item in items:
        link = text_of(item, "link")
        title = text_of(item, "title")
        if not (link or title):
            # Nothing to identify or display the posting by; skip it.
            continue

        # RSS 1.0 feeds carry Dublin Core <dc:date>; RSS 2.0 feeds use <pubDate>.
        posted = (
            (item.findtext("dc:date", default="", namespaces=ns) or "").strip()
            or text_of(item, "date")
            or text_of(item, "pubDate")
        )

        # Strip tags first, then decode entities (e.g. &nbsp;, &amp;) so decoded
        # "<" characters are never mistaken for markup; collapse whitespace
        # before truncating so the 1500-character budget is spent on real text.
        description = re.sub(r"<[^>]+>", " ", text_of(item, "description"))
        description = " ".join(html.unescape(description).split())[:1500]

        jobs.append({
            "id": link or title,
            "title": title,
            "location": location,
            "url": link,
            "posted": posted,
            "description": description,
        })
    return jobs
def fetch_wp_ajax(args):
"""WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
url = args["url"]
@@ -545,6 +582,7 @@ ADAPTERS = {
"pcsx": fetch_pcsx,
"wp_ajax": fetch_wp_ajax,
"smartrecruiters": fetch_smartrecruiters,
"rss": fetch_rss,
"playwright": fetch_playwright,
}