feat(job_scout): add RSS adapter, automate BIS vacancies
- Generic RSS/RDF feed parser (handles RSS 1.0 namespaced items + RSS 2.0)
- BIS (Bank for International Settlements, Basel) promoted from manual-check to an automated source via its vacancies RSS feed — commutable from Bern, salary net of Swiss tax.

Scout now 20 automated + 4 manual-check.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -125,6 +125,12 @@ COMPANIES = [
|
||||
("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
|
||||
("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
|
||||
("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
|
||||
# International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
|
||||
# Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
|
||||
("bis", "BIS (Basel)","rss", {
|
||||
"url": "https://www.bis.org/doclist/vacancies.rss",
|
||||
"default_location": "Basel, Switzerland",
|
||||
}),
|
||||
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
||||
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
||||
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
||||
@@ -209,6 +215,8 @@ MANUAL_CHECK = [
|
||||
"https://aminagroup.com/careers/#positions"),
|
||||
("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
|
||||
"https://bitcoinsuisse.com/careers#open-positions"),
|
||||
# International org — qualifies (Basel, commutable from Bern, salary net of Swiss tax),
|
||||
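# but uses a JS-heavy Taleo widget that doesn't render requisitions headless. Manual check.
# NOTE(review): no entry follows this comment. It presumably describes BIS, which is now an
# automated "rss" source in COMPANIES — confirm, then drop the comment or restore the entry.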
|
||||
]
|
||||
|
||||
|
||||
@@ -360,6 +368,35 @@ def fetch_smartrecruiters(args):
|
||||
return jobs
|
||||
|
||||
|
||||
def fetch_rss(args):
    """Fetch an RSS/RDF feed and return one job dict per <item>.

    Generic feed parser. BIS publishes vacancies as RSS 1.0 (RDF), whose <item>
    elements live in the http://purl.org/rss/1.0/ namespace; plain (un-namespaced)
    RSS 2.0 <item> elements are used as a fallback when no namespaced item exists.

    Args:
        args: adapter config dict. "url" (required) is the feed URL.
            "default_location" (optional, defaults to "") is copied into every
            job, because the location is not read from the feed.

    Returns:
        A list of dicts with keys "id", "title", "location", "url", "posted" and
        "description". The description has tags stripped, HTML entities decoded,
        whitespace collapsed, and is truncated to 1500 characters.

    Raises:
        KeyError: if args has no "url".
        urllib.error.URLError: if the feed cannot be fetched.
        xml.etree.ElementTree.ParseError: if the response is not well-formed XML.
    """
    import html
    import xml.etree.ElementTree as ET

    req = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        root = ET.fromstring(resp.read())

    ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"}

    def field(item, tag):
        """Return the stripped text of child `tag`: RSS 1.0 namespace first, then bare."""
        el = item.find(f"rss1:{tag}", ns)
        if el is None:
            el = item.find(tag)
        return (el.text or "").strip() if el is not None else ""

    # Prefer RSS 1.0 (namespaced) items; an empty result falls through to RSS 2.0.
    items = root.findall(".//rss1:item", ns) or root.findall(".//item")
    default_location = args.get("default_location", "")

    jobs = []
    for item in items:
        title = field(item, "title")
        link = field(item, "link")

        # RSS 1.0 feeds carry the date as Dublin Core <dc:date>; RSS 2.0 uses
        # <pubDate>. The bare <date> lookup is kept for backward compatibility.
        posted = (
            (item.findtext("dc:date", default="", namespaces=ns) or "").strip()
            or field(item, "date")
            or field(item, "pubDate")
        )

        # Descriptions are often escaped HTML: drop the tags first, then decode
        # entities (so a literal "&lt;" in the text is not mistaken for a tag),
        # and collapse whitespace so the 1500-char budget is spent on content.
        description = re.sub(r"<[^>]+>", " ", field(item, "description"))
        description = " ".join(html.unescape(description).split())

        jobs.append({
            "id": link or title,  # the link is the stable identifier when present
            "title": title,
            "location": default_location,
            "url": link,
            "posted": posted,
            "description": description[:1500],
        })
    return jobs
|
||||
|
||||
|
||||
def fetch_wp_ajax(args):
|
||||
"""WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
|
||||
url = args["url"]
|
||||
@@ -545,6 +582,7 @@ ADAPTERS = {
|
||||
"pcsx": fetch_pcsx,
|
||||
"wp_ajax": fetch_wp_ajax,
|
||||
"smartrecruiters": fetch_smartrecruiters,
|
||||
"rss": fetch_rss,
|
||||
"playwright": fetch_playwright,
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user