feat(job_scout): add RSS adapter, automate BIS vacancies
- Generic RSS/RDF feed parser (handles RSS 1.0 namespaced items + RSS 2.0).
- BIS (Bank for International Settlements, Basel) promoted from manual-check to an automated source via its vacancies RSS feed — commutable from Bern, salary net of Swiss tax.

Scout now 20 automated + 4 manual-check.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -125,6 +125,12 @@ COMPANIES = [
|
|||||||
("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
|
("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
|
("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
|
("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
|
# International org — BIS (Basel), commutable from Bern, salary net of Swiss tax.
|
||||||
|
# Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled).
|
||||||
|
("bis", "BIS (Basel)","rss", {
|
||||||
|
"url": "https://www.bis.org/doclist/vacancies.rss",
|
||||||
|
"default_location": "Basel, Switzerland",
|
||||||
|
}),
|
||||||
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
||||||
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
||||||
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
||||||
@@ -209,6 +215,8 @@ MANUAL_CHECK = [
|
|||||||
"https://aminagroup.com/careers/#positions"),
|
"https://aminagroup.com/careers/#positions"),
|
||||||
("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
|
("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)",
|
||||||
"https://bitcoinsuisse.com/careers#open-positions"),
|
"https://bitcoinsuisse.com/careers#open-positions"),
|
||||||
|
# International org — qualifies (Basel, commutable from Bern, salary net of Swiss tax),
|
||||||
|
# but uses a JS-heavy Taleo widget that doesn't render requisitions headless. Manual check.
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -360,6 +368,35 @@ def fetch_smartrecruiters(args):
|
|||||||
return jobs
|
return jobs
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_rss(args):
    """Fetch an RSS/RDF feed and return its items as job dicts.

    Handles RSS 1.0 (RDF) feeds, whose <item> elements live in the
    http://purl.org/rss/1.0/ namespace (BIS publishes vacancies this way), and
    falls back to plain, un-namespaced RSS 2.0 <item> elements.

    Args:
        args: adapter config dict. "url" (required) is the feed URL.
            "default_location" (optional, defaults to "") is used as the
            location of every job, because the feed itself carries none.

    Returns:
        A list of dicts with the keys "id", "title", "location", "url",
        "posted" and "description". The description has markup tags replaced
        by spaces and is capped at 1500 characters. "id" is the item link,
        or the title when the item has no link.

    Raises:
        KeyError: if args has no "url".
        urllib.error.URLError: if the feed cannot be fetched.
        xml.etree.ElementTree.ParseError: if the response is not well-formed XML.
    """
    import xml.etree.ElementTree as ET

    req = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT})
    with urllib.request.urlopen(req, timeout=30) as resp:
        # NOTE(review): the stdlib XML parsers are documented as not hardened
        # against maliciously constructed documents; only point this adapter
        # at feeds you trust.
        root = ET.fromstring(resp.read())

    ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"}

    def text_of(item, tag):
        # Try the RSS 1.0 namespaced child first, then the bare RSS 2.0 tag.
        # Compare against None explicitly: an Element's truthiness reflects
        # its child count, not whether it was found.
        el = item.find(f"rss1:{tag}", ns)
        if el is None:
            el = item.find(tag)
        if el is None or not el.text:
            return ""
        return el.text.strip()

    default_location = args.get("default_location", "")
    jobs = []
    for item in root.findall(".//rss1:item", ns) or root.findall(".//item"):
        title = text_of(item, "title")
        link = text_of(item, "link")
        # RSS 1.0 feeds carry the date as Dublin Core <dc:date>; RSS 2.0 feeds
        # use <pubDate>, so check that as well before giving up.
        posted = (
            (item.findtext("dc:date", default="", namespaces=ns) or "").strip()
            or text_of(item, "date")
            or text_of(item, "pubDate")
        )
        jobs.append({
            "id": link or title,
            "title": title,
            "location": default_location,
            "url": link,
            "posted": posted,
            "description": re.sub(r"<[^>]+>", " ", text_of(item, "description"))[:1500],
        })
    return jobs
|
||||||
|
|
||||||
|
|
||||||
def fetch_wp_ajax(args):
|
def fetch_wp_ajax(args):
|
||||||
"""WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
|
"""WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
|
||||||
url = args["url"]
|
url = args["url"]
|
||||||
@@ -545,6 +582,7 @@ ADAPTERS = {
|
|||||||
"pcsx": fetch_pcsx,
|
"pcsx": fetch_pcsx,
|
||||||
"wp_ajax": fetch_wp_ajax,
|
"wp_ajax": fetch_wp_ajax,
|
||||||
"smartrecruiters": fetch_smartrecruiters,
|
"smartrecruiters": fetch_smartrecruiters,
|
||||||
|
"rss": fetch_rss,
|
||||||
"playwright": fetch_playwright,
|
"playwright": fetch_playwright,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user