diff --git a/job_scout/scout.py b/job_scout/scout.py index 9d0fd21..3b05323 100644 --- a/job_scout/scout.py +++ b/job_scout/scout.py @@ -125,6 +125,12 @@ COMPANIES = [ ("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}), ("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}), ("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}), + # International org — BIS (Basel), commutable from Bern, salary net of Swiss tax. + # Low-volume RSS feed; no title filter (Innovation Hub roles can be oddly titled). + ("bis", "BIS (Basel)","rss", { + "url": "https://www.bis.org/doclist/vacancies.rss", + "default_location": "Basel, Switzerland", + }), # Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites. # Google actively bot-detects; the STEALTH_JS init script (applied to every context) # is what makes its job list render. Cards are
  • with a "Learn more about " @@ -209,6 +215,8 @@ MANUAL_CHECK = [ "https://aminagroup.com/careers/#positions"), ("Bitcoin Suisse", "jobs under /careers#open-positions load via JS widget; section empty at scrape time (likely no/few openings)", "https://bitcoinsuisse.com/careers#open-positions"), + # International org — qualifies (Basel, commutable from Bern, salary net of Swiss tax), + # but uses a JS-heavy Taleo widget that doesn't render requisitions headless. Manual check. ] @@ -360,6 +368,35 @@ def fetch_smartrecruiters(args): return jobs +def fetch_rss(args): + """Generic RSS/RDF feed parser. BIS publishes vacancies as RSS 1.0 (RDF), whose + <item> elements live in the http://purl.org/rss/1.0/ namespace. Falls back to plain + RSS 2.0 <item> elements. Location isn't in the feed, so default_location is required.""" + import xml.etree.ElementTree as ET + req = urllib.request.Request(args["url"], headers={"User-Agent": USER_AGENT}) + with urllib.request.urlopen(req, timeout=30) as resp: + root = ET.fromstring(resp.read()) + ns = {"rss1": "http://purl.org/rss/1.0/", "dc": "http://purl.org/dc/elements/1.1/"} + items = root.findall(".//rss1:item", ns) or root.findall(".//item") + jobs = [] + for it in items: + def field(tag, namespaced=True): + el = it.find(f"rss1:{tag}", ns) if namespaced else it.find(tag) + if el is None and namespaced: + el = it.find(tag) + return (el.text or "").strip() if el is not None and el.text else "" + link = field("link") + jobs.append({ + "id": link or field("title"), + "title": field("title"), + "location": args.get("default_location", ""), + "url": link, + "posted": (it.findtext("dc:date", default="", namespaces=ns) or field("date")), + "description": re.sub(r"<[^>]+>", " ", field("description"))[:1500], + }) + return jobs + + def fetch_wp_ajax(args): """WordPress admin-ajax style endpoint. Sygnum uses this pattern.""" url = args["url"] @@ -545,6 +582,7 @@ ADAPTERS = { "pcsx": fetch_pcsx, "wp_ajax": fetch_wp_ajax, "smartrecruiters": fetch_smartrecruiters, + "rss": fetch_rss, "playwright": fetch_playwright, }