feat(job_scout): expand to 19 companies + new adapters, filters, dedup
- New SmartRecruiters adapter (EU energy/commodity firms)
- Add data-infra US tech (Confluent/GitLab/ClickHouse/Grafana) and commodity/energy traders (MET Group/Vitol/Louis Dreyfus)
- Headless stealth (navigator.webdriver mask + chrome fingerprint) — unblocks Google; also enabled Meta and Cisco scraping
- Tight title prefilter + title-only scoring + cross-region dedup so high-volume all-remote boards don't flood the report
- Remove Canonical (below-market pay, poor culture) and IBM Research (research-scale pay below bar; weak data-eng fit) per reputation review

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -82,7 +82,8 @@
|
|||||||
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=google)",
|
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=google)",
|
||||||
"Bash(job_scout/.venv/Scripts/python.exe -c ' *)",
|
"Bash(job_scout/.venv/Scripts/python.exe -c ' *)",
|
||||||
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=meta)",
|
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=meta)",
|
||||||
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)"
|
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=cisco --include-weak)",
|
||||||
|
"Bash(job_scout/.venv/Scripts/python.exe job_scout/scout.py --only=confluent)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -136,6 +136,7 @@ _Update this section when starting/finishing a JD._
|
|||||||
| Infineon AI Engineer | Critique DONE Pass 2 (78.5/100) | Submit or Tier 2 polish |
|
| Infineon AI Engineer | Critique DONE Pass 2 (78.5/100) | Submit or Tier 2 polish |
|
||||||
| Apple Data Engineer (ISE, Zurich) | Critique DONE Pass 1 (78.5/100) | /edit-resume for Tier 1 fixes or submit |
|
| Apple Data Engineer (ISE, Zurich) | Critique DONE Pass 1 (78.5/100) | /edit-resume for Tier 1 fixes or submit |
|
||||||
| Kraken AI Infrastructure | Critique DONE Pass 2 (84.5/100) — converged near max | Submit, or apply Tier 2 polish (agent orchestration / guardrails in skills) |
|
| Kraken AI Infrastructure | Critique DONE Pass 2 (84.5/100) — converged near max | Submit, or apply Tier 2 polish (agent orchestration / guardrails in skills) |
|
||||||
|
| Google FDE GenAI (Zurich) | PAUSED — GenAI evidence gap too large; redirecting to data-eng/MLOps roles | Likely abandon |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
+80
-18
@@ -37,7 +37,7 @@ CH_LOCATION_KEYWORDS = [
|
|||||||
"lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
|
"lausanne", "zug", "rüschlikon", "stäfa", "schweiz", "suisse",
|
||||||
]
|
]
|
||||||
|
|
||||||
REMOTE_KEYWORDS = ["remote"]
|
REMOTE_KEYWORDS = ["remote", "home based", "home-based", "anywhere", "distributed"]
|
||||||
|
|
||||||
US_ONLY_PATTERNS = [
|
US_ONLY_PATTERNS = [
|
||||||
"remote - us", "remote, us", "remote-us", "us remote", "us-remote",
|
"remote - us", "remote, us", "remote-us", "us remote", "us-remote",
|
||||||
@@ -49,7 +49,7 @@ EU_HINT_KEYWORDS = [
|
|||||||
"germany", "france", "spain", "portugal", "ireland", "netherlands",
|
"germany", "france", "spain", "portugal", "ireland", "netherlands",
|
||||||
"sweden", "norway", "finland", "denmark", "poland", "czech",
|
"sweden", "norway", "finland", "denmark", "poland", "czech",
|
||||||
"romania", "italy", "austria", "belgium", "uk", "united kingdom",
|
"romania", "italy", "austria", "belgium", "uk", "united kingdom",
|
||||||
"europe", "emea", "global",
|
"europe", "emea", "global", "worldwide",
|
||||||
] + CH_LOCATION_KEYWORDS
|
] + CH_LOCATION_KEYWORDS
|
||||||
|
|
||||||
POSITIVE_KEYWORDS = {
|
POSITIVE_KEYWORDS = {
|
||||||
@@ -77,6 +77,19 @@ NEGATIVE_KEYWORDS = {
|
|||||||
"intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
|
"intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
# post mostly non-tech roles). Only keep titles containing one of these specific role
# phrases — kept tight so "Sales Engineer"/"Staff Accountant"/"Data Privacy Counsel"
# don't leak in. Matched as case-insensitive substrings against the title only.
# NOTE(review): because matching is by plain substring, short or generic entries
# ("sre", "kafka", "streaming") also match inside longer words or non-engineering
# titles — confirm this is acceptable or switch to word-boundary matching.
ENG_TITLE_FILTER = [
    "data engineer", "data engineering", "data platform", "platform engineer",
    "data infrastructure", "data architect", "analytics engineer",
    "mlops", "ml engineer", "ml platform", "machine learning engineer",
    "site reliability", "sre", "backend engineer", "back-end engineer",
    "devops engineer", "cloud engineer", "software engineer", "infrastructure engineer",
    "kafka", "streaming", "big data", "quantitative developer", "quant developer",
]
|
||||||
|
|
||||||
# id, display, adapter, adapter_args
|
# id, display, adapter, adapter_args
|
||||||
COMPANIES = [
|
COMPANIES = [
|
||||||
("nvidia", "NVIDIA", "workday", {
|
("nvidia", "NVIDIA", "workday", {
|
||||||
@@ -103,6 +116,15 @@ COMPANIES = [
|
|||||||
("sygnum", "Sygnum", "wp_ajax", {
|
("sygnum", "Sygnum", "wp_ajax", {
|
||||||
"url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
|
"url": "https://www.sygnum.com/wp-admin/admin-ajax.php?action=fetch_careers&_wpnonce=c036d1627c",
|
||||||
}),
|
}),
|
||||||
|
# --- Data-infra US tech (matches the candidate's exact stack; mostly all-remote — title-filtered to eng/data) ---
|
||||||
|
("confluent", "Confluent", "ashby", {"slug": "confluent", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
|
("gitlab", "GitLab", "greenhouse", {"board": "gitlab", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
|
("clickhouse","ClickHouse","greenhouse", {"board": "clickhouse", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
|
("grafana", "Grafana Labs","greenhouse",{"board": "grafanalabs", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
|
# --- Energy / commodity trading (SmartRecruiters; title-filtered to tech roles) ---
|
||||||
|
("metgroup", "MET Group", "smartrecruiters", {"company": "METGroup", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
|
("vitol", "Vitol", "smartrecruiters", {"company": "Vitol", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
|
("ldc", "Louis Dreyfus","smartrecruiters",{"company": "LouisDreyfusCompany", "_title_filter": ENG_TITLE_FILTER}),
|
||||||
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
# Headless-browser scrapers — slower (3-15s per company) but covers JS-rendered sites.
|
||||||
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
# Google actively bot-detects; the STEALTH_JS init script (applied to every context)
|
||||||
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
# is what makes its job list render. Cards are <li> with a "Learn more about <title>"
|
||||||
@@ -174,19 +196,6 @@ COMPANIES = [
|
|||||||
"scroll_count": 5,
|
"scroll_count": 5,
|
||||||
"use_inner_text_as_blob": True,
|
"use_inner_text_as_blob": True,
|
||||||
}),
|
}),
|
||||||
("ibm", "IBM Research", "playwright", {
|
|
||||||
# IBM Research Zurich careers page is mostly a static intro with few openings.
|
|
||||||
# Use IBM's main careers search filtered to Switzerland instead.
|
|
||||||
"url": "https://www.ibm.com/careers/search?q=&field_keyword_05[0]=Switzerland",
|
|
||||||
"wait_for": "a[href*='/careers/'], a[href*='ibm.com/employment']",
|
|
||||||
"card": "li:has(a[href*='/careers/']), a[href*='/careers/']:has(h3)",
|
|
||||||
"title_sel": "h3, h4",
|
|
||||||
"link_sel": "a[href*='/careers/']",
|
|
||||||
"link_attr": "href",
|
|
||||||
"url_prefix": "https://www.ibm.com",
|
|
||||||
"default_location": "Switzerland",
|
|
||||||
"scroll_count": 4,
|
|
||||||
}),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
|
# Companies where adapter probing did not yield a reliable scrape. Reasons noted.
|
||||||
@@ -318,6 +327,39 @@ def fetch_pcsx(args):
|
|||||||
return jobs
|
return jobs
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_smartrecruiters(args):
    """Collect job postings from the SmartRecruiters public postings API.

    Used by many EU energy/commodity firms. ``args["company"]`` is the
    SmartRecruiters company identifier; it is interpolated into both the API
    endpoint and each posting's public URL.

    Pages are requested 100 at a time. Paging stops on an empty page, once
    the running offset reaches the reported ``totalFound``, or once 300
    postings have been read.

    Returns a list of dicts with the keys ``id``, ``title``, ``location``,
    ``url``, ``posted`` and ``description``. ``description`` holds the
    department and function labels joined by a space.
    """
    company = args["company"]
    endpoint = f"https://api.smartrecruiters.com/v1/companies/{company}/postings"

    def label_of(posting, key):
        # Only dict-shaped fields carry a label; anything else counts as blank.
        node = posting.get(key)
        return node.get("label", "") if isinstance(node, dict) else ""

    jobs = []
    offset = 0
    while True:
        page = http_get_json(f"{endpoint}?limit=100&offset={offset}")
        postings = page.get("content", []) or []
        if not postings:
            break

        for posting in postings:
            posting_id = posting.get("id")
            where = posting.get("location") or {}
            # Prefer the full location string, then fall back to the bare city.
            tags = [where.get("fullLocation") or where.get("city") or ""]
            tags += [
                tag
                for flag, tag in (("remote", "Remote"), ("hybrid", "Hybrid"))
                if where.get(flag)
            ]
            labels = [label_of(posting, "department"), label_of(posting, "function")]
            jobs.append({
                "id": str(posting_id),
                "title": posting.get("name", ""),
                "location": " ".join(tag for tag in tags if tag),
                "url": f"https://jobs.smartrecruiters.com/{company}/{posting_id}",
                "posted": posting.get("releasedDate", ""),
                "description": " ".join(label for label in labels if label),
            })

        offset += len(postings)
        # Stop at the reported total, or at the hard cap of 300 postings.
        if offset >= page.get("totalFound", 0) or offset >= 300:
            break
    return jobs
|
||||||
|
|
||||||
|
|
||||||
def fetch_wp_ajax(args):
|
def fetch_wp_ajax(args):
|
||||||
"""WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
|
"""WordPress admin-ajax style endpoint. Sygnum uses this pattern."""
|
||||||
url = args["url"]
|
url = args["url"]
|
||||||
@@ -502,6 +544,7 @@ ADAPTERS = {
|
|||||||
"greenhouse": fetch_greenhouse,
|
"greenhouse": fetch_greenhouse,
|
||||||
"pcsx": fetch_pcsx,
|
"pcsx": fetch_pcsx,
|
||||||
"wp_ajax": fetch_wp_ajax,
|
"wp_ajax": fetch_wp_ajax,
|
||||||
|
"smartrecruiters": fetch_smartrecruiters,
|
||||||
"playwright": fetch_playwright,
|
"playwright": fetch_playwright,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -520,8 +563,14 @@ def location_matches(loc_text):
|
|||||||
return in_ch, is_remote
|
return in_ch, is_remote
|
||||||
|
|
||||||
|
|
||||||
def score_job(job):
|
def score_job(job, title_only=False):
|
||||||
blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
|
# Title-filtered high-volume boards score on title only — the title filter already
|
||||||
|
# gated relevance, and scoring the full JD body over-inflates (every "python"/"data"
|
||||||
|
# mention adds points), flooding the medium bucket.
|
||||||
|
if title_only:
|
||||||
|
blob = (job.get("title") or "").lower()
|
||||||
|
else:
|
||||||
|
blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
|
||||||
score, pos, neg = 0, [], []
|
score, pos, neg = 0, [], []
|
||||||
for kw, w in POSITIVE_KEYWORDS.items():
|
for kw, w in POSITIVE_KEYWORDS.items():
|
||||||
if kw in blob:
|
if kw in blob:
|
||||||
@@ -626,14 +675,27 @@ def main():
|
|||||||
errors.append((display, f"unexpected: {e!r}"))
|
errors.append((display, f"unexpected: {e!r}"))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Optional per-company title prefilter for high-volume boards
|
||||||
|
title_filter = args.get("_title_filter")
|
||||||
|
if title_filter:
|
||||||
|
jobs = [j for j in jobs
|
||||||
|
if any(k in (j.get("title") or "").lower() for k in title_filter)]
|
||||||
|
|
||||||
company_seen = seen.setdefault(cid, {})
|
company_seen = seen.setdefault(cid, {})
|
||||||
|
title_seen = set()
|
||||||
for j in jobs:
|
for j in jobs:
|
||||||
jid = str(j.get("id") or j.get("url"))
|
jid = str(j.get("id") or j.get("url"))
|
||||||
in_ch, is_remote = location_matches(j.get("location", ""))
|
in_ch, is_remote = location_matches(j.get("location", ""))
|
||||||
if not (in_ch or is_remote):
|
if not (in_ch or is_remote):
|
||||||
continue
|
continue
|
||||||
|
# Collapse the same role posted once per remote country (title differs only
|
||||||
|
# by a "| Country | Remote" suffix) — dedupe on the title before the first "|".
|
||||||
|
norm_title = re.sub(r"\s+", " ", (j.get("title") or "").split("|")[0]).strip().lower()
|
||||||
|
if norm_title in title_seen:
|
||||||
|
continue
|
||||||
|
title_seen.add(norm_title)
|
||||||
is_new = jid not in company_seen
|
is_new = jid not in company_seen
|
||||||
score, pos, neg = score_job(j)
|
score, pos, neg = score_job(j, title_only=bool(title_filter))
|
||||||
all_results.append({
|
all_results.append({
|
||||||
"company": display, "company_id": cid,
|
"company": display, "company_id": cid,
|
||||||
"title": j["title"], "location": j["location"],
|
"title": j["title"], "location": j["location"],
|
||||||
|
|||||||
Reference in New Issue
Block a user