fix(job_scout): word-boundary keyword matching + title-weighted scoring

Two scoring bugs inflated the rankings:
- Substring matching: 'rag' matched sto[rag]e / tet[rag]on, 'intern'
  matched [intern]al, 'lead' matched [lead]ership. Roche's staff
  restaurant and Cisco Tetragon roles scored as fits. Now keywords must
  not be flanked by alphanumerics (c#/.net/c++ keep matching).
- Body boilerplate: every Kraken post mentions crypto/blockchain/trading,
  so sales/PM/design roles scored as high as engineering ones. Title
  matches now score full weight, body-only matches half (min 1);
  negatives still count fully anywhere.

Strong-fit count drops 13 -> 5; the bogus 'intern' negatives and rag
false positives are gone. Shared _kw_in() also backs the title filter.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-24 21:37:00 +02:00
parent 74dcc244b8
commit 1e83966049
+32 -11
View File
@@ -22,6 +22,7 @@ See the adapter-coverage notes at the bottom for the current automated/manual sp
import json
import re
import sys
from functools import lru_cache
import urllib.error
import urllib.parse
import urllib.request
@@ -81,7 +82,7 @@ NEGATIVE_KEYWORDS = {
"verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
"physical design": -3, "silicon": -2,
"expert c++": -2, "5+ years c++": -2, "deep c++": -2,
"intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
"intern": -5, "internship": -5, "graduate program": -3, "junior": -3,
}
# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
@@ -690,21 +691,41 @@ def location_matches(loc_text):
return in_ch, is_remote
@lru_cache(maxsize=512)
def _kw_pattern(kw):
"""Word-boundary regex for a keyword. Plain substring matching produced false hits
('rag' inside 'sto[rag]e'/'tet[rag]on', 'intern' inside 'inte[rnal]'); we instead
require the keyword not be flanked by alphanumerics. Keywords that begin/end on a
non-word char (c#, .net, c++) skip that side's guard so they still match."""
esc = re.escape(kw.strip())
left = r"(?<![a-z0-9])" if kw.strip()[:1].isalnum() else ""
right = r"(?![a-z0-9])" if kw.strip()[-1:].isalnum() else ""
return re.compile(left + esc + right)
def _kw_in(kw, text):
    """Return True when ``kw`` is found in ``text`` by its ``_kw_pattern`` regex.

    Args:
        kw: Keyword or phrase to look for.
        text: Text to search.

    Returns:
        True if the pattern matches anywhere in ``text``, otherwise False.
    """
    hit = _kw_pattern(kw).search(text)
    return hit is not None
def score_job(job, title_only=False):
    """Score a job against POSITIVE_KEYWORDS and NEGATIVE_KEYWORDS.

    Keywords are matched with ``_kw_in`` (boundary-aware), not by raw
    substring. A positive keyword found in the title adds its full weight; one
    found only in the description adds ``max(1, weight // 2)``. A negative
    keyword adds its (negative) weight once if it appears in either field.

    Args:
        job: Mapping read via ``.get("title")`` and ``.get("description")``;
            a missing or falsy value is treated as empty text.
        title_only: When True the description is ignored and only the title
            is scored.

    Returns:
        ``(score, pos, neg)``: the total score, the positive keywords that
        matched, and the negative keywords that matched.
    """
    # Title carries the real signal; the JD body is full of company boilerplate (every
    # Kraken post mentions crypto/blockchain/trading, every cloud post mentions python).
    # So title matches score at full weight and body-only matches at half (min 1) — enough
    # to surface a role without letting boilerplate inflate it. Negatives count fully
    # wherever they appear (a disqualifier in the body still disqualifies). Title-filtered
    # boards pass title_only=True and skip body scoring entirely.
    title = (job.get("title") or "").lower()
    desc = "" if title_only else (job.get("description") or "").lower()
    score, pos, neg = 0, [], []
    for kw, w in POSITIVE_KEYWORDS.items():
        if _kw_in(kw, title):
            score += w
            pos.append(kw)
        elif desc and _kw_in(kw, desc):
            # Body-only hit: half weight, but never less than 1 point.
            score += max(1, w // 2)
            pos.append(kw)
    for kw, w in NEGATIVE_KEYWORDS.items():
        # Counted once per keyword even if it shows up in both fields.
        if _kw_in(kw, title) or (desc and _kw_in(kw, desc)):
            score += w
            neg.append(kw)
    return score, pos, neg
@@ -807,7 +828,7 @@ def main():
title_filter = args.get("_title_filter")
if title_filter:
jobs = [j for j in jobs
if any(k in (j.get("title") or "").lower() for k in title_filter)]
if any(_kw_in(k, (j.get("title") or "").lower()) for k in title_filter)]
company_seen = seen.setdefault(cid, {})
title_seen = set()