fix(job_scout): word-boundary keyword matching + title-weighted scoring

Two scoring bugs inflated the rankings:

- Substring matching: 'rag' matched sto[rag]e / tet[rag]on, 'intern' matched [intern]al, 'lead' matched [lead]ership. Roche's staff restaurant and Cisco Tetragon roles scored as fits. Now keywords must not be flanked by alphanumerics (c#/.net/c++ keep matching).
- Body boilerplate: every Kraken post mentions crypto/blockchain/trading, so sales/PM/design roles scored as high as engineering ones. Title matches now score full weight, body-only matches half (min 1); negatives still count fully anywhere.

Strong-fit count drops 13 -> 5; the bogus 'intern' negatives and rag false positives are gone. Shared _kw_in() also backs the title filter.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 21:37:00 +02:00
parent 74dcc244b8
commit 1e83966049
1 changed files with 32 additions and 11 deletions
@@ -22,6 +22,7 @@ See the adapter-coverage notes at the bottom for the current automated/manual sp
 import json
 import re
 import sys
+from functools import lru_cache
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -81,7 +82,7 @@ NEGATIVE_KEYWORDS = {
    "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
    "physical design": -3, "silicon": -2,
    "expert c++": -2, "5+ years c++": -2, "deep c++": -2,
-    "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
+    "intern": -5, "internship": -5, "graduate program": -3, "junior": -3,
 }

 # Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
@@ -690,21 +691,41 @@ def location_matches(loc_text):
    return in_ch, is_remote


+@lru_cache(maxsize=512)
+def _kw_pattern(kw):
+    """Word-boundary regex for a keyword. Plain substring matching produced false hits
+    ('rag' inside 'sto[rag]e'/'tet[rag]on', 'intern' inside '[intern]al'); we instead
+    require the keyword not be flanked by alphanumerics. Keywords that begin/end on a
+    non-word char (c#, .net, c++) skip that side's guard so they still match."""
+    esc = re.escape(kw.strip())  # strip first so space-padded keywords (e.g. " junior ") still work
+    left = r"(?<![a-z0-9])" if kw.strip()[:1].isalnum() else ""  # guard is ASCII-lowercase only
+    right = r"(?![a-z0-9])" if kw.strip()[-1:].isalnum() else ""  # [-1:] is "" (no guard) for an empty kw
+    return re.compile(left + esc + right)  # no flags: matching is case-sensitive
+
+
+def _kw_in(kw, text):  # True when kw occurs in text without alphanumeric neighbours (see _kw_pattern)
+    return bool(_kw_pattern(kw).search(text))  # case-sensitive; the visible callers pass lowercased text
+
+
 def score_job(job, title_only=False):
-    # Title-filtered high-volume boards score on title only — the title filter already
-    # gated relevance, and scoring the full JD body over-inflates (every "python"/"data"
-    # mention adds points), flooding the medium bucket.
-    if title_only:
-        blob = (job.get("title") or "").lower()
-    else:
-        blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
+    # Title carries the real signal; the JD body is full of company boilerplate (every
+    # Kraken post mentions crypto/blockchain/trading, every cloud post mentions python).
+    # So title matches score at full weight and body-only matches at half (min 1) — enough
+    # to surface a role without letting boilerplate inflate it. Negatives count fully
+    # wherever they appear (a disqualifier in the body still disqualifies). Title-filtered
+    # boards pass title_only=True and skip body scoring entirely.
+    title = (job.get("title") or "").lower()
+    desc = "" if title_only else (job.get("description") or "").lower()  # "" disables every body check below
    score, pos, neg = 0, [], []
    for kw, w in POSITIVE_KEYWORDS.items():
-        if kw in blob:
+        if _kw_in(kw, title):
            score += w
            pos.append(kw)
+        elif desc and _kw_in(kw, desc):  # body-only hit: the keyword is not in the title
+            score += max(1, w // 2)  # floor-halved weight; max() stops weight-1 keywords rounding to 0
+            pos.append(kw)
    for kw, w in NEGATIVE_KEYWORDS.items():
-        if kw in blob:
+        if _kw_in(kw, title) or (desc and _kw_in(kw, desc)):  # applied once per keyword, even if in both
            score += w
            neg.append(kw)
    return score, pos, neg
@@ -807,7 +828,7 @@ def main():
        title_filter = args.get("_title_filter")
        if title_filter:
            jobs = [j for j in jobs
-                    if any(k in (j.get("title") or "").lower() for k in title_filter)]
+                    if any(_kw_in(k, (j.get("title") or "").lower()) for k in title_filter)]

        company_seen = seen.setdefault(cid, {})
        title_seen = set()