From 1e83966049ab384dee97b09f2e032ff862d94e81 Mon Sep 17 00:00:00 2001
From: Dennis Thiessen <dennis@thiessen.io>
Date: Sun, 24 May 2026 21:37:00 +0200
Subject: [PATCH] fix(job_scout): word-boundary keyword matching +
 title-weighted scoring

Two scoring bugs inflated the rankings:
- Substring matching: 'rag' matched sto[rag]e / tet[rag]on, 'intern'
  matched [intern]al, 'lead' matched [lead]ership. Roche's staff
  restaurant and Cisco Tetragon roles scored as fits. Now keywords must
  not be flanked by alphanumerics (c#/.net/c++ keep matching).
- Body boilerplate: every Kraken post mentions crypto/blockchain/trading,
  so sales/PM/design roles scored as high as engineering ones. Title
  matches now score full weight, body-only matches half (min 1);
  negatives still count fully anywhere.

Strong-fit count drops 13 -> 5; the bogus 'intern' negatives and rag
false positives are gone. Shared _kw_in() also backs the title filter.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 job_scout/scout.py | 43 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/job_scout/scout.py b/job_scout/scout.py
index 993fa93..458e716 100644
--- a/job_scout/scout.py
+++ b/job_scout/scout.py
@@ -22,6 +22,7 @@ See the adapter-coverage notes at the bottom for the current automated/manual sp
 import json
 import re
 import sys
+from functools import lru_cache
 import urllib.error
 import urllib.parse
 import urllib.request
@@ -81,7 +82,7 @@ NEGATIVE_KEYWORDS = {
     "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
     "physical design": -3, "silicon": -2,
     "expert c++": -2, "5+ years c++": -2, "deep c++": -2,
-    "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
+    "intern": -5, "internship": -5, "graduate program": -3, "junior": -3,
 }
 
 # Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
@@ -690,21 +691,41 @@ def location_matches(loc_text):
     return in_ch, is_remote
 
 
+@lru_cache(maxsize=512)
+def _kw_pattern(kw):
+    """Word-boundary regex for a keyword. Plain substring matching produced false hits
+    ('rag' inside 'sto[rag]e'/'tet[rag]on', 'intern' inside '[intern]al'); we instead
+    require the keyword not be flanked by alphanumerics. Keywords that begin/end on a
+    non-word char (c#, .net, c++) skip that side's guard so they still match."""
+    esc = re.escape(kw.strip())
+    left = r"(?<![a-z0-9])" if kw.strip()[:1].isalnum() else ""
+    right = r"(?![a-z0-9])" if kw.strip()[-1:].isalnum() else ""
+    return re.compile(left + esc + right)
+
+
+def _kw_in(kw, text):
+    return bool(_kw_pattern(kw).search(text))
+
+
 def score_job(job, title_only=False):
-    # Title-filtered high-volume boards score on title only — the title filter already
-    # gated relevance, and scoring the full JD body over-inflates (every "python"/"data"
-    # mention adds points), flooding the medium bucket.
-    if title_only:
-        blob = (job.get("title") or "").lower()
-    else:
-        blob = ((job.get("title") or "") + " " + (job.get("description") or "")).lower()
+    # Title carries the real signal; the JD body is full of company boilerplate (every
+    # Kraken post mentions crypto/blockchain/trading, every cloud post mentions python).
+    # So title matches score at full weight and body-only matches at half (min 1) — enough
+    # to surface a role without letting boilerplate inflate it. Negatives count fully
+    # wherever they appear (a disqualifier in the body still disqualifies). Title-filtered
+    # boards pass title_only=True and skip body scoring entirely.
+    title = (job.get("title") or "").lower()
+    desc = "" if title_only else (job.get("description") or "").lower()
     score, pos, neg = 0, [], []
     for kw, w in POSITIVE_KEYWORDS.items():
-        if kw in blob:
+        if _kw_in(kw, title):
             score += w
             pos.append(kw)
+        elif desc and _kw_in(kw, desc):
+            score += max(1, w // 2)
+            pos.append(kw)
     for kw, w in NEGATIVE_KEYWORDS.items():
-        if kw in blob:
+        if _kw_in(kw, title) or (desc and _kw_in(kw, desc)):
             score += w
             neg.append(kw)
     return score, pos, neg
@@ -807,7 +828,7 @@ def main():
         title_filter = args.get("_title_filter")
         if title_filter:
             jobs = [j for j in jobs
-                    if any(k in (j.get("title") or "").lower() for k in title_filter)]
+                    if any(_kw_in(k, (j.get("title") or "").lower()) for k in title_filter)]
 
         company_seen = seen.setdefault(cid, {})
         title_seen = set()