fix(job_scout): word-boundary keyword matching + title-weighted scoring
Two scoring bugs inflated the rankings:

- Substring matching: 'rag' matched sto[rag]e / tet[rag]on, 'intern' matched inte[rnal], 'lead' matched [lead]ership. Roche's staff restaurant and Cisco Tetragon roles scored as fits. Now keywords must not be flanked by alphanumerics (c#/.net/c++ keep matching).
- Body boilerplate: every Kraken post mentions crypto/blockchain/trading, so sales/PM/design roles scored as high as engineering ones. Title matches now score full weight, body-only matches half (min 1); negatives still count fully anywhere.

Strong-fit count drops 13 -> 5; the bogus 'intern' negatives and rag false positives are gone. Shared _kw_in() also backs the title filter.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+32
-11
@@ -22,6 +22,7 @@ See the adapter-coverage notes at the bottom for the current automated/manual sp
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from functools import lru_cache
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
@@ -81,7 +82,7 @@ NEGATIVE_KEYWORDS = {
|
||||
"verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
|
||||
"physical design": -3, "silicon": -2,
|
||||
"expert c++": -2, "5+ years c++": -2, "deep c++": -2,
|
||||
"intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
|
||||
"intern": -5, "internship": -5, "graduate program": -3, "junior": -3,
|
||||
}
|
||||
|
||||
# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
|
||||
@@ -690,21 +691,41 @@ def location_matches(loc_text):
|
||||
return in_ch, is_remote
|
||||
|
||||
|
||||
@lru_cache(maxsize=512)
|
||||
def _kw_pattern(kw):
|
||||
"""Word-boundary regex for a keyword. Plain substring matching produced false hits
|
||||
('rag' inside 'sto[rag]e'/'tet[rag]on', 'intern' inside 'inte[rnal]'); we instead
|
||||
require the keyword not be flanked by alphanumerics. Keywords that begin/end on a
|
||||
non-word char (c#, .net, c++) skip that side's guard so they still match."""
|
||||
esc = re.escape(kw.strip())
|
||||
left = r"(?<![a-z0-9])" if kw.strip()[:1].isalnum() else ""
|
||||
right = r"(?![a-z0-9])" if kw.strip()[-1:].isalnum() else ""
|
||||
return re.compile(left + esc + right)
|
||||
|
||||
|
||||
def _kw_in(kw, text):
    """Return True when the pattern built by _kw_pattern(kw) is found in *text*.

    The search result is reduced to a plain boolean so callers can use it
    directly in conditions and any()/all() expressions.
    """
    hit = _kw_pattern(kw).search(text)
    return hit is not None
|
||||
|
||||
|
||||
def score_job(job, title_only=False):
    """Score a job posting against POSITIVE_KEYWORDS and NEGATIVE_KEYWORDS.

    Args:
        job: mapping read via .get("title") and .get("description"); a missing
            or None value is treated as an empty string.
        title_only: when True the description is ignored and only the title is
            scored.

    Returns:
        A (score, pos, neg) tuple: the summed weight, the positive keywords that
        matched, and the negative keywords that matched.
    """
    # Title carries the real signal; the JD body is full of company boilerplate (every
    # Kraken post mentions crypto/blockchain/trading, every cloud post mentions python).
    # So title matches score at full weight and body-only matches at half (min 1) — enough
    # to surface a role without letting boilerplate inflate it. Negatives count fully
    # wherever they appear (a disqualifier in the body still disqualifies). Title-filtered
    # boards pass title_only=True and skip body scoring entirely.
    title = (job.get("title") or "").lower()
    desc = "" if title_only else (job.get("description") or "").lower()
    score, pos, neg = 0, [], []
    # Matching goes through _kw_in only; no separate substring prefilter is kept.
    for kw, w in POSITIVE_KEYWORDS.items():
        if _kw_in(kw, title):
            score += w
            pos.append(kw)
        elif desc and _kw_in(kw, desc):
            # Body-only hit: half weight, floored at 1 so it still registers.
            score += max(1, w // 2)
            pos.append(kw)
    for kw, w in NEGATIVE_KEYWORDS.items():
        if _kw_in(kw, title) or (desc and _kw_in(kw, desc)):
            # Negative weights are stored as negative numbers, so adding subtracts.
            score += w
            neg.append(kw)
    return score, pos, neg
|
||||
@@ -807,7 +828,7 @@ def main():
|
||||
title_filter = args.get("_title_filter")
|
||||
if title_filter:
|
||||
jobs = [j for j in jobs
|
||||
if any(k in (j.get("title") or "").lower() for k in title_filter)]
|
||||
if any(_kw_in(k, (j.get("title") or "").lower()) for k in title_filter)]
|
||||
|
||||
company_seen = seen.setdefault(cid, {})
|
||||
title_seen = set()
|
||||
|
||||
Reference in New Issue
Block a user