fix(job_scout): word-boundary keyword matching + title-weighted scoring
Two scoring bugs inflated the rankings:

- Substring matching: 'rag' matched sto[rag]e / tet[rag]on, 'intern' matched [intern]al, and 'lead' matched [lead]ership, so Roche's staff restaurant and Cisco Tetragon roles scored as fits. Keywords must now not be flanked by alphanumerics (c#/.net/c++ keep matching).
- Body boilerplate: every Kraken post mentions crypto/blockchain/trading, so sales/PM/design roles scored as high as engineering ones. Title matches now score at full weight and body-only matches at half (min 1); negatives still count fully anywhere.

Strong-fit count drops 13 -> 5; the bogus 'intern' negatives and the 'rag' false positives are gone. The shared _kw_in() helper also backs the title filter.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
+32
-11
@@ -22,6 +22,7 @@ See the adapter-coverage notes at the bottom for the current automated/manual sp
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from functools import lru_cache
|
||||||
import urllib.error
|
import urllib.error
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
import urllib.request
|
import urllib.request
|
||||||
@@ -81,7 +82,7 @@ NEGATIVE_KEYWORDS = {
|
|||||||
"verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
|
"verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3,
|
||||||
"physical design": -3, "silicon": -2,
|
"physical design": -3, "silicon": -2,
|
||||||
"expert c++": -2, "5+ years c++": -2, "deep c++": -2,
|
"expert c++": -2, "5+ years c++": -2, "deep c++": -2,
|
||||||
"intern": -5, "internship": -5, "graduate program": -3, " junior ": -3,
|
"intern": -5, "internship": -5, "graduate program": -3, "junior": -3,
|
||||||
}
|
}
|
||||||
|
|
||||||
# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
|
# Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that
|
||||||
@@ -690,21 +691,41 @@ def location_matches(loc_text):
|
|||||||
return in_ch, is_remote
|
return in_ch, is_remote
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=512)
|
||||||
|
def _kw_pattern(kw):
|
||||||
|
"""Word-boundary regex for a keyword. Plain substring matching produced false hits
|
||||||
|
('rag' inside 'sto[rag]e'/'tet[rag]on', 'intern' inside 'inte[rnal]'); we instead
|
||||||
|
require the keyword not be flanked by alphanumerics. Keywords that begin/end on a
|
||||||
|
non-word char (c#, .net, c++) skip that side's guard so they still match."""
|
||||||
|
esc = re.escape(kw.strip())
|
||||||
|
left = r"(?<![a-z0-9])" if kw.strip()[:1].isalnum() else ""
|
||||||
|
right = r"(?![a-z0-9])" if kw.strip()[-1:].isalnum() else ""
|
||||||
|
return re.compile(left + esc + right)
|
||||||
|
|
||||||
|
|
||||||
|
def _kw_in(kw, text):
    """Return True when keyword *kw* occurs in *text* per `_kw_pattern(kw)`."""
    found = _kw_pattern(kw).search(text)
    return found is not None
|
||||||
|
|
||||||
|
|
||||||
def score_job(job, title_only=False):
    """Score a job posting against the positive and negative keyword tables.

    The title carries the real signal; the description is full of company
    boilerplate (every Kraken post mentions crypto/blockchain/trading, every
    cloud post mentions python). A positive keyword found in the title
    therefore scores at full weight, while one found only in the description
    scores at half weight (minimum 1) -- enough to surface a role without
    letting boilerplate inflate it. Negative keywords count fully wherever
    they appear, since a disqualifier in the body still disqualifies.

    Args:
        job: Mapping read via ``job.get("title")`` and
            ``job.get("description")``; missing or falsy values are treated
            as empty strings.
        title_only: When True (title-filtered boards), the description is
            skipped entirely and only the title is scored.

    Returns:
        A ``(score, pos, neg)`` tuple: the total score, the positive keywords
        that matched, and the negative keywords that matched.
    """
    title = (job.get("title") or "").lower()
    if title_only:
        desc = ""
    else:
        desc = (job.get("description") or "").lower()

    score = 0
    pos = []
    neg = []

    for kw, weight in POSITIVE_KEYWORDS.items():
        if _kw_in(kw, title):
            gained = weight
        elif desc and _kw_in(kw, desc):
            # Body-only hit: half weight, but never less than one point.
            gained = max(1, weight // 2)
        else:
            continue
        score += gained
        pos.append(kw)

    for kw, weight in NEGATIVE_KEYWORDS.items():
        if not (_kw_in(kw, title) or (desc and _kw_in(kw, desc))):
            continue
        # Weights in this table are added as-is, regardless of where the
        # keyword was found.
        score += weight
        neg.append(kw)

    return score, pos, neg
|
||||||
@@ -807,7 +828,7 @@ def main():
|
|||||||
title_filter = args.get("_title_filter")
|
title_filter = args.get("_title_filter")
|
||||||
if title_filter:
|
if title_filter:
|
||||||
jobs = [j for j in jobs
|
jobs = [j for j in jobs
|
||||||
if any(k in (j.get("title") or "").lower() for k in title_filter)]
|
if any(_kw_in(k, (j.get("title") or "").lower()) for k in title_filter)]
|
||||||
|
|
||||||
company_seen = seen.setdefault(cid, {})
|
company_seen = seen.setdefault(cid, {})
|
||||||
title_seen = set()
|
title_seen = set()
|
||||||
|
|||||||
Reference in New Issue
Block a user