From 1e83966049ab384dee97b09f2e032ff862d94e81 Mon Sep 17 00:00:00 2001 From: Dennis Thiessen Date: Sun, 24 May 2026 21:37:00 +0200 Subject: [PATCH] fix(job_scout): word-boundary keyword matching + title-weighted scoring Two scoring bugs inflated the rankings: - Substring matching: 'rag' matched sto[rag]e / tet[rag]on, 'intern' matched [intern]al, 'lead' matched [lead]ership. Roche's staff restaurant and Cisco Tetragon roles scored as fits. Now keywords must not be flanked by alphanumerics (c#/.net/c++ keep matching). - Body boilerplate: every Kraken post mentions crypto/blockchain/trading, so sales/PM/design roles scored as high as engineering ones. Title matches now score full weight, body-only matches half (min 1); negatives still count fully anywhere. Strong-fit count drops 13 -> 5; the bogus 'intern' negatives and rag false positives are gone. Shared _kw_in() also backs the title filter. Co-Authored-By: Claude Opus 4.7 --- job_scout/scout.py | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/job_scout/scout.py b/job_scout/scout.py index 993fa93..458e716 100644 --- a/job_scout/scout.py +++ b/job_scout/scout.py @@ -22,6 +22,7 @@ See the adapter-coverage notes at the bottom for the current automated/manual sp import json import re import sys +from functools import lru_cache import urllib.error import urllib.parse import urllib.request @@ -81,7 +82,7 @@ NEGATIVE_KEYWORDS = { "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3, "physical design": -3, "silicon": -2, "expert c++": -2, "5+ years c++": -2, "deep c++": -2, - "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3, + "intern": -5, "internship": -5, "graduate program": -3, "junior": -3, } # Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that @@ -690,21 +691,41 @@ def location_matches(loc_text): return in_ch, is_remote +@lru_cache(maxsize=512) +def _kw_pattern(kw): + """Word-boundary regex for a 
keyword. Plain substring matching produced false hits + ('rag' inside 'sto[rag]e'/'tet[rag]on', 'intern' inside 'inte[rnal]'); we instead + require the keyword not be flanked by alphanumerics. Keywords that begin/end on a + non-word char (c#, .net, c++) skip that side's guard so they still match.""" + esc = re.escape(kw.strip()) + left = r"(?