diff --git a/job_scout/scout.py b/job_scout/scout.py index 993fa93..458e716 100644 --- a/job_scout/scout.py +++ b/job_scout/scout.py @@ -22,6 +22,7 @@ See the adapter-coverage notes at the bottom for the current automated/manual sp import json import re import sys +from functools import lru_cache import urllib.error import urllib.parse import urllib.request @@ -81,7 +82,7 @@ NEGATIVE_KEYWORDS = { "verilog": -3, "vhdl": -3, "asic": -3, "rtl design": -3, "physical design": -3, "silicon": -2, "expert c++": -2, "5+ years c++": -2, "deep c++": -2, - "intern": -5, "internship": -5, "graduate program": -3, " junior ": -3, + "intern": -5, "internship": -5, "graduate program": -3, "junior": -3, } # Title prefilter for high-volume boards (all-remote tech orgs + commodity traders that @@ -690,21 +691,41 @@ def location_matches(loc_text): return in_ch, is_remote +@lru_cache(maxsize=512) +def _kw_pattern(kw): + """Word-boundary regex for a keyword. Plain substring matching produced false hits + ('rag' inside 'sto[rag]e'/'tet[rag]on', 'intern' inside 'inte[rnal]'); we instead + require the keyword not be flanked by alphanumerics. Keywords that begin/end on a + non-word char (c#, .net, c++) skip that side's guard so they still match.""" + esc = re.escape(kw.strip()) + left = r"(?