peregrine/scripts/match.py
pyr0ball 13d3234e69 feat: extract hard-coded personal references from all scripts via UserProfile
Replace hard-coded paths (/Library/Documents/JobSearch), names (Meghan McCann),
NDA sets (_NDA_COMPANIES), and the scraper path with UserProfile-driven lookups.
Update tests to be profile-agnostic (no user.yaml in peregrine config dir).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-24 18:45:39 -08:00

170 lines
5.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Resume match scoring.
Two modes:
1. SQLite batch — score every unscored job (any status) that has a description in staging.db
Usage: python scripts/match.py
2. Notion single — score one Notion page by URL/ID and write results back
Usage: python scripts/match.py <notion-page-url-or-id>
"""
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
import requests
import yaml
from bs4 import BeautifulSoup
from notion_client import Client
from scripts.user_profile import UserProfile
# Directory holding the project's YAML configuration (user.yaml, notion.yaml, ...).
CONFIG_DIR = Path(__file__).parent.parent / "config"

# The user profile is optional: _profile stays None when user.yaml is absent.
_USER_YAML = CONFIG_DIR / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
def _find_resume(docs_dir: Path) -> Path | None:
"""Find the most recently modified PDF in docs_dir matching *resume* or *cv*."""
candidates = list(docs_dir.glob("*[Rr]esume*.pdf")) + list(docs_dir.glob("*[Cc][Vv]*.pdf"))
return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None
# Prefer the newest resume/CV PDF from the profile's docs directory; when there
# is no profile or no matching PDF, fall back to config/resume.pdf.
RESUME_PATH = (_find_resume(_profile.docs_dir) if _profile else None) or (
    CONFIG_DIR / "resume.pdf"
)
def load_notion() -> tuple[Client, dict]:
    """Build a Notion client from config/notion.yaml.

    Returns a ``(client, field_map)`` pair: the client is authenticated with
    the file's ``token`` entry and ``field_map`` is the file's ``field_map``
    entry, returned as loaded.
    """
    notion_yaml = CONFIG_DIR / "notion.yaml"
    settings = yaml.safe_load(notion_yaml.read_text())
    client = Client(auth=settings["token"])
    return client, settings["field_map"]
def extract_page_id(url_or_id: str) -> str:
    """Extract the 32-char Notion page ID from a URL or bare ID.

    Accepts a bare 32-hex ID, a dashed UUID, or a URL containing either.
    Matching is case-insensitive; a recognised ID is returned as 32 lowercase
    hex characters without dashes. If nothing resembling an ID is found, the
    stripped input is returned unchanged.
    """
    # Match on the raw input and require that the ID is not glued to other hex
    # characters. Stripping hyphens first (the old approach) let hex-like
    # letters at the end of a title slug, e.g. "Facade-<id>", shift the match
    # window and produce a wrong ID.
    strict = re.search(
        r"(?<![0-9a-f])"
        r"(?:[0-9a-f]{32}|[0-9a-f]{8}(?:-[0-9a-f]{4}){3}-[0-9a-f]{12})"
        r"(?![0-9a-f])",
        url_or_id,
        re.IGNORECASE,
    )
    if strict:
        return strict.group(0).replace("-", "").lower()

    # Backward-compatible fallback: the original lenient search on the
    # de-hyphenated input, for unusually hyphenated IDs.
    legacy = re.search(r"[0-9a-f]{32}", url_or_id.replace("-", ""))
    return legacy.group(0) if legacy else url_or_id.strip()
def get_job_url_from_notion(notion: Client, page_id: str, url_field: str) -> str:
    """Return the URL stored in property *url_field* of a Notion page.

    Retrieves the page through ``notion.pages.retrieve`` and returns an empty
    string when the property's ``url`` value is empty or ``None``.
    """
    properties = notion.pages.retrieve(page_id)["properties"]
    stored_url = properties[url_field]["url"]
    return stored_url if stored_url else ""
def extract_job_description(url: str) -> str:
    """Download a job listing page and return its text with whitespace collapsed.

    The request uses a browser-like User-Agent and a 10-second timeout, and
    HTTP error statuses raise via ``raise_for_status``. ``script``, ``style``,
    ``nav``, ``header`` and ``footer`` elements are removed before the text is
    extracted.
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    response.raise_for_status()

    document = BeautifulSoup(response.text, "html.parser")
    # Remove elements that are not part of the listing text itself.
    for element in document.find_all(["script", "style", "nav", "header", "footer"]):
        element.decompose()

    raw_text = document.get_text(separator=" ")
    return " ".join(raw_text.split())
def read_resume_text() -> str:
    """Return the text of every page of the PDF at RESUME_PATH, joined by spaces.

    Pages for which no text can be extracted contribute an empty string.
    """
    import pypdf

    pdf = pypdf.PdfReader(str(RESUME_PATH))
    page_texts = [page.extract_text() or "" for page in pdf.pages]
    return " ".join(page_texts)
def match_score(resume_text: str, job_text: str) -> tuple[float, list[str]]:
    """
    Score resume against job description using TF-IDF cosine similarity.

    Both texts are vectorized together (English stop words removed, vocabulary
    capped at 200 features).

    Returns:
        ``(score, gaps)``. ``score`` is the cosine similarity scaled to 0-100
        and rounded to one decimal. ``gaps`` holds up to 10 terms, taken from
        the 30 highest-weighted job terms, that do not occur in the resume.

    Raises:
        ValueError: propagated from the vectorizer when no usable terms remain
            (for example, both texts are empty or contain only stop words).
    """
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    vectorizer = TfidfVectorizer(stop_words="english", max_features=200)
    tfidf = vectorizer.fit_transform([resume_text, job_text])
    score = float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]) * 100

    feature_names = vectorizer.get_feature_names_out()
    resume_weights = tfidf[0].toarray()[0]
    job_weights = tfidf[1].toarray()[0]

    # Highest-weighted job terms first.
    top_indices = np.argsort(job_weights)[::-1][:30]

    # A term is missing from the resume exactly when its resume weight is zero.
    # This uses the vectorizer's own tokenization on both sides; comparing
    # against resume_text.split() misreported words with attached punctuation
    # ("python,") as gaps.
    gaps = [
        feature_names[i]
        for i in top_indices
        if job_weights[i] > 0 and resume_weights[i] == 0
    ][:10]
    return round(score, 1), gaps
def write_match_to_notion(notion: Client, page_id: str, score: float, gaps: list[str], fm: dict) -> None:
    """Write a match score and its keyword gaps to a Notion page.

    *fm* supplies the Notion property names under the keys ``"match_score"``
    and ``"keyword_gaps"``. The score is written as a number and the gaps as a
    single comma-separated rich-text value.
    """
    gap_summary = ", ".join(gaps)
    score_property = {"number": score}
    gaps_property = {"rich_text": [{"text": {"content": gap_summary}}]}
    notion.pages.update(
        page_id=page_id,
        properties={
            fm["match_score"]: score_property,
            fm["keyword_gaps"]: gaps_property,
        },
    )
def run_match(page_url_or_id: str) -> None:
    """Score a single Notion page against the resume and write the result back.

    Steps: resolve the page ID, read the job URL from the page, fetch the
    listing text, score it against the resume text, print progress, and update
    the page with the score and keyword gaps.
    """
    client, field_map = load_notion()

    notion_page_id = extract_page_id(page_url_or_id)
    print(f"[match] Page ID: {notion_page_id}")

    listing_url = get_job_url_from_notion(client, notion_page_id, field_map["url"])
    print(f"[match] Fetching job description from: {listing_url}")

    # Fetch the listing before reading the resume, as the original did.
    listing_text = extract_job_description(listing_url)
    resume = read_resume_text()

    score, gaps = match_score(resume, listing_text)
    print(f"[match] Score: {score}/100")
    print(f"[match] Keyword gaps: {', '.join(gaps) or 'none'}")

    write_match_to_notion(client, notion_page_id, score, gaps, field_map)
    print("[match] Written to Notion.")
def score_pending_jobs(db_path: Path | None = None) -> int:
    """
    Score all unscored jobs (any status) in SQLite using the description
    stored in the ``jobs`` table.

    A job is selected when its ``match_score`` is NULL and its description is
    neither NULL, empty, nor the literal string ``'nan'``. Each result is
    passed to ``write_match_scores`` as a score plus a comma-separated gap
    string.

    Args:
        db_path: SQLite database to read and update; defaults to
            ``scripts.db.DEFAULT_DB`` when ``None``.

    Returns:
        The number of jobs scored.
    """
    import sqlite3
    from contextlib import closing

    from scripts.db import DEFAULT_DB, write_match_scores

    if db_path is None:
        db_path = DEFAULT_DB

    # closing() guarantees the connection is released even if the query raises.
    # sqlite3's own context manager only manages transactions; it does not
    # close the connection.
    with closing(sqlite3.connect(db_path)) as conn:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, title, company, description FROM jobs "
            "WHERE match_score IS NULL "
            "AND description IS NOT NULL AND description != '' AND description != 'nan'"
        ).fetchall()

    if not rows:
        print("[match] No unscored jobs with descriptions found.")
        return 0

    resume_text = read_resume_text()
    scored = 0
    for row in rows:
        job_id, title, company, description = row["id"], row["title"], row["company"], row["description"]
        try:
            score, gaps = match_score(resume_text, description)
            write_match_scores(db_path, job_id, score, ", ".join(gaps))
            print(f"[match] {title} @ {company}: {score}/100 gaps: {', '.join(gaps) or 'none'}")
            scored += 1
        except Exception as e:
            # Deliberate best-effort: one bad job must not abort the batch.
            print(f"[match] Error scoring job {job_id}: {e}")

    print(f"[match] Done — {scored} jobs scored.")
    return scored
if __name__ == "__main__":
    # No argument: batch-score the SQLite jobs. Otherwise the first argument is
    # a Notion page URL or ID to score.
    cli_args = sys.argv[1:]
    if cli_args:
        run_match(cli_args[0])
    else:
        score_pending_jobs()