peregrine/scripts/match.py

"""
Resume match scoring.

Two modes:
  1. SQLite batch — score every unscored job (any status) that has a description in staging.db
     Usage: python scripts/match.py

  2. Notion single — score one Notion page by URL/ID and write results back
     Usage: python scripts/match.py <notion-page-url-or-id>
"""
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))

import requests
import yaml
from bs4 import BeautifulSoup
from notion_client import Client

from scripts.user_profile import UserProfile
# Optional user profile, built at import time only when UserProfile.exists()
# accepts config/user.yaml; otherwise _profile stays None.
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None

# config/ directory one level above this file's directory; load_notion() reads
# notion.yaml from here.
CONFIG_DIR = Path(__file__).parent.parent / "config"


def _find_resume(docs_dir: Path) -> Path | None:
    """Find the most recently modified PDF in docs_dir matching *resume* or *cv*."""
    candidates = list(docs_dir.glob("*[Rr]esume*.pdf")) + list(docs_dir.glob("*[Cc][Vv]*.pdf"))
    return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None


# Resume PDF used for scoring, resolved once at import time: the newest
# matching PDF in the profile's docs_dir when a profile is loaded and such a
# file exists; otherwise config/resume.pdf. ("/" binds tighter than "or", so
# the whole fallback path is the right-hand operand.)
RESUME_PATH = (
    _find_resume(_profile.docs_dir) if _profile else None
) or Path(__file__).parent.parent / "config" / "resume.pdf"


def load_notion() -> tuple[Client, dict]:
    """
    Build a Notion client from CONFIG_DIR / "notion.yaml".

    Returns (client, field_map): a Client authenticated with the file's
    ``token`` value, and the file's ``field_map`` entry as loaded.

    Raises:
        FileNotFoundError: if notion.yaml does not exist.
        KeyError: if the loaded mapping lacks ``token`` or ``field_map``.
    """
    # Decode explicitly as UTF-8 instead of the platform's locale encoding, so
    # non-ASCII property names in field_map load the same on every machine.
    raw_yaml = (CONFIG_DIR / "notion.yaml").read_text(encoding="utf-8")
    cfg = yaml.safe_load(raw_yaml)
    return Client(auth=cfg["token"]), cfg["field_map"]


def extract_page_id(url_or_id: str) -> str:
    """
    Extract the 32-char Notion page ID from a URL or bare ID.

    Accepts a bare 32-hex ID, a dashed UUID, or a Notion URL whose last path
    segment ends in the ID (optionally preceded by a title slug). The ID is
    returned lowercased without dashes. If no 32-hex run is found, the
    stripped input is returned unchanged.
    """
    candidate = url_or_id.strip()

    # Ignore the query string / fragment and look only at the final path
    # segment: the ID sits at its END. Anchoring there keeps hex-looking
    # letters from the title slug (e.g. "Lead" -> "ead") from being glued
    # onto the front of the ID once dashes are removed.
    path = re.split(r"[?#]", candidate, maxsplit=1)[0].rstrip("/")
    last_segment = path.rsplit("/", 1)[-1].replace("-", "")
    tail = re.search(r"[0-9a-fA-F]{32}$", last_segment)
    if tail:
        return tail.group(0).lower()

    # Fallback: first 32-hex run anywhere in the dash-stripped input.
    anywhere = re.search(r"[0-9a-fA-F]{32}", candidate.replace("-", ""))
    return anywhere.group(0).lower() if anywhere else candidate


def get_job_url_from_notion(notion: Client, page_id: str, url_field: str) -> str:
    """
    Read the URL property named url_field from a Notion page.

    Returns the property's ``url`` value, or "" when that value is empty/None.
    """
    properties = notion.pages.retrieve(page_id)["properties"]
    job_url = properties[url_field]["url"]
    if job_url:
        return job_url
    return ""


def extract_job_description(url: str) -> str:
    """
    Download a job posting and return its text with whitespace collapsed.

    The page is fetched with a browser-like User-Agent and a 10 second
    timeout; an HTTP error status raises via raise_for_status(). script,
    style, nav, header and footer elements are removed before text extraction.
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    response.raise_for_status()

    document = BeautifulSoup(response.text, "html.parser")
    # Strip non-content elements so they do not end up in the extracted text.
    for element in document.find_all(["script", "style", "nav", "header", "footer"]):
        element.decompose()

    raw_text = document.get_text(separator=" ")
    # split()/join collapses every run of whitespace into a single space.
    return " ".join(raw_text.split())


def read_resume_text() -> str:
    """Return the text of every page of the PDF at RESUME_PATH, space-joined."""
    import pypdf

    pdf = pypdf.PdfReader(str(RESUME_PATH))
    # A page with no extractable text contributes an empty string.
    page_texts = [page.extract_text() or "" for page in pdf.pages]
    return " ".join(page_texts)


def match_score(resume_text: str, job_text: str) -> tuple[float, list[str]]:
    """
    Score resume against job description using TF-IDF cosine similarity.

    Both texts are vectorized together (English stop words removed, at most
    200 features); the cosine similarity of the two rows is scaled to 0–100
    and rounded to one decimal place.

    Keyword gaps are taken from the job's 30 highest-weighted terms: those
    with a non-zero weight in the job row and a zero weight in the resume
    row, capped at 10 and ordered by descending job weight.

    Returns (score 0–100, list of high-value job keywords missing from resume).
    Raises ValueError (from scikit-learn) when neither text yields any
    vocabulary, e.g. both are empty or contain only stop words.
    """
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    vectorizer = TfidfVectorizer(stop_words="english", max_features=200)
    tfidf = vectorizer.fit_transform([resume_text, job_text])
    score = float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]) * 100

    feature_names = vectorizer.get_feature_names_out()
    resume_weights = tfidf[0].toarray()[0]
    job_weights = tfidf[1].toarray()[0]
    top_indices = np.argsort(job_weights)[::-1][:30]

    # Decide "missing from resume" from the resume's own TF-IDF row, so both
    # texts go through the same tokenizer. Splitting the raw resume on
    # whitespace would leave punctuation attached ("python,") and falsely
    # report terms the resume does contain.
    gaps = [
        str(feature_names[i])
        for i in top_indices
        if job_weights[i] > 0 and resume_weights[i] == 0
    ][:10]

    return round(score, 1), gaps


def write_match_to_notion(notion: Client, page_id: str, score: float, gaps: list[str], fm: dict) -> None:
    """
    Store the score and keyword gaps on a Notion page.

    fm maps the logical names "match_score" and "keyword_gaps" to the page's
    property names. The score is sent as a number property and the gaps as a
    single rich-text run of comma-separated terms.
    """
    gap_summary = ", ".join(gaps)
    updated_properties = {
        fm["match_score"]: {"number": score},
        fm["keyword_gaps"]: {"rich_text": [{"text": {"content": gap_summary}}]},
    }
    notion.pages.update(page_id=page_id, properties=updated_properties)


def run_match(page_url_or_id: str) -> None:
    """
    Score one Notion page against the resume and write the result back.

    Resolves the page ID, reads the job URL from the page, fetches the
    posting text, scores it, then updates the page's match-score and
    keyword-gap properties. Progress is printed at each step.
    """
    client, field_map = load_notion()

    page_id = extract_page_id(page_url_or_id)
    print(f"[match] Page ID: {page_id}")

    job_url = get_job_url_from_notion(client, page_id, field_map["url"])
    print(f"[match] Fetching job description from: {job_url}")

    # The posting is fetched before the resume is read, so a network failure
    # surfaces first.
    posting_text = extract_job_description(job_url)
    score, gaps = match_score(read_resume_text(), posting_text)
    print(f"[match] Score: {score}/100")
    print(f"[match] Keyword gaps: {', '.join(gaps) or 'none'}")

    write_match_to_notion(client, page_id, score, gaps, field_map)
    print("[match] Written to Notion.")


def score_pending_jobs(db_path: Path | None = None) -> int:
    """
    Score all unscored jobs (any status) in SQLite using the description
    already scraped during discovery. Writes match_score + keyword_gaps back.

    Args:
        db_path: SQLite database to read and update; defaults to
            scripts.db.DEFAULT_DB when None.

    Returns the number of jobs scored. A job whose scoring or write-back
    raises is reported on stdout and skipped, so one bad row does not abort
    the batch.
    """
    from scripts.db import DEFAULT_DB, write_match_scores

    if db_path is None:
        db_path = DEFAULT_DB

    import sqlite3
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        # Skip rows whose description is missing, empty, or the literal
        # string 'nan' (presumably a stringified NaN from the scraper; verify).
        rows = conn.execute(
            "SELECT id, title, company, description FROM jobs "
            "WHERE match_score IS NULL "
            "AND description IS NOT NULL AND description != '' AND description != 'nan'"
        ).fetchall()
    finally:
        # Close the connection even if the query raises.
        conn.close()

    if not rows:
        print("[match] No unscored jobs with descriptions found.")
        return 0

    # The resume is read once and reused for every job.
    resume_text = read_resume_text()
    scored = 0
    for row in rows:
        job_id, title, company, description = row["id"], row["title"], row["company"], row["description"]
        try:
            score, gaps = match_score(resume_text, description)
            write_match_scores(db_path, job_id, score, ", ".join(gaps))
            print(f"[match] {title} @ {company}: {score}/100  gaps: {', '.join(gaps) or 'none'}")
            scored += 1
        except Exception as e:
            # Deliberate best-effort boundary: log the failure and move on.
            print(f"[match] Error scoring job {job_id}: {e}")

    print(f"[match] Done — {scored} jobs scored.")
    return scored


if __name__ == "__main__":
    # With a page URL/ID argument, score that single Notion page;
    # with no arguments, run the SQLite batch over all unscored jobs.
    cli_args = sys.argv[1:]
    if cli_args:
        run_match(cli_args[0])
    else:
        score_pending_jobs()