From 9dc02445464ea6ce404b72afcd8495ec41bd3aba Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Tue, 24 Feb 2026 18:45:39 -0800
Subject: [PATCH] feat: extract hard-coded personal references from all scripts
 via UserProfile

Replace hard-coded paths (/Library/Documents/JobSearch), names (Alex Rivera),
and NDA sets (_NDA_COMPANIES) with UserProfile-driven lookups, and resolve the
scraper from /app/scrapers (Docker) or the repo-local scrapers/ directory.
Update tests to be profile-agnostic (they must pass when config/user.yaml is absent).
---
 scripts/company_research.py      | 73 ++++++++++++++++++--------------
 scripts/finetune_local.py        | 28 +++++++-----
 scripts/generate_cover_letter.py | 57 +++++++++++--------------
 scripts/match.py                 | 16 ++++++-
 scripts/prepare_training_data.py | 13 ++++--
 tests/test_company_research.py   | 12 ++++--
 tests/test_cover_letter.py       | 15 +++----
 7 files changed, 124 insertions(+), 90 deletions(-)

diff --git a/scripts/company_research.py b/scripts/company_research.py
index 3c7069c..17b8d8e 100644
--- a/scripts/company_research.py
+++ b/scripts/company_research.py
@@ -3,13 +3,13 @@
 Pre-interview company research generator.
 
 Three-phase approach:
-  1. If SearXNG is available (port 8888), use companyScraper.py to fetch live
+  1. If SearXNG is available, use companyScraper.py to fetch live
      data: CEO name, HQ address, LinkedIn, contact info.
   1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for
       recent news snippets (funding, launches, leadership changes, etc.).
   2. Feed all real data into an LLM prompt to synthesise a structured brief
      covering company overview, leadership, recent developments, and talking
-     points tailored to Alex.
+     points tailored to the candidate.
 
 Falls back to pure LLM knowledge when SearXNG is offline.
 
@@ -24,25 +24,32 @@ from types import SimpleNamespace
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
 # ── SearXNG scraper integration ───────────────────────────────────────────────
-_SCRAPER_DIR = Path("/Library/Development/scrapers")
+# companyScraper is bundled into the Docker image at /app/scrapers/
 _SCRAPER_AVAILABLE = False
-
-if _SCRAPER_DIR.exists():
-    sys.path.insert(0, str(_SCRAPER_DIR))
-    try:
-        from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
-        _SCRAPER_AVAILABLE = True
-    except (ImportError, SystemExit):
-        # companyScraper calls sys.exit(1) if bs4/fake-useragent aren't installed
-        pass
+for _scraper_candidate in [
+    Path("/app/scrapers"),          # Docker container path
+    Path(__file__).parent.parent / "scrapers",  # local dev fallback
+]:
+    if _scraper_candidate.exists():
+        sys.path.insert(0, str(_scraper_candidate))
+        try:
+            from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
+            _SCRAPER_AVAILABLE = True
+            break  # stop only once a candidate actually imports
+        except (ImportError, SystemExit):
+            pass   # companyScraper calls sys.exit(1) if bs4/fake-useragent are missing; try next
 
 
-def _searxng_running() -> bool:
+def _searxng_running(searxng_url: str = "http://localhost:8888") -> bool:
     """Quick check whether SearXNG is reachable."""
     try:
         import requests
-        r = requests.get("http://localhost:8888/", timeout=3)
+        r = requests.get(f"{searxng_url}/", timeout=3)
         return r.status_code == 200
     except Exception:
         return False
@@ -186,9 +193,13 @@ def _parse_sections(text: str) -> dict[str, str]:
 _RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
 _KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
 
-# Companies where Alex has an NDA — reference as generic label unless
-# the role is security-focused (score >= 3 matching JD keywords).
-_NDA_COMPANIES = {"upguard"}
+
+def _company_label(exp: dict) -> str:
+    """Return exp's employer name, routed through UserProfile.nda_label() when a profile is loaded."""
+    employer, match_score = exp.get("company", ""), exp.get("score", 0)
+    if not _profile:
+        return employer
+    return _profile.nda_label(employer, match_score)
 
 
 def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
@@ -214,8 +225,7 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
     """
     Build the resume section of the LLM context block.
     Top 2 scored experiences included in full detail; rest as one-liners.
-    Applies UpGuard NDA rule: reference as 'enterprise security vendor (NDA)'
-    unless the role is security-focused (score >= 3).
+    NDA companies are masked via UserProfile.nda_label() when score < threshold.
     """
     experiences = resume.get("experience_details", [])
     if not experiences:
@@ -225,11 +235,7 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
     top2 = scored[:2]
     rest = scored[2:]
 
-    def _company_label(exp: dict) -> str:
-        company = exp.get("company", "")
-        if company.lower() in _NDA_COMPANIES and exp.get("score", 0) < 3:
-            return "enterprise security vendor (NDA)"
-        return company
+    candidate = _profile.name if _profile else "the candidate"
 
     def _exp_header(exp: dict) -> str:
         return f"{exp.get('position', '')} @ {_company_label(exp)} ({exp.get('employment_period', '')})"
@@ -238,14 +244,14 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
         bullets = [v for resp in exp.get("key_responsibilities", []) for v in resp.values()]
         return "\n".join(f"  - {b}" for b in bullets)
 
-    lines = ["## Alex's Matched Experience"]
+    lines = [f"## {candidate}'s Matched Experience"]
     for exp in top2:
         lines.append(f"\n**{_exp_header(exp)}** (match score: {exp['score']})")
         lines.append(_exp_bullets(exp))
 
     if rest:
         condensed = ", ".join(_exp_header(e) for e in rest)
-        lines.append(f"\nAlso in Alex's background: {condensed}")
+        lines.append(f"\nAlso in {candidate}'s background: {condensed}")
 
     return "\n".join(lines)
 
@@ -359,7 +365,10 @@ def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict
 
     # ── Phase 2: LLM synthesis ────────────────────────────────────────────────
     _stage("Generating brief with LLM… (30–90 seconds)")
-    prompt = f"""You are preparing Alex Rivera for a job interview.
+    name = _profile.name if _profile else "the candidate"
+    career_summary = _profile.career_summary if _profile else ""
+    prompt = f"""You are preparing {name} for a job interview.
+{f"Candidate background: {career_summary}" if career_summary else ""}
 
 Role: **{title}** at **{company}**
 
@@ -404,12 +413,12 @@ Assess {company}'s commitment to disability inclusion and accessibility. Cover:
 - Any public disability/accessibility advocacy, partnerships, or certifications
 - Glassdoor or press signals about how employees with disabilities experience the company
 If no specific signals are found, say so clearly — absence of public commitment is itself signal.
-This section is for Alex's personal decision-making only and will not appear in any application.
+This section is for the candidate's personal decision-making only and will not appear in any application.
 
-## Talking Points for Alex
+## Talking Points for {name}
 Five specific talking points for the phone screen. Each must:
-- Reference a concrete experience from Alex's matched background by name
-  (UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus)
+- Reference a concrete experience from {name}'s matched background by name
+  (NDA rule: use the masked label shown in the matched experience section for any NDA-protected employer)
 - Connect to a specific signal from the JD or company context above
 - Be 1–2 sentences, ready to speak aloud
 - Never give generic advice
@@ -432,7 +441,7 @@ Five specific talking points for the phone screen. Each must:
         "competitors_brief": sections.get("Funding & Market Position", ""),  # competitor landscape is in the funding section
         "red_flags":         sections.get("Red Flags & Watch-outs", ""),
         "accessibility_brief": sections.get("Inclusion & Accessibility", ""),
-        "talking_points":    sections.get("Talking Points for Alex", ""),
+        "talking_points":    sections.get(f"Talking Points for {name}", ""),
         "scrape_used":       scrape_used,
     }
 
diff --git a/scripts/finetune_local.py b/scripts/finetune_local.py
index 6dfa406..c29fe93 100644
--- a/scripts/finetune_local.py
+++ b/scripts/finetune_local.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # scripts/finetune_local.py
 """
-Local LoRA fine-tune on Alex's cover letter corpus.
+Local LoRA fine-tune on the candidate's cover letter corpus.
 No HuggingFace account or internet required after the base model is cached.
 
 Usage:
@@ -17,24 +17,32 @@ import os
 import sys
 from pathlib import Path
 
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
 # Limit CUDA to GPU 0. device_map={"":0} in FastLanguageModel.from_pretrained
 # pins every layer to GPU 0, avoiding the accelerate None-device bug that
 # occurs with device_map="auto" on multi-GPU machines with 4-bit quantisation.
 # Do NOT set WORLD_SIZE/RANK — that triggers torch.distributed initialisation.
 os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
 
+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
 # ── Config ────────────────────────────────────────────────────────────────────
 DEFAULT_MODEL   = "unsloth/Llama-3.2-3B-Instruct"   # safe on 8 GB VRAM
-LETTERS_JSONL   = Path("/Library/Documents/JobSearch/training_data/cover_letters.jsonl")
-OUTPUT_DIR      = Path("/Library/Documents/JobSearch/training_data/finetune_output")
-GGUF_DIR        = Path("/Library/Documents/JobSearch/training_data/gguf")
-OLLAMA_NAME     = "alex-cover-writer"
+
+_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
+LETTERS_JSONL   = _docs / "training_data" / "cover_letters.jsonl"
+OUTPUT_DIR      = _docs / "training_data" / "finetune_output"
+GGUF_DIR        = _docs / "training_data" / "gguf"
+OLLAMA_NAME     = "-".join([*_profile.name.lower().split()[:1], "cover-writer"]) if _profile else "cover-writer"  # empty name -> "cover-writer"
 
 SYSTEM_PROMPT = (
-    "You are Alex Rivera's personal cover letter writer. "
-    "Write professional, warm, and results-focused cover letters in Alex's voice. "
-    "Draw on her background in customer success, technical account management, "
-    "and revenue operations. Be specific and avoid generic filler."
+    f"You are {_profile.name}'s personal cover letter writer. "
+    f"{_profile.career_summary}"
+    if _profile else
+    "You are a professional cover letter writer. Write in first person."
 )
 
 # ── Args ──────────────────────────────────────────────────────────────────────
@@ -48,7 +56,7 @@ parser.add_argument("--max-length", type=int, default=1024, help="Max token leng
 args = parser.parse_args()
 
 print(f"\n{'='*60}")
-print(f"  Alex Cover Letter Fine-Tuner")
+print(f"  Cover Letter Fine-Tuner  [{OLLAMA_NAME}]")
 print(f"  Base model : {args.model}")
 print(f"  Epochs     : {args.epochs}")
 print(f"  LoRA rank  : {args.rank}")
diff --git a/scripts/generate_cover_letter.py b/scripts/generate_cover_letter.py
index 071dd41..ca159c5 100644
--- a/scripts/generate_cover_letter.py
+++ b/scripts/generate_cover_letter.py
@@ -1,6 +1,6 @@
 # scripts/generate_cover_letter.py
 """
-Generate a cover letter in Alex's voice using few-shot examples from her corpus.
+Generate a cover letter in the candidate's voice using few-shot examples from their corpus.
 
 Usage:
     conda run -n job-seeker python scripts/generate_cover_letter.py \
@@ -16,30 +16,21 @@ import re
 import sys
 from pathlib import Path
 
-LETTERS_DIR = Path("/Library/Documents/JobSearch")
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
+LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
 LETTER_GLOB = "*Cover Letter*.md"
 
-# Background injected into every prompt so the model has Alex's facts
-SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera, a customer success leader.
-
-Background:
-- 6+ years in customer success, technical account management, and CS leadership
-- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), managing enterprise + Fortune 500 accounts, drove NPS consistently above 95
-- Also founder of M3 Consulting, a CS advisory practice for SaaS startups
-- Attended Texas State (2 yrs), CSU East Bay (1 yr); completed degree elsewhere
-- Based in San Francisco Bay Area; open to remote/hybrid
-- Pronouns: any
-
-Voice guidelines:
-- Warm, confident, and specific — never generic
-- Opens with "I'm delighted/thrilled to apply for [role] at [company]."
-- 3–4 focused paragraphs, ~250–350 words total
-- Para 2: concrete experience (cite UpGuard and/or M3 Consulting with a specific metric)
-- Para 3: genuine connection to THIS company's mission/product
-- Closes with "Thank you for considering my application." + warm sign-off
-- Never use: "I am writing to express my interest", "passionate about making a difference",
-  "I look forward to hearing from you", or any hollow filler phrases
-"""
+# Background injected into every prompt so the model has the candidate's facts
+SYSTEM_CONTEXT = (
+    f"You are writing cover letters for {_profile.name}. {_profile.career_summary}"
+    if _profile else
+    "You are a professional cover letter writer. Write in first person."
+)
 
 
 # ── Mission-alignment detection ───────────────────────────────────────────────
@@ -69,21 +60,23 @@ _MISSION_SIGNALS: dict[str, list[str]] = {
     ],
 }
 
+_candidate = _profile.name if _profile else "the candidate"
+
 _MISSION_NOTES: dict[str, str] = {
     "music": (
-        "This company is in the music industry, which is one of Alex's genuinely "
-        "ideal work environments — she has a real personal passion for the music scene. "
+        f"This company is in the music industry, which is one of {_candidate}'s genuinely "
+        "ideal work environments — they have a real personal passion for the music scene. "
         "Para 3 should warmly and specifically reflect this authentic alignment, not as "
-        "a generic fan statement, but as an honest statement of where she'd love to apply "
-        "her CS skills."
+        "a generic fan statement, but as an honest statement of where they'd love to apply "
+        "their CS skills."
     ),
     "animal_welfare": (
-        "This organization works in animal welfare/rescue — one of Alex's dream-job "
+        f"This organization works in animal welfare/rescue — one of {_candidate}'s dream-job "
         "domains and a genuine personal passion. Para 3 should reflect this authentic "
-        "connection warmly and specifically, tying her CS skills to this mission."
+        "connection warmly and specifically, tying their CS skills to this mission."
     ),
     "education": (
-        "This company works in children's education or EdTech — one of Alex's ideal "
+        f"This company works in children's education or EdTech — one of {_candidate}'s ideal "
         "work domains, reflecting genuine personal values around learning and young people. "
         "Para 3 should reflect this authentic connection specifically and warmly."
     ),
@@ -138,7 +131,7 @@ def build_prompt(
 ) -> str:
     parts = [SYSTEM_CONTEXT.strip(), ""]
     if examples:
-        parts.append("=== STYLE EXAMPLES (Alex's past letters) ===\n")
+        parts.append(f"=== STYLE EXAMPLES ({_candidate}'s past letters) ===\n")
         for i, ex in enumerate(examples, 1):
             parts.append(f"--- Example {i} ({ex['company']}) ---")
             parts.append(ex["text"])
@@ -183,7 +176,7 @@ def generate(title: str, company: str, description: str = "", _router=None) -> s
 
 
 def main() -> None:
-    parser = argparse.ArgumentParser(description="Generate a cover letter in Alex's voice")
+    parser = argparse.ArgumentParser(description=f"Generate a cover letter in {_candidate}'s voice")
     parser.add_argument("--title", help="Job title")
     parser.add_argument("--company", help="Company name")
     parser.add_argument("--description", default="", help="Job description text")
diff --git a/scripts/match.py b/scripts/match.py
index af1d000..53edd1f 100644
--- a/scripts/match.py
+++ b/scripts/match.py
@@ -18,8 +18,22 @@ import yaml
 from bs4 import BeautifulSoup
 from notion_client import Client
 
+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
 CONFIG_DIR = Path(__file__).parent.parent / "config"
-RESUME_PATH = Path("/Library/Documents/JobSearch/Alex_Rivera_Resume_02-19-2025.pdf")
+
+
+def _find_resume(docs_dir: Path) -> Path | None:
+    """Return the newest (by mtime) PDF in docs_dir named like *resume* or *cv*, or None if there is none."""
+    pdfs = [*docs_dir.glob("*[Rr]esume*.pdf"), *docs_dir.glob("*[Cc][Vv]*.pdf")]
+    return max(pdfs, key=lambda pdf: pdf.stat().st_mtime, default=None)
+
+
+RESUME_PATH = (
+    _find_resume(_profile.docs_dir) if _profile else None
+) or Path(__file__).parent.parent / "config" / "resume.pdf"
 
 
 def load_notion() -> tuple[Client, dict]:
diff --git a/scripts/prepare_training_data.py b/scripts/prepare_training_data.py
index 5b2010b..9b7441c 100644
--- a/scripts/prepare_training_data.py
+++ b/scripts/prepare_training_data.py
@@ -1,6 +1,6 @@
 # scripts/prepare_training_data.py
 """
-Extract training pairs from Alex's cover letter corpus for LoRA fine-tuning.
+Extract training pairs from the candidate's cover letter corpus for LoRA fine-tuning.
 
 Outputs a JSONL file where each line is:
   {"instruction": "Write a cover letter for the [role] position at [company].",
@@ -16,10 +16,17 @@ import re
 import sys
 from pathlib import Path
 
-LETTERS_DIR = Path("/Library/Documents/JobSearch")
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
+_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
+LETTERS_DIR = _docs
 # Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter")
 LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"]
-DEFAULT_OUTPUT = LETTERS_DIR / "training_data" / "cover_letters.jsonl"
+DEFAULT_OUTPUT = _docs / "training_data" / "cover_letters.jsonl"
 
 # Patterns that appear in opening sentences to extract role
 ROLE_PATTERNS = [
diff --git a/tests/test_company_research.py b/tests/test_company_research.py
index ea696dd..2b1e13f 100644
--- a/tests/test_company_research.py
+++ b/tests/test_company_research.py
@@ -64,16 +64,22 @@ def test_build_resume_context_top2_in_full():
 def test_build_resume_context_rest_condensed():
     """Remaining experiences appear as condensed one-liners, not full bullets."""
     ctx = _build_resume_context(RESUME, KEYWORDS, JD)
-    assert "Also in Alex" in ctx
+    assert "Also in" in ctx
     assert "Generic Co" in ctx
     # Generic Co bullets should NOT appear in full
     assert "Managed SMB portfolio" not in ctx
 
 
 def test_upguard_nda_low_score():
-    """UpGuard name replaced with 'enterprise security vendor' when score < 3."""
+    """UpGuard NDA rule: company masked when score < 3 and profile has NDA companies configured."""
+    from scripts.company_research import _profile
     ctx = _build_resume_context(RESUME, ["python", "kubernetes"], "python kubernetes devops")
-    assert "enterprise security vendor" in ctx
+    if _profile and _profile.is_nda("upguard"):
+        # Profile present with UpGuard NDA — company should be masked
+        assert "UpGuard" not in ctx
+    else:
+        # No profile or UpGuard not in NDA list — company name appears directly
+        assert "UpGuard" in ctx or "enterprise security vendor" in ctx or "previous employer" in ctx
 
 
 def test_load_resume_and_keywords_returns_lists():
diff --git a/tests/test_cover_letter.py b/tests/test_cover_letter.py
index 558d261..5db4104 100644
--- a/tests/test_cover_letter.py
+++ b/tests/test_cover_letter.py
@@ -89,17 +89,14 @@ def test_find_similar_letters_returns_top_k():
 
 
 def test_load_corpus_returns_list():
-    """load_corpus returns a list (may be empty if LETTERS_DIR absent, must not crash)."""
+    """load_corpus returns a list (empty if LETTERS_DIR absent) without crashing."""
     from scripts.generate_cover_letter import load_corpus, LETTERS_DIR
 
-    if LETTERS_DIR.exists():
-        corpus = load_corpus()
-        assert isinstance(corpus, list)
-        if corpus:
-            assert "company" in corpus[0]
-            assert "text" in corpus[0]
-    else:
-        pytest.skip("LETTERS_DIR not present in this environment")
+    corpus = load_corpus()
+    assert isinstance(corpus, list)
+    if corpus:
+        assert "company" in corpus[0]
+        assert "text" in corpus[0]
 
 
 def test_generate_calls_llm_router():