feat: extract hard-coded personal references from all scripts via UserProfile

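Replace hard-coded paths (/Library/Documents/JobSearch), names (Alex Rivera), and NDA sets (_NDA_COMPANIES) with UserProfile-driven lookups, and replace the hard-coded scraper path with a search over the Docker (/app/scrapers) and local-dev (scrapers/) directories. Update tests to be profile-agnostic so they pass when there is no user.yaml in the peregrine config dir.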
2026-02-24 18:45:39 -08:00 · 2026-02-24 18:45:39 -08:00 · 9dc0244546
commit 9dc0244546
parent 7380deb021
7 changed files with 124 additions and 90 deletions
--- a/scripts/company_research.py
+++ b/scripts/company_research.py
@ -3,13 +3,13 @@
 Pre-interview company research generator.

 Three-phase approach:
-  1. If SearXNG is available (port 8888), use companyScraper.py to fetch live
+  1. If SearXNG is available, use companyScraper.py to fetch live
     data: CEO name, HQ address, LinkedIn, contact info.
  1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for
      recent news snippets (funding, launches, leadership changes, etc.).
  2. Feed all real data into an LLM prompt to synthesise a structured brief
     covering company overview, leadership, recent developments, and talking
-     points tailored to Alex.
+     points tailored to the candidate.

 Falls back to pure LLM knowledge when SearXNG is offline.

@ -24,25 +24,32 @@ from types import SimpleNamespace

 sys.path.insert(0, str(Path(__file__).parent.parent))

-# ── SearXNG scraper integration ───────────────────────────────────────────────
-_SCRAPER_DIR = Path("/Library/Development/scrapers")
-_SCRAPER_AVAILABLE = False
+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None

-if _SCRAPER_DIR.exists():
-    sys.path.insert(0, str(_SCRAPER_DIR))
+# ── SearXNG scraper integration ───────────────────────────────────────────────
+# companyScraper is bundled into the Docker image at /app/scrapers/
+_SCRAPER_AVAILABLE = False
+for _scraper_candidate in [
+    Path("/app/scrapers"),          # Docker container path
+    Path(__file__).parent.parent / "scrapers",  # local dev fallback
+]:
+    if _scraper_candidate.exists():
+        sys.path.insert(0, str(_scraper_candidate))
        try:
            from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
            _SCRAPER_AVAILABLE = True
        except (ImportError, SystemExit):
-        # companyScraper calls sys.exit(1) if bs4/fake-useragent aren't installed
            pass
+        break


-def _searxng_running() -> bool:
+def _searxng_running(searxng_url: str = "http://localhost:8888") -> bool:
    """Quick check whether SearXNG is reachable."""
    try:
        import requests
-        r = requests.get("http://localhost:8888/", timeout=3)
+        r = requests.get(f"{searxng_url}/", timeout=3)
        return r.status_code == 200
    except Exception:
        return False
@ -186,9 +193,13 @@ def _parse_sections(text: str) -> dict[str, str]:
 _RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
 _KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"

-# Companies where Alex has an NDA — reference as generic label unless
-# the role is security-focused (score >= 3 matching JD keywords).
-_NDA_COMPANIES = {"upguard"}
+
+def _company_label(exp: dict) -> str:
+    company = exp.get("company", "")
+    score = exp.get("score", 0)
+    if _profile:
+        return _profile.nda_label(company, score)
+    return company


 def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
@ -214,8 +225,7 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
    """
    Build the resume section of the LLM context block.
    Top 2 scored experiences included in full detail; rest as one-liners.
-    Applies UpGuard NDA rule: reference as 'enterprise security vendor (NDA)'
-    unless the role is security-focused (score >= 3).
+    NDA companies are masked via UserProfile.nda_label() when score < threshold.
    """
    experiences = resume.get("experience_details", [])
    if not experiences:
@ -225,11 +235,7 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
    top2 = scored[:2]
    rest = scored[2:]

-    def _company_label(exp: dict) -> str:
-        company = exp.get("company", "")
-        if company.lower() in _NDA_COMPANIES and exp.get("score", 0) < 3:
-            return "enterprise security vendor (NDA)"
-        return company
+    candidate = _profile.name if _profile else "the candidate"

    def _exp_header(exp: dict) -> str:
        return f"{exp.get('position', '')} @ {_company_label(exp)} ({exp.get('employment_period', '')})"
@ -238,14 +244,14 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
        bullets = [v for resp in exp.get("key_responsibilities", []) for v in resp.values()]
        return "\n".join(f"  - {b}" for b in bullets)

-    lines = ["## Alex's Matched Experience"]
+    lines = [f"## {candidate}'s Matched Experience"]
    for exp in top2:
        lines.append(f"\n**{_exp_header(exp)}** (match score: {exp['score']})")
        lines.append(_exp_bullets(exp))

    if rest:
        condensed = ", ".join(_exp_header(e) for e in rest)
-        lines.append(f"\nAlso in Alex's background: {condensed}")
+        lines.append(f"\nAlso in {candidate}'s background: {condensed}")

    return "\n".join(lines)

@ -359,7 +365,10 @@ def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict

    # ── Phase 2: LLM synthesis ────────────────────────────────────────────────
    _stage("Generating brief with LLM… (30–90 seconds)")
-    prompt = f"""You are preparing Alex Rivera for a job interview.
+    name = _profile.name if _profile else "the candidate"
+    career_summary = _profile.career_summary if _profile else ""
+    prompt = f"""You are preparing {name} for a job interview.
+{f"Candidate background: {career_summary}" if career_summary else ""}

 Role: **{title}** at **{company}**

@ -404,12 +413,12 @@ Assess {company}'s commitment to disability inclusion and accessibility. Cover:
 - Any public disability/accessibility advocacy, partnerships, or certifications
 - Glassdoor or press signals about how employees with disabilities experience the company
 If no specific signals are found, say so clearly — absence of public commitment is itself signal.
-This section is for Alex's personal decision-making only and will not appear in any application.
+This section is for the candidate's personal decision-making only and will not appear in any application.

-## Talking Points for Alex
+## Talking Points for {name}
 Five specific talking points for the phone screen. Each must:
- Reference a concrete experience from Alex's matched background by name
-  (UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus)
+- Reference a concrete experience from {name}'s matched background by name
+  (NDA rule: use the masked label shown in the matched experience section for any NDA-protected employer)
 - Connect to a specific signal from the JD or company context above
 - Be 1–2 sentences, ready to speak aloud
 - Never give generic advice
@ -432,7 +441,7 @@ Five specific talking points for the phone screen. Each must:
        "competitors_brief": sections.get("Funding & Market Position", ""),  # competitor landscape is in the funding section
        "red_flags":         sections.get("Red Flags & Watch-outs", ""),
        "accessibility_brief": sections.get("Inclusion & Accessibility", ""),
-        "talking_points":    sections.get("Talking Points for Alex", ""),
+        "talking_points":    sections.get(f"Talking Points for {name}", ""),
        "scrape_used":       scrape_used,
    }

--- a/scripts/finetune_local.py
+++ b/scripts/finetune_local.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # scripts/finetune_local.py
 """
-Local LoRA fine-tune on Alex's cover letter corpus.
+Local LoRA fine-tune on the candidate's cover letter corpus.
 No HuggingFace account or internet required after the base model is cached.

 Usage:
@ -17,24 +17,32 @@ import os
 import sys
 from pathlib import Path

+sys.path.insert(0, str(Path(__file__).parent.parent))
+
 # Limit CUDA to GPU 0. device_map={"":0} in FastLanguageModel.from_pretrained
 # pins every layer to GPU 0, avoiding the accelerate None-device bug that
 # occurs with device_map="auto" on multi-GPU machines with 4-bit quantisation.
 # Do NOT set WORLD_SIZE/RANK — that triggers torch.distributed initialisation.
 os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
 # ── Config ────────────────────────────────────────────────────────────────────
 DEFAULT_MODEL   = "unsloth/Llama-3.2-3B-Instruct"   # safe on 8 GB VRAM
-LETTERS_JSONL   = Path("/Library/Documents/JobSearch/training_data/cover_letters.jsonl")
-OUTPUT_DIR      = Path("/Library/Documents/JobSearch/training_data/finetune_output")
-GGUF_DIR        = Path("/Library/Documents/JobSearch/training_data/gguf")
-OLLAMA_NAME     = "alex-cover-writer"
+
+_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
+LETTERS_JSONL   = _docs / "training_data" / "cover_letters.jsonl"
+OUTPUT_DIR      = _docs / "training_data" / "finetune_output"
+GGUF_DIR        = _docs / "training_data" / "gguf"
+OLLAMA_NAME     = f"{_profile.name.split()[0].lower()}-cover-writer" if _profile else "cover-writer"

 SYSTEM_PROMPT = (
-    "You are Alex Rivera's personal cover letter writer. "
-    "Write professional, warm, and results-focused cover letters in Alex's voice. "
-    "Draw on her background in customer success, technical account management, "
-    "and revenue operations. Be specific and avoid generic filler."
+    f"You are {_profile.name}'s personal cover letter writer. "
+    f"{_profile.career_summary}"
+    if _profile else
+    "You are a professional cover letter writer. Write in first person."
 )

 # ── Args ──────────────────────────────────────────────────────────────────────
@ -48,7 +56,7 @@ parser.add_argument("--max-length", type=int, default=1024, help="Max token leng
 args = parser.parse_args()

 print(f"\n{'='*60}")
-print(f"  Alex Cover Letter Fine-Tuner")
+print(f"  Cover Letter Fine-Tuner  [{OLLAMA_NAME}]")
 print(f"  Base model : {args.model}")
 print(f"  Epochs     : {args.epochs}")
 print(f"  LoRA rank  : {args.rank}")
--- a/scripts/generate_cover_letter.py
+++ b/scripts/generate_cover_letter.py
@ -1,6 +1,6 @@
 # scripts/generate_cover_letter.py
 """
-Generate a cover letter in Alex's voice using few-shot examples from her corpus.
+Generate a cover letter in the candidate's voice using few-shot examples from their corpus.

 Usage:
    conda run -n job-seeker python scripts/generate_cover_letter.py \
@ -16,30 +16,21 @@ import re
 import sys
 from pathlib import Path

-LETTERS_DIR = Path("/Library/Documents/JobSearch")
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
+LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
 LETTER_GLOB = "*Cover Letter*.md"

-# Background injected into every prompt so the model has Alex's facts
-SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera, a customer success leader.
-
-Background:
- 6+ years in customer success, technical account management, and CS leadership
- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), managing enterprise + Fortune 500 accounts, drove NPS consistently above 95
- Also founder of M3 Consulting, a CS advisory practice for SaaS startups
- Attended Texas State (2 yrs), CSU East Bay (1 yr); completed degree elsewhere
- Based in San Francisco Bay Area; open to remote/hybrid
- Pronouns: any
-
-Voice guidelines:
- Warm, confident, and specific — never generic
- Opens with "I'm delighted/thrilled to apply for [role] at [company]."
- 3–4 focused paragraphs, ~250–350 words total
- Para 2: concrete experience (cite UpGuard and/or M3 Consulting with a specific metric)
- Para 3: genuine connection to THIS company's mission/product
- Closes with "Thank you for considering my application." + warm sign-off
- Never use: "I am writing to express my interest", "passionate about making a difference",
-  "I look forward to hearing from you", or any hollow filler phrases
-"""
+# Background injected into every prompt so the model has the candidate's facts
+SYSTEM_CONTEXT = (
+    f"You are writing cover letters for {_profile.name}. {_profile.career_summary}"
+    if _profile else
+    "You are a professional cover letter writer. Write in first person."
+)


 # ── Mission-alignment detection ───────────────────────────────────────────────
@ -69,21 +60,23 @@ _MISSION_SIGNALS: dict[str, list[str]] = {
    ],
 }

+_candidate = _profile.name if _profile else "the candidate"
+
 _MISSION_NOTES: dict[str, str] = {
    "music": (
-        "This company is in the music industry, which is one of Alex's genuinely "
-        "ideal work environments — she has a real personal passion for the music scene. "
+        f"This company is in the music industry, which is one of {_candidate}'s genuinely "
+        "ideal work environments — they have a real personal passion for the music scene. "
        "Para 3 should warmly and specifically reflect this authentic alignment, not as "
-        "a generic fan statement, but as an honest statement of where she'd love to apply "
-        "her CS skills."
+        "a generic fan statement, but as an honest statement of where they'd love to apply "
+        "their CS skills."
    ),
    "animal_welfare": (
-        "This organization works in animal welfare/rescue — one of Alex's dream-job "
+        f"This organization works in animal welfare/rescue — one of {_candidate}'s dream-job "
        "domains and a genuine personal passion. Para 3 should reflect this authentic "
-        "connection warmly and specifically, tying her CS skills to this mission."
+        "connection warmly and specifically, tying their CS skills to this mission."
    ),
    "education": (
-        "This company works in children's education or EdTech — one of Alex's ideal "
+        f"This company works in children's education or EdTech — one of {_candidate}'s ideal "
        "work domains, reflecting genuine personal values around learning and young people. "
        "Para 3 should reflect this authentic connection specifically and warmly."
    ),
@ -138,7 +131,7 @@ def build_prompt(
 ) -> str:
    parts = [SYSTEM_CONTEXT.strip(), ""]
    if examples:
-        parts.append("=== STYLE EXAMPLES (Alex's past letters) ===\n")
+        parts.append(f"=== STYLE EXAMPLES ({_candidate}'s past letters) ===\n")
        for i, ex in enumerate(examples, 1):
            parts.append(f"--- Example {i} ({ex['company']}) ---")
            parts.append(ex["text"])
@ -183,7 +176,7 @@ def generate(title: str, company: str, description: str = "", _router=None) -> s


 def main() -> None:
-    parser = argparse.ArgumentParser(description="Generate a cover letter in Alex's voice")
+    parser = argparse.ArgumentParser(description=f"Generate a cover letter in {_candidate}'s voice")
    parser.add_argument("--title", help="Job title")
    parser.add_argument("--company", help="Company name")
    parser.add_argument("--description", default="", help="Job description text")
--- a/scripts/match.py
+++ b/scripts/match.py
@ -18,8 +18,22 @@ import yaml
 from bs4 import BeautifulSoup
 from notion_client import Client

+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
 CONFIG_DIR = Path(__file__).parent.parent / "config"
-RESUME_PATH = Path("/Library/Documents/JobSearch/Alex_Rivera_Resume_02-19-2025.pdf")
+
+
+def _find_resume(docs_dir: Path) -> Path | None:
+    """Find the most recently modified PDF in docs_dir matching *resume* or *cv*."""
+    candidates = list(docs_dir.glob("*[Rr]esume*.pdf")) + list(docs_dir.glob("*[Cc][Vv]*.pdf"))
+    return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None
+
+
+RESUME_PATH = (
+    _find_resume(_profile.docs_dir) if _profile else None
+) or Path(__file__).parent.parent / "config" / "resume.pdf"


 def load_notion() -> tuple[Client, dict]:
--- a/scripts/prepare_training_data.py
+++ b/scripts/prepare_training_data.py
@ -1,6 +1,6 @@
 # scripts/prepare_training_data.py
 """
-Extract training pairs from Alex's cover letter corpus for LoRA fine-tuning.
+Extract training pairs from the candidate's cover letter corpus for LoRA fine-tuning.

 Outputs a JSONL file where each line is:
  {"instruction": "Write a cover letter for the [role] position at [company].",
@ -16,10 +16,17 @@ import re
 import sys
 from pathlib import Path

-LETTERS_DIR = Path("/Library/Documents/JobSearch")
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from scripts.user_profile import UserProfile
+_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
+_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
+
+_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
+LETTERS_DIR = _docs
 # Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter")
 LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"]
-DEFAULT_OUTPUT = LETTERS_DIR / "training_data" / "cover_letters.jsonl"
+DEFAULT_OUTPUT = _docs / "training_data" / "cover_letters.jsonl"

 # Patterns that appear in opening sentences to extract role
 ROLE_PATTERNS = [
--- a/tests/test_company_research.py
+++ b/tests/test_company_research.py
@ -64,16 +64,22 @@ def test_build_resume_context_top2_in_full():
 def test_build_resume_context_rest_condensed():
    """Remaining experiences appear as condensed one-liners, not full bullets."""
    ctx = _build_resume_context(RESUME, KEYWORDS, JD)
-    assert "Also in Alex" in ctx
+    assert "Also in" in ctx
    assert "Generic Co" in ctx
    # Generic Co bullets should NOT appear in full
    assert "Managed SMB portfolio" not in ctx


 def test_upguard_nda_low_score():
-    """UpGuard name replaced with 'enterprise security vendor' when score < 3."""
+    """UpGuard NDA rule: company masked when score < 3 and profile has NDA companies configured."""
+    from scripts.company_research import _profile
    ctx = _build_resume_context(RESUME, ["python", "kubernetes"], "python kubernetes devops")
-    assert "enterprise security vendor" in ctx
+    if _profile and _profile.is_nda("upguard"):
+        # Profile present with UpGuard NDA — company should be masked
+        assert "UpGuard" not in ctx
+    else:
+        # No profile or UpGuard not in NDA list — company name appears directly
+        assert "UpGuard" in ctx or "enterprise security vendor" in ctx or "previous employer" in ctx


 def test_load_resume_and_keywords_returns_lists():
--- a/tests/test_cover_letter.py
+++ b/tests/test_cover_letter.py
@ -89,17 +89,14 @@ def test_find_similar_letters_returns_top_k():


 def test_load_corpus_returns_list():
-    """load_corpus returns a list (may be empty if LETTERS_DIR absent, must not crash)."""
+    """load_corpus returns a list (empty if LETTERS_DIR absent) without crashing."""
    from scripts.generate_cover_letter import load_corpus, LETTERS_DIR

-    if LETTERS_DIR.exists():
    corpus = load_corpus()
    assert isinstance(corpus, list)
    if corpus:
        assert "company" in corpus[0]
        assert "text" in corpus[0]
-    else:
-        pytest.skip("LETTERS_DIR not present in this environment")


 def test_generate_calls_llm_router():