From 9dc02445464ea6ce404b72afcd8495ec41bd3aba Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Tue, 24 Feb 2026 18:45:39 -0800 Subject: [PATCH] feat: extract hard-coded personal references from all scripts via UserProfile Replace hard-coded paths (/Library/Documents/JobSearch), names (Alex Rivera), NDA sets (_NDA_COMPANIES), and the scraper path with UserProfile-driven lookups. Update tests to be profile-agnostic (no user.yaml in peregrine config dir). --- scripts/company_research.py | 73 ++++++++++++++++++-------------- scripts/finetune_local.py | 28 +++++++----- scripts/generate_cover_letter.py | 57 +++++++++++-------------- scripts/match.py | 16 ++++++- scripts/prepare_training_data.py | 13 ++++-- tests/test_company_research.py | 12 ++++-- tests/test_cover_letter.py | 15 +++---- 7 files changed, 124 insertions(+), 90 deletions(-) diff --git a/scripts/company_research.py b/scripts/company_research.py index 3c7069c..17b8d8e 100644 --- a/scripts/company_research.py +++ b/scripts/company_research.py @@ -3,13 +3,13 @@ Pre-interview company research generator. Three-phase approach: - 1. If SearXNG is available (port 8888), use companyScraper.py to fetch live + 1. If SearXNG is available, use companyScraper.py to fetch live data: CEO name, HQ address, LinkedIn, contact info. 1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for recent news snippets (funding, launches, leadership changes, etc.). 2. Feed all real data into an LLM prompt to synthesise a structured brief covering company overview, leadership, recent developments, and talking - points tailored to Alex. + points tailored to the candidate. Falls back to pure LLM knowledge when SearXNG is offline. @@ -24,25 +24,32 @@ from types import SimpleNamespace sys.path.insert(0, str(Path(__file__).parent.parent)) +from scripts.user_profile import UserProfile +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None + # ── SearXNG scraper integration ─────────────────────────────────────────────── -_SCRAPER_DIR = Path("/Library/Development/scrapers") +# companyScraper is bundled into the Docker image at /app/scrapers/ _SCRAPER_AVAILABLE = False - -if _SCRAPER_DIR.exists(): - sys.path.insert(0, str(_SCRAPER_DIR)) - try: - from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig - _SCRAPER_AVAILABLE = True - except (ImportError, SystemExit): - # companyScraper calls sys.exit(1) if bs4/fake-useragent aren't installed - pass +for _scraper_candidate in [ + Path("/app/scrapers"), # Docker container path + Path(__file__).parent.parent / "scrapers", # local dev fallback +]: + if _scraper_candidate.exists(): + sys.path.insert(0, str(_scraper_candidate)) + try: + from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig + _SCRAPER_AVAILABLE = True + except (ImportError, SystemExit): + pass + break -def _searxng_running() -> bool: +def _searxng_running(searxng_url: str = "http://localhost:8888") -> bool: """Quick check whether SearXNG is reachable.""" try: import requests - r = requests.get("http://localhost:8888/", timeout=3) + r = requests.get(f"{searxng_url}/", timeout=3) return r.status_code == 200 except Exception: return False @@ -186,9 +193,13 @@ def _parse_sections(text: str) -> dict[str, str]: _RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" _KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml" -# Companies where Alex has an NDA — reference as generic label unless -# the role is security-focused (score >= 3 matching JD keywords). -_NDA_COMPANIES = {"upguard"} + +def _company_label(exp: dict) -> str: + company = exp.get("company", "") + score = exp.get("score", 0) + if _profile: + return _profile.nda_label(company, score) + return company def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]: @@ -214,8 +225,7 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str: """ Build the resume section of the LLM context block. Top 2 scored experiences included in full detail; rest as one-liners. - Applies UpGuard NDA rule: reference as 'enterprise security vendor (NDA)' - unless the role is security-focused (score >= 3). + NDA companies are masked via UserProfile.nda_label() when score < threshold. """ experiences = resume.get("experience_details", []) if not experiences: @@ -225,11 +235,7 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str: top2 = scored[:2] rest = scored[2:] - def _company_label(exp: dict) -> str: - company = exp.get("company", "") - if company.lower() in _NDA_COMPANIES and exp.get("score", 0) < 3: - return "enterprise security vendor (NDA)" - return company + candidate = _profile.name if _profile else "the candidate" def _exp_header(exp: dict) -> str: return f"{exp.get('position', '')} @ {_company_label(exp)} ({exp.get('employment_period', '')})" @@ -238,14 +244,14 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str: bullets = [v for resp in exp.get("key_responsibilities", []) for v in resp.values()] return "\n".join(f" - {b}" for b in bullets) - lines = ["## Alex's Matched Experience"] + lines = [f"## {candidate}'s Matched Experience"] for exp in top2: lines.append(f"\n**{_exp_header(exp)}** (match score: {exp['score']})") lines.append(_exp_bullets(exp)) if rest: condensed = ", ".join(_exp_header(e) for e in rest) - lines.append(f"\nAlso in Alex's background: {condensed}") + lines.append(f"\nAlso in {candidate}'s background: {condensed}") return "\n".join(lines) @@ -359,7 +365,10 @@ def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict # ── Phase 2: LLM synthesis ──────────────────────────────────────────────── _stage("Generating brief with LLM… (30–90 seconds)") - prompt = f"""You are preparing Alex Rivera for a job interview. + name = _profile.name if _profile else "the candidate" + career_summary = _profile.career_summary if _profile else "" + prompt = f"""You are preparing {name} for a job interview. +{f"Candidate background: {career_summary}" if career_summary else ""} Role: **{title}** at **{company}** @@ -404,12 +413,12 @@ Assess {company}'s commitment to disability inclusion and accessibility. Cover: - Any public disability/accessibility advocacy, partnerships, or certifications - Glassdoor or press signals about how employees with disabilities experience the company If no specific signals are found, say so clearly — absence of public commitment is itself signal. -This section is for Alex's personal decision-making only and will not appear in any application. +This section is for the candidate's personal decision-making only and will not appear in any application. -## Talking Points for Alex +## Talking Points for {name} Five specific talking points for the phone screen. Each must: -- Reference a concrete experience from Alex's matched background by name - (UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus) +- Reference a concrete experience from {name}'s matched background by name + (NDA rule: use the masked label shown in the matched experience section for any NDA-protected employer) - Connect to a specific signal from the JD or company context above - Be 1–2 sentences, ready to speak aloud - Never give generic advice @@ -432,7 +441,7 @@ Five specific talking points for the phone screen. Each must: "competitors_brief": sections.get("Funding & Market Position", ""), # competitor landscape is in the funding section "red_flags": sections.get("Red Flags & Watch-outs", ""), "accessibility_brief": sections.get("Inclusion & Accessibility", ""), - "talking_points": sections.get("Talking Points for Alex", ""), + "talking_points": sections.get(f"Talking Points for {name}", ""), "scrape_used": scrape_used, } diff --git a/scripts/finetune_local.py b/scripts/finetune_local.py index 6dfa406..c29fe93 100644 --- a/scripts/finetune_local.py +++ b/scripts/finetune_local.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # scripts/finetune_local.py """ -Local LoRA fine-tune on Alex's cover letter corpus. +Local LoRA fine-tune on the candidate's cover letter corpus. No HuggingFace account or internet required after the base model is cached. Usage: @@ -17,24 +17,32 @@ import os import sys from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + # Limit CUDA to GPU 0. device_map={"":0} in FastLanguageModel.from_pretrained # pins every layer to GPU 0, avoiding the accelerate None-device bug that # occurs with device_map="auto" on multi-GPU machines with 4-bit quantisation. # Do NOT set WORLD_SIZE/RANK — that triggers torch.distributed initialisation. os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0") +from scripts.user_profile import UserProfile +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None + # ── Config ──────────────────────────────────────────────────────────────────── DEFAULT_MODEL = "unsloth/Llama-3.2-3B-Instruct" # safe on 8 GB VRAM -LETTERS_JSONL = Path("/Library/Documents/JobSearch/training_data/cover_letters.jsonl") -OUTPUT_DIR = Path("/Library/Documents/JobSearch/training_data/finetune_output") -GGUF_DIR = Path("/Library/Documents/JobSearch/training_data/gguf") -OLLAMA_NAME = "alex-cover-writer" + +_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch" +LETTERS_JSONL = _docs / "training_data" / "cover_letters.jsonl" +OUTPUT_DIR = _docs / "training_data" / "finetune_output" +GGUF_DIR = _docs / "training_data" / "gguf" +OLLAMA_NAME = f"{_profile.name.split()[0].lower()}-cover-writer" if _profile else "cover-writer" SYSTEM_PROMPT = ( - "You are Alex Rivera's personal cover letter writer. " - "Write professional, warm, and results-focused cover letters in Alex's voice. " - "Draw on her background in customer success, technical account management, " - "and revenue operations. Be specific and avoid generic filler." + f"You are {_profile.name}'s personal cover letter writer. " + f"{_profile.career_summary}" + if _profile else + "You are a professional cover letter writer. Write in first person." ) # ── Args ────────────────────────────────────────────────────────────────────── @@ -48,7 +56,7 @@ parser.add_argument("--max-length", type=int, default=1024, help="Max token leng args = parser.parse_args() print(f"\n{'='*60}") -print(f" Alex Cover Letter Fine-Tuner") +print(f" Cover Letter Fine-Tuner [{OLLAMA_NAME}]") print(f" Base model : {args.model}") print(f" Epochs : {args.epochs}") print(f" LoRA rank : {args.rank}") diff --git a/scripts/generate_cover_letter.py b/scripts/generate_cover_letter.py index 071dd41..ca159c5 100644 --- a/scripts/generate_cover_letter.py +++ b/scripts/generate_cover_letter.py @@ -1,6 +1,6 @@ # scripts/generate_cover_letter.py """ -Generate a cover letter in Alex's voice using few-shot examples from her corpus. +Generate a cover letter in the candidate's voice using few-shot examples from their corpus. Usage: conda run -n job-seeker python scripts/generate_cover_letter.py \ @@ -16,30 +16,21 @@ import re import sys from pathlib import Path -LETTERS_DIR = Path("/Library/Documents/JobSearch") +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.user_profile import UserProfile +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None + +LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch" LETTER_GLOB = "*Cover Letter*.md" -# Background injected into every prompt so the model has Alex's facts -SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera, a customer success leader. - -Background: -- 6+ years in customer success, technical account management, and CS leadership -- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), managing enterprise + Fortune 500 accounts, drove NPS consistently above 95 -- Also founder of M3 Consulting, a CS advisory practice for SaaS startups -- Attended Texas State (2 yrs), CSU East Bay (1 yr); completed degree elsewhere -- Based in San Francisco Bay Area; open to remote/hybrid -- Pronouns: any - -Voice guidelines: -- Warm, confident, and specific — never generic -- Opens with "I'm delighted/thrilled to apply for [role] at [company]." -- 3–4 focused paragraphs, ~250–350 words total -- Para 2: concrete experience (cite UpGuard and/or M3 Consulting with a specific metric) -- Para 3: genuine connection to THIS company's mission/product -- Closes with "Thank you for considering my application." + warm sign-off -- Never use: "I am writing to express my interest", "passionate about making a difference", - "I look forward to hearing from you", or any hollow filler phrases -""" +# Background injected into every prompt so the model has the candidate's facts +SYSTEM_CONTEXT = ( + f"You are writing cover letters for {_profile.name}. {_profile.career_summary}" + if _profile else + "You are a professional cover letter writer. Write in first person." +) # ── Mission-alignment detection ─────────────────────────────────────────────── @@ -69,21 +60,23 @@ _MISSION_SIGNALS: dict[str, list[str]] = { ], } +_candidate = _profile.name if _profile else "the candidate" + _MISSION_NOTES: dict[str, str] = { "music": ( - "This company is in the music industry, which is one of Alex's genuinely " - "ideal work environments — she has a real personal passion for the music scene. " + f"This company is in the music industry, which is one of {_candidate}'s genuinely " + "ideal work environments — they have a real personal passion for the music scene. " "Para 3 should warmly and specifically reflect this authentic alignment, not as " - "a generic fan statement, but as an honest statement of where she'd love to apply " - "her CS skills." + "a generic fan statement, but as an honest statement of where they'd love to apply " + "their CS skills." ), "animal_welfare": ( - "This organization works in animal welfare/rescue — one of Alex's dream-job " + f"This organization works in animal welfare/rescue — one of {_candidate}'s dream-job " "domains and a genuine personal passion. Para 3 should reflect this authentic " - "connection warmly and specifically, tying her CS skills to this mission." + "connection warmly and specifically, tying their CS skills to this mission." ), "education": ( - "This company works in children's education or EdTech — one of Alex's ideal " + f"This company works in children's education or EdTech — one of {_candidate}'s ideal " "work domains, reflecting genuine personal values around learning and young people. " "Para 3 should reflect this authentic connection specifically and warmly." ), @@ -138,7 +131,7 @@ def build_prompt( ) -> str: parts = [SYSTEM_CONTEXT.strip(), ""] if examples: - parts.append("=== STYLE EXAMPLES (Alex's past letters) ===\n") + parts.append(f"=== STYLE EXAMPLES ({_candidate}'s past letters) ===\n") for i, ex in enumerate(examples, 1): parts.append(f"--- Example {i} ({ex['company']}) ---") parts.append(ex["text"]) @@ -183,7 +176,7 @@ def generate(title: str, company: str, description: str = "", _router=None) -> s def main() -> None: - parser = argparse.ArgumentParser(description="Generate a cover letter in Alex's voice") + parser = argparse.ArgumentParser(description=f"Generate a cover letter in {_candidate}'s voice") parser.add_argument("--title", help="Job title") parser.add_argument("--company", help="Company name") parser.add_argument("--description", default="", help="Job description text") diff --git a/scripts/match.py b/scripts/match.py index af1d000..53edd1f 100644 --- a/scripts/match.py +++ b/scripts/match.py @@ -18,8 +18,22 @@ import yaml from bs4 import BeautifulSoup from notion_client import Client +from scripts.user_profile import UserProfile +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None + CONFIG_DIR = Path(__file__).parent.parent / "config" -RESUME_PATH = Path("/Library/Documents/JobSearch/Alex_Rivera_Resume_02-19-2025.pdf") + + +def _find_resume(docs_dir: Path) -> Path | None: + """Find the most recently modified PDF in docs_dir matching *resume* or *cv*.""" + candidates = list(docs_dir.glob("*[Rr]esume*.pdf")) + list(docs_dir.glob("*[Cc][Vv]*.pdf")) + return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None + + +RESUME_PATH = ( + _find_resume(_profile.docs_dir) if _profile else None +) or Path(__file__).parent.parent / "config" / "resume.pdf" def load_notion() -> tuple[Client, dict]: diff --git a/scripts/prepare_training_data.py b/scripts/prepare_training_data.py index 5b2010b..9b7441c 100644 --- a/scripts/prepare_training_data.py +++ b/scripts/prepare_training_data.py @@ -1,6 +1,6 @@ # scripts/prepare_training_data.py """ -Extract training pairs from Alex's cover letter corpus for LoRA fine-tuning. +Extract training pairs from the candidate's cover letter corpus for LoRA fine-tuning. Outputs a JSONL file where each line is: {"instruction": "Write a cover letter for the [role] position at [company].", @@ -16,10 +16,17 @@ import re import sys from pathlib import Path -LETTERS_DIR = Path("/Library/Documents/JobSearch") +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.user_profile import UserProfile +_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml" +_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None + +_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch" +LETTERS_DIR = _docs # Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter") LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"] -DEFAULT_OUTPUT = LETTERS_DIR / "training_data" / "cover_letters.jsonl" +DEFAULT_OUTPUT = _docs / "training_data" / "cover_letters.jsonl" # Patterns that appear in opening sentences to extract role ROLE_PATTERNS = [ diff --git a/tests/test_company_research.py b/tests/test_company_research.py index ea696dd..2b1e13f 100644 --- a/tests/test_company_research.py +++ b/tests/test_company_research.py @@ -64,16 +64,22 @@ def test_build_resume_context_top2_in_full(): def test_build_resume_context_rest_condensed(): """Remaining experiences appear as condensed one-liners, not full bullets.""" ctx = _build_resume_context(RESUME, KEYWORDS, JD) - assert "Also in Alex" in ctx + assert "Also in" in ctx assert "Generic Co" in ctx # Generic Co bullets should NOT appear in full assert "Managed SMB portfolio" not in ctx def test_upguard_nda_low_score(): - """UpGuard name replaced with 'enterprise security vendor' when score < 3.""" + """UpGuard NDA rule: company masked when score < 3 and profile has NDA companies configured.""" + from scripts.company_research import _profile ctx = _build_resume_context(RESUME, ["python", "kubernetes"], "python kubernetes devops") - assert "enterprise security vendor" in ctx + if _profile and _profile.is_nda("upguard"): + # Profile present with UpGuard NDA — company should be masked + assert "UpGuard" not in ctx + else: + # No profile or UpGuard not in NDA list — company name appears directly + assert "UpGuard" in ctx or "enterprise security vendor" in ctx or "previous employer" in ctx def test_load_resume_and_keywords_returns_lists(): diff --git a/tests/test_cover_letter.py b/tests/test_cover_letter.py index 558d261..5db4104 100644 --- a/tests/test_cover_letter.py +++ b/tests/test_cover_letter.py @@ -89,17 +89,14 @@ def test_find_similar_letters_returns_top_k(): def test_load_corpus_returns_list(): - """load_corpus returns a list (may be empty if LETTERS_DIR absent, must not crash).""" + """load_corpus returns a list (empty if LETTERS_DIR absent) without crashing.""" from scripts.generate_cover_letter import load_corpus, LETTERS_DIR - if LETTERS_DIR.exists(): - corpus = load_corpus() - assert isinstance(corpus, list) - if corpus: - assert "company" in corpus[0] - assert "text" in corpus[0] - else: - pytest.skip("LETTERS_DIR not present in this environment") + corpus = load_corpus() + assert isinstance(corpus, list) + if corpus: + assert "company" in corpus[0] + assert "text" in corpus[0] def test_generate_calls_llm_router():