feat: extract hard-coded personal references from all scripts via UserProfile

Replace hard-coded paths (/Library/Documents/JobSearch), names (Alex Rivera),
NDA sets (_NDA_COMPANIES), and the scraper path with UserProfile-driven lookups.
Update tests to be profile-agnostic (no user.yaml in peregrine config dir).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
pyr0ball 2026-02-24 18:45:39 -08:00
parent 83ce120666
commit bc94a92681
7 changed files with 124 additions and 90 deletions

View file

@ -3,13 +3,13 @@
Pre-interview company research generator.
Three-phase approach:
1. If SearXNG is available (port 8888), use companyScraper.py to fetch live
1. If SearXNG is available, use companyScraper.py to fetch live
data: CEO name, HQ address, LinkedIn, contact info.
1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for
recent news snippets (funding, launches, leadership changes, etc.).
2. Feed all real data into an LLM prompt to synthesise a structured brief
covering company overview, leadership, recent developments, and talking
points tailored to Alex.
points tailored to the candidate.
Falls back to pure LLM knowledge when SearXNG is offline.
@ -24,25 +24,32 @@ from types import SimpleNamespace
sys.path.insert(0, str(Path(__file__).parent.parent))
# ── SearXNG scraper integration ───────────────────────────────────────────────
_SCRAPER_DIR = Path("/Library/Development/scrapers")
_SCRAPER_AVAILABLE = False
from scripts.user_profile import UserProfile
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
if _SCRAPER_DIR.exists():
sys.path.insert(0, str(_SCRAPER_DIR))
# ── SearXNG scraper integration ───────────────────────────────────────────────
# companyScraper is bundled into the Docker image at /app/scrapers/
_SCRAPER_AVAILABLE = False
for _scraper_candidate in [
Path("/app/scrapers"), # Docker container path
Path(__file__).parent.parent / "scrapers", # local dev fallback
]:
if _scraper_candidate.exists():
sys.path.insert(0, str(_scraper_candidate))
try:
from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
_SCRAPER_AVAILABLE = True
except (ImportError, SystemExit):
# companyScraper calls sys.exit(1) if bs4/fake-useragent aren't installed
pass
break
def _searxng_running() -> bool:
def _searxng_running(searxng_url: str = "http://localhost:8888") -> bool:
"""Quick check whether SearXNG is reachable."""
try:
import requests
r = requests.get("http://localhost:8888/", timeout=3)
r = requests.get(f"{searxng_url}/", timeout=3)
return r.status_code == 200
except Exception:
return False
@ -186,9 +193,13 @@ def _parse_sections(text: str) -> dict[str, str]:
_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
# Companies where Alex has an NDA — reference as generic label unless
# the role is security-focused (score >= 3 matching JD keywords).
_NDA_COMPANIES = {"upguard"}
def _company_label(exp: dict) -> str:
company = exp.get("company", "")
score = exp.get("score", 0)
if _profile:
return _profile.nda_label(company, score)
return company
def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
@ -214,8 +225,7 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
"""
Build the resume section of the LLM context block.
Top 2 scored experiences included in full detail; rest as one-liners.
Applies UpGuard NDA rule: reference as 'enterprise security vendor (NDA)'
unless the role is security-focused (score >= 3).
NDA companies are masked via UserProfile.nda_label() when score < threshold.
"""
experiences = resume.get("experience_details", [])
if not experiences:
@ -225,11 +235,7 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
top2 = scored[:2]
rest = scored[2:]
def _company_label(exp: dict) -> str:
company = exp.get("company", "")
if company.lower() in _NDA_COMPANIES and exp.get("score", 0) < 3:
return "enterprise security vendor (NDA)"
return company
candidate = _profile.name if _profile else "the candidate"
def _exp_header(exp: dict) -> str:
return f"{exp.get('position', '')} @ {_company_label(exp)} ({exp.get('employment_period', '')})"
@ -238,14 +244,14 @@ def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
bullets = [v for resp in exp.get("key_responsibilities", []) for v in resp.values()]
return "\n".join(f" - {b}" for b in bullets)
lines = ["## Alex's Matched Experience"]
lines = [f"## {candidate}'s Matched Experience"]
for exp in top2:
lines.append(f"\n**{_exp_header(exp)}** (match score: {exp['score']})")
lines.append(_exp_bullets(exp))
if rest:
condensed = ", ".join(_exp_header(e) for e in rest)
lines.append(f"\nAlso in Alex's background: {condensed}")
lines.append(f"\nAlso in {candidate}'s background: {condensed}")
return "\n".join(lines)
@ -359,7 +365,10 @@ def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict
# ── Phase 2: LLM synthesis ────────────────────────────────────────────────
_stage("Generating brief with LLM… (30–90 seconds)")
prompt = f"""You are preparing Alex Rivera for a job interview.
name = _profile.name if _profile else "the candidate"
career_summary = _profile.career_summary if _profile else ""
prompt = f"""You are preparing {name} for a job interview.
{f"Candidate background: {career_summary}" if career_summary else ""}
Role: **{title}** at **{company}**
@ -404,12 +413,12 @@ Assess {company}'s commitment to disability inclusion and accessibility. Cover:
- Any public disability/accessibility advocacy, partnerships, or certifications
- Glassdoor or press signals about how employees with disabilities experience the company
If no specific signals are found, say so clearly — absence of public commitment is itself a signal.
This section is for Alex's personal decision-making only and will not appear in any application.
This section is for the candidate's personal decision-making only and will not appear in any application.
## Talking Points for Alex
## Talking Points for {name}
Five specific talking points for the phone screen. Each must:
- Reference a concrete experience from Alex's matched background by name
(UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus)
- Reference a concrete experience from {name}'s matched background by name
(NDA rule: use the masked label shown in the matched experience section for any NDA-protected employer)
- Connect to a specific signal from the JD or company context above
- Be 1–2 sentences, ready to speak aloud
- Never give generic advice
@ -432,7 +441,7 @@ Five specific talking points for the phone screen. Each must:
"competitors_brief": sections.get("Funding & Market Position", ""), # competitor landscape is in the funding section
"red_flags": sections.get("Red Flags & Watch-outs", ""),
"accessibility_brief": sections.get("Inclusion & Accessibility", ""),
"talking_points": sections.get("Talking Points for Alex", ""),
"talking_points": sections.get(f"Talking Points for {name}", ""),
"scrape_used": scrape_used,
}

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3
# scripts/finetune_local.py
"""
Local LoRA fine-tune on Alex's cover letter corpus.
Local LoRA fine-tune on the candidate's cover letter corpus.
No HuggingFace account or internet required after the base model is cached.
Usage:
@ -17,24 +17,32 @@ import os
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
# Limit CUDA to GPU 0. device_map={"":0} in FastLanguageModel.from_pretrained
# pins every layer to GPU 0, avoiding the accelerate None-device bug that
# occurs with device_map="auto" on multi-GPU machines with 4-bit quantisation.
# Do NOT set WORLD_SIZE/RANK — that triggers torch.distributed initialisation.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
from scripts.user_profile import UserProfile
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
# ── Config ────────────────────────────────────────────────────────────────────
DEFAULT_MODEL = "unsloth/Llama-3.2-3B-Instruct" # safe on 8 GB VRAM
LETTERS_JSONL = Path("/Library/Documents/JobSearch/training_data/cover_letters.jsonl")
OUTPUT_DIR = Path("/Library/Documents/JobSearch/training_data/finetune_output")
GGUF_DIR = Path("/Library/Documents/JobSearch/training_data/gguf")
OLLAMA_NAME = "alex-cover-writer"
_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
LETTERS_JSONL = _docs / "training_data" / "cover_letters.jsonl"
OUTPUT_DIR = _docs / "training_data" / "finetune_output"
GGUF_DIR = _docs / "training_data" / "gguf"
OLLAMA_NAME = f"{_profile.name.split()[0].lower()}-cover-writer" if _profile else "cover-writer"
SYSTEM_PROMPT = (
"You are Alex Rivera's personal cover letter writer. "
"Write professional, warm, and results-focused cover letters in Alex's voice. "
"Draw on her background in customer success, technical account management, "
"and revenue operations. Be specific and avoid generic filler."
f"You are {_profile.name}'s personal cover letter writer. "
f"{_profile.career_summary}"
if _profile else
"You are a professional cover letter writer. Write in first person."
)
# ── Args ──────────────────────────────────────────────────────────────────────
@ -48,7 +56,7 @@ parser.add_argument("--max-length", type=int, default=1024, help="Max token leng
args = parser.parse_args()
print(f"\n{'='*60}")
print(f" Alex Cover Letter Fine-Tuner")
print(f" Cover Letter Fine-Tuner [{OLLAMA_NAME}]")
print(f" Base model : {args.model}")
print(f" Epochs : {args.epochs}")
print(f" LoRA rank : {args.rank}")

View file

@ -1,6 +1,6 @@
# scripts/generate_cover_letter.py
"""
Generate a cover letter in Alex's voice using few-shot examples from her corpus.
Generate a cover letter in the candidate's voice using few-shot examples from their corpus.
Usage:
conda run -n job-seeker python scripts/generate_cover_letter.py \
@ -16,30 +16,21 @@ import re
import sys
from pathlib import Path
LETTERS_DIR = Path("/Library/Documents/JobSearch")
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.user_profile import UserProfile
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
LETTERS_DIR = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
LETTER_GLOB = "*Cover Letter*.md"
# Background injected into every prompt so the model has Alex's facts
SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera, a customer success leader.
Background:
- 6+ years in customer success, technical account management, and CS leadership
- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), managing enterprise + Fortune 500 accounts, drove NPS consistently above 95
- Also founder of M3 Consulting, a CS advisory practice for SaaS startups
- Attended Texas State (2 yrs), CSU East Bay (1 yr); completed degree elsewhere
- Based in San Francisco Bay Area; open to remote/hybrid
- Pronouns: any
Voice guidelines:
- Warm, confident, and specific — never generic
- Opens with "I'm delighted/thrilled to apply for [role] at [company]."
- 3–4 focused paragraphs, ~250–350 words total
- Para 2: concrete experience (cite UpGuard and/or M3 Consulting with a specific metric)
- Para 3: genuine connection to THIS company's mission/product
- Closes with "Thank you for considering my application." + warm sign-off
- Never use: "I am writing to express my interest", "passionate about making a difference",
"I look forward to hearing from you", or any hollow filler phrases
"""
# Background injected into every prompt so the model has the candidate's facts
SYSTEM_CONTEXT = (
f"You are writing cover letters for {_profile.name}. {_profile.career_summary}"
if _profile else
"You are a professional cover letter writer. Write in first person."
)
# ── Mission-alignment detection ───────────────────────────────────────────────
@ -69,21 +60,23 @@ _MISSION_SIGNALS: dict[str, list[str]] = {
],
}
_candidate = _profile.name if _profile else "the candidate"
_MISSION_NOTES: dict[str, str] = {
"music": (
"This company is in the music industry, which is one of Alex's genuinely "
"ideal work environments — she has a real personal passion for the music scene. "
f"This company is in the music industry, which is one of {_candidate}'s genuinely "
"ideal work environments — they have a real personal passion for the music scene. "
"Para 3 should warmly and specifically reflect this authentic alignment, not as "
"a generic fan statement, but as an honest statement of where she'd love to apply "
"her CS skills."
"a generic fan statement, but as an honest statement of where they'd love to apply "
"their CS skills."
),
"animal_welfare": (
"This organization works in animal welfare/rescue — one of Alex's dream-job "
f"This organization works in animal welfare/rescue — one of {_candidate}'s dream-job "
"domains and a genuine personal passion. Para 3 should reflect this authentic "
"connection warmly and specifically, tying her CS skills to this mission."
"connection warmly and specifically, tying their CS skills to this mission."
),
"education": (
"This company works in children's education or EdTech — one of Alex's ideal "
f"This company works in children's education or EdTech — one of {_candidate}'s ideal "
"work domains, reflecting genuine personal values around learning and young people. "
"Para 3 should reflect this authentic connection specifically and warmly."
),
@ -138,7 +131,7 @@ def build_prompt(
) -> str:
parts = [SYSTEM_CONTEXT.strip(), ""]
if examples:
parts.append("=== STYLE EXAMPLES (Alex's past letters) ===\n")
parts.append(f"=== STYLE EXAMPLES ({_candidate}'s past letters) ===\n")
for i, ex in enumerate(examples, 1):
parts.append(f"--- Example {i} ({ex['company']}) ---")
parts.append(ex["text"])
@ -183,7 +176,7 @@ def generate(title: str, company: str, description: str = "", _router=None) -> s
def main() -> None:
parser = argparse.ArgumentParser(description="Generate a cover letter in Alex's voice")
parser = argparse.ArgumentParser(description=f"Generate a cover letter in {_candidate}'s voice")
parser.add_argument("--title", help="Job title")
parser.add_argument("--company", help="Company name")
parser.add_argument("--description", default="", help="Job description text")

View file

@ -18,8 +18,22 @@ import yaml
from bs4 import BeautifulSoup
from notion_client import Client
from scripts.user_profile import UserProfile
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
CONFIG_DIR = Path(__file__).parent.parent / "config"
RESUME_PATH = Path("/Library/Documents/JobSearch/Alex_Rivera_Resume_02-19-2025.pdf")
def _find_resume(docs_dir: Path) -> Path | None:
"""Find the most recently modified PDF in docs_dir matching *resume* or *cv*."""
candidates = list(docs_dir.glob("*[Rr]esume*.pdf")) + list(docs_dir.glob("*[Cc][Vv]*.pdf"))
return max(candidates, key=lambda p: p.stat().st_mtime) if candidates else None
RESUME_PATH = (
_find_resume(_profile.docs_dir) if _profile else None
) or Path(__file__).parent.parent / "config" / "resume.pdf"
def load_notion() -> tuple[Client, dict]:

View file

@ -1,6 +1,6 @@
# scripts/prepare_training_data.py
"""
Extract training pairs from Alex's cover letter corpus for LoRA fine-tuning.
Extract training pairs from the candidate's cover letter corpus for LoRA fine-tuning.
Outputs a JSONL file where each line is:
{"instruction": "Write a cover letter for the [role] position at [company].",
@ -16,10 +16,17 @@ import re
import sys
from pathlib import Path
LETTERS_DIR = Path("/Library/Documents/JobSearch")
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.user_profile import UserProfile
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
LETTERS_DIR = _docs
# Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter")
LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"]
DEFAULT_OUTPUT = LETTERS_DIR / "training_data" / "cover_letters.jsonl"
DEFAULT_OUTPUT = _docs / "training_data" / "cover_letters.jsonl"
# Patterns that appear in opening sentences to extract role
ROLE_PATTERNS = [

View file

@ -64,16 +64,22 @@ def test_build_resume_context_top2_in_full():
def test_build_resume_context_rest_condensed():
"""Remaining experiences appear as condensed one-liners, not full bullets."""
ctx = _build_resume_context(RESUME, KEYWORDS, JD)
assert "Also in Alex" in ctx
assert "Also in" in ctx
assert "Generic Co" in ctx
# Generic Co bullets should NOT appear in full
assert "Managed SMB portfolio" not in ctx
def test_upguard_nda_low_score():
"""UpGuard name replaced with 'enterprise security vendor' when score < 3."""
"""UpGuard NDA rule: company masked when score < 3 and profile has NDA companies configured."""
from scripts.company_research import _profile
ctx = _build_resume_context(RESUME, ["python", "kubernetes"], "python kubernetes devops")
assert "enterprise security vendor" in ctx
if _profile and _profile.is_nda("upguard"):
# Profile present with UpGuard NDA — company should be masked
assert "UpGuard" not in ctx
else:
# No profile or UpGuard not in NDA list — company name appears directly
assert "UpGuard" in ctx or "enterprise security vendor" in ctx or "previous employer" in ctx
def test_load_resume_and_keywords_returns_lists():

View file

@ -89,17 +89,14 @@ def test_find_similar_letters_returns_top_k():
def test_load_corpus_returns_list():
"""load_corpus returns a list (may be empty if LETTERS_DIR absent, must not crash)."""
"""load_corpus returns a list (empty if LETTERS_DIR absent) without crashing."""
from scripts.generate_cover_letter import load_corpus, LETTERS_DIR
if LETTERS_DIR.exists():
corpus = load_corpus()
assert isinstance(corpus, list)
if corpus:
assert "company" in corpus[0]
assert "text" in corpus[0]
else:
pytest.skip("LETTERS_DIR not present in this environment")
def test_generate_calls_llm_router():