# scripts/resume_optimizer.py: full pipeline (extract_jd_signals → prioritize_gaps → rewrite_for_ats → hallucination_check)
# scripts/db.py: add optimized_resume + ats_gap_report columns + save_optimized_resume / get_optimized_resume helpers
# tests/test_resume_optimizer.py: 17 unit tests; patches at source module (scripts.llm_router.LLMRouter), not consumer
# Tier gate: gap report is free; full LLM rewrite is paid+.
"""
ATS Resume Optimizer — rewrite a candidate's resume to maximize keyword match
for a specific job description without fabricating experience.

Tier behaviour:
    Free    → gap report only (extract_jd_signals + prioritize_gaps, no LLM rewrite)
    Paid    → full LLM rewrite targeting the JD (rewrite_for_ats)
    Premium → same as paid for now; fine-tuned voice model is a future enhancement

Pipeline:
    job.description
        → extract_jd_signals()     # TF-IDF gaps + LLM-extracted ATS signals
        → prioritize_gaps()        # rank by impact, map to resume sections
        → rewrite_for_ats()        # per-section LLM rewrite (paid+)
        → hallucination_check()    # reject rewrites that invent new experience
"""
from __future__ import annotations

import json
import logging
import re
from pathlib import Path
from typing import Any

log = logging.getLogger(__name__)


# ── Signal extraction ─────────────────────────────────────────────────────────
def extract_jd_signals(description: str, resume_text: str = "") -> list[str]:
    """Return ATS keyword signals from a job description.

    Combines two sources:
      1. TF-IDF keyword gaps from match.py (fast, deterministic, no LLM cost)
      2. LLM extraction for phrasing nuance TF-IDF misses (e.g. "cross-functional"
         vs "cross-team", "led" vs "managed")

    Falls back to TF-IDF-only if the LLM is unavailable; every failure path is
    logged and swallowed, so callers never see an exception from here.

    Args:
        description: Raw job description text.
        resume_text: Candidate's resume text (used to compute gap vs. already present).

    Returns:
        Deduplicated list of ATS keyword signals, most impactful first
        (LLM-phrased signals precede raw TF-IDF gap terms).
    """
    # Phase 1: deterministic TF-IDF gaps (always available, no LLM cost)
    tfidf_gaps: list[str] = []
    if resume_text:
        try:
            from scripts.match import match_score
            _, tfidf_gaps = match_score(resume_text, description)
        except Exception:
            log.warning("[resume_optimizer] TF-IDF gap extraction failed", exc_info=True)

    # Phase 2: LLM extraction for phrasing/qualifier nuance
    llm_signals: list[str] = []
    try:
        from scripts.llm_router import LLMRouter
        prompt = (
            "Extract the most important ATS (applicant tracking system) keywords and "
            "phrases from this job description. Focus on:\n"
            "- Required skills and technologies (exact phrasing matters)\n"
            "- Action verbs used to describe responsibilities\n"
            "- Qualification signals ('required', 'must have', 'preferred')\n"
            "- Industry-specific terminology\n\n"
            "Return a JSON array of strings only. No explanation.\n\n"
            f"Job description:\n{description[:3000]}"
        )
        raw = LLMRouter().complete(prompt)
        # Extract JSON array from response (LLM may wrap it in markdown)
        match = re.search(r"\[.*\]", raw, re.DOTALL)
        if match:
            parsed = json.loads(match.group(0))
            # FIX: json.loads() may return a dict (iterating yields its keys —
            # all strings, so the old filter passed them through silently) or
            # any other JSON value. Accept only a genuine list.
            if isinstance(parsed, list):
                llm_signals = [s.strip() for s in parsed if isinstance(s, str) and s.strip()]
    except Exception:
        log.warning("[resume_optimizer] LLM signal extraction failed", exc_info=True)

    # Merge: LLM signals first (richer phrasing), TF-IDF fills gaps.
    # Case-insensitive dedupe; first occurrence wins.
    seen: set[str] = set()
    merged: list[str] = []
    for term in llm_signals + tfidf_gaps:
        key = term.lower()
        if key not in seen:
            seen.add(key)
            merged.append(term)

    return merged
# ── Gap prioritization ────────────────────────────────────────────────────────

# Map each gap term to the resume section where it will carry the most ATS
# weight. ATS engines score keywords differently by section:
#   skills     — direct keyword match, highest density, indexed first
#   summary    — executive-summary keywords often lift the overall relevance score
#   experience — verbs + outcomes in bullet points; adds context weight
_SECTION_KEYWORDS: dict[str, list[str]] = {
    "skills": [
        "python", "sql", "java", "typescript", "react", "vue",
        "docker", "kubernetes", "aws", "gcp", "azure", "terraform",
        "ci/cd", "git", "postgresql", "redis", "kafka", "spark",
        "tableau", "salesforce", "jira", "figma", "excel", "powerpoint",
        "machine learning", "llm", "deep learning", "pytorch",
        "tensorflow", "scikit-learn",
    ],
    "summary": [
        "leadership", "strategy", "vision", "executive", "director",
        "vp", "growth", "transformation", "stakeholder",
        "cross-functional", "p&l", "revenue", "budget", "board",
        "c-suite",
    ],
}
def prioritize_gaps(gaps: list[str], resume_sections: dict[str, Any]) -> list[dict]:
    """Rank keyword gaps by ATS impact and map each to a target resume section.

    Heuristics, highest impact first:
      - Terms matching the technical skills list → "skills", priority 1
        (direct keyword density, indexed first by ATS parsers).
      - Terms matching leadership/executive signals → "summary", priority 1.
      - Remaining multi-word phrases → "experience", priority 2 (more specific
        than single keywords, so they carry more ATS weight in bullets).
      - Remaining single words → "experience", priority 3 (coverage only).
    Terms already present anywhere in the resume (case-insensitive substring)
    are dropped entirely.

    Args:
        gaps: List of missing keyword signals from extract_jd_signals().
        resume_sections: Structured resume dict from resume_parser.parse_resume().

    Returns:
        List of dicts, sorted highest priority first (priority number
        ascending, 1=high), input order preserved within ties (stable sort):
            {
                "term": str,        # the keyword/phrase to inject
                "section": str,     # target section ("skills", "summary", "experience")
                "priority": int,    # 1=high, 2=medium, 3=low
                "rationale": str,   # why this section was chosen
            }
    """
    existing_text = _flatten_resume_text(resume_sections).lower()

    prioritized: list[dict] = []
    for term in gaps:
        # FIX: lowercase once per term (was computed twice: once for the
        # presence check, again for the section match below).
        term_lower = term.lower()

        # Skip terms already present anywhere in the resume
        if term_lower in existing_text:
            continue

        # REVIEW: _SECTION_KEYWORDS lists are tech-centric; domain-specific roles
        # (creative, healthcare, operations) may over-route to experience.
        # Consider expanding the lists or making them config-driven.

        # Partial-match in both directions: handles "PostgreSQL" vs "postgresql",
        # "AWS Lambda" vs "aws", etc.
        skills_match = any(kw in term_lower or term_lower in kw
                           for kw in _SECTION_KEYWORDS["skills"])
        summary_match = any(kw in term_lower or term_lower in kw
                            for kw in _SECTION_KEYWORDS["summary"])

        if skills_match:
            section = "skills"
            priority = 1
            rationale = "matched technical skills list — highest ATS keyword density"
        elif summary_match:
            section = "summary"
            priority = 1
            rationale = "matched leadership/executive signals — boosts overall relevance score"
        elif len(term.split()) > 1:
            section = "experience"
            priority = 2
            rationale = "multi-word phrase — more specific than single keywords, context weight in bullets"
        else:
            section = "experience"
            priority = 3
            rationale = "single generic term — lowest ATS impact, added to experience for coverage"

        prioritized.append({
            "term": term,
            "section": section,
            "priority": priority,
            "rationale": rationale,
        })

    # Stable sort: priority ascending, original gap order preserved within ties.
    prioritized.sort(key=lambda item: item["priority"])
    return prioritized
def _flatten_resume_text(resume: dict[str, Any]) -> str:
    """Concatenate all text from a structured resume dict into one searchable string.

    Robust to keys that are missing AND to keys explicitly set to None:
    ``dict.get(key, default)`` only applies the default when the key is absent,
    so every lookup is coerced with ``or`` instead.
    """
    parts: list[str] = []
    parts.append(resume.get("career_summary") or "")
    # FIX: .get("skills", []) still returns None when the key is present but
    # null — coerce with `or []` so extend()/append() cannot see None.
    parts.extend(resume.get("skills") or [])
    for exp in resume.get("experience") or []:
        parts.append(exp.get("title") or "")
        parts.append(exp.get("company") or "")
        parts.extend(exp.get("bullets") or [])
    for edu in resume.get("education") or []:
        parts.append(edu.get("degree") or "")
        parts.append(edu.get("field") or "")
        parts.append(edu.get("institution") or "")
    parts.extend(resume.get("achievements") or [])
    return " ".join(parts)
# ── LLM rewrite ───────────────────────────────────────────────────────────────

def rewrite_for_ats(
    resume: dict[str, Any],
    prioritized_gaps: list[dict],
    job: dict[str, Any],
    candidate_voice: str = "",
) -> dict[str, Any]:
    """Rewrite resume sections to naturally incorporate ATS keyword gaps.

    Works one section at a time: gaps are grouped by target section, and each
    group drives a single focused LLM prompt containing only that section's
    terms. The no-fabrication constraint is stated in the prompt itself and
    verified afterwards by hallucination_check(). A section whose rewrite
    fails is left untouched, as are sections with no gaps routed to them.

    Args:
        resume: Structured resume dict (from resume_parser.parse_resume).
        prioritized_gaps: Output of prioritize_gaps().
        job: Job dict with at minimum {"title": str, "company": str, "description": str}.
        candidate_voice: Free-text personality/style note from user.yaml (may be empty).

    Returns:
        New resume dict (same structure as input) with rewritten sections.
        Sections with no relevant gaps are copied through unchanged.
    """
    from scripts.llm_router import LLMRouter
    router = LLMRouter()

    # Bucket the gap terms by the section they should be injected into.
    section_terms: dict[str, list[str]] = {}
    for gap in prioritized_gaps:
        section_terms.setdefault(gap["section"], []).append(gap["term"])

    updated = dict(resume)  # shallow copy — sections replaced below

    for section, terms in section_terms.items():
        quoted_terms = ", ".join(f'"{t}"' for t in terms)
        original_content = _section_text_for_prompt(resume, section)

        if candidate_voice:
            voice_note = (
                f'\n\nCandidate voice/style: "{candidate_voice}". '
                "Preserve this authentic tone — do not write generically."
            )
        else:
            voice_note = ""

        prompt = (
            f"You are rewriting the **{section}** section of a resume to help it pass "
            f"ATS (applicant tracking system) screening for this role:\n"
            f" Job title: {job.get('title', 'Unknown')}\n"
            f" Company: {job.get('company', 'Unknown')}\n\n"
            f"Inject these missing ATS keywords naturally into the section:\n"
            f" {quoted_terms}\n\n"
            f"CRITICAL RULES — violating any of these invalidates the rewrite:\n"
            f"1. Do NOT invent new employers, job titles, dates, or education.\n"
            f"2. Do NOT add skills the candidate did not already demonstrate.\n"
            f"3. Only rephrase existing content — replace vague verbs/nouns with the "
            f" ATS-preferred equivalents listed above.\n"
            f"4. Keep the same number of bullet points in experience entries.\n"
            f"5. Return ONLY the rewritten section content, no labels or explanation."
            f"{voice_note}\n\n"
            f"Original {section} section:\n{original_content}"
        )

        try:
            reply = router.complete(prompt)
            updated = _apply_section_rewrite(updated, section, reply.strip())
        except Exception:
            log.warning("[resume_optimizer] rewrite failed for section %r", section, exc_info=True)
            # Leave this section as-is on failure.

    return updated
def _section_text_for_prompt(resume: dict[str, Any], section: str) -> str:
    """Render a resume section as plain text suitable for an LLM prompt.

    Returns "(empty)" for a present-but-blank section and
    "(unsupported section)" for section names this renderer doesn't know.
    """
    if section == "summary":
        return resume.get("career_summary", "") or "(empty)"
    if section == "skills":
        skills = resume.get("skills", [])
        return ", ".join(skills) if skills else "(empty)"
    if section == "experience":
        lines: list[str] = []
        for exp in resume.get("experience", []):
            # FIX: the rest of the module tolerates partially-populated entries
            # via .get(), but this used exp['title'] etc. and raised KeyError
            # on any entry missing a field. Default to empty strings instead.
            lines.append(
                f"{exp.get('title', '')} at {exp.get('company', '')} "
                f"({exp.get('start_date', '')}–{exp.get('end_date', '')})"
            )
            for b in exp.get("bullets", []):
                lines.append(f"  • {b}")
        return "\n".join(lines) if lines else "(empty)"
    return "(unsupported section)"
def _apply_section_rewrite(resume: dict[str, Any], section: str, rewritten: str) -> dict[str, Any]:
    """Return a new resume dict with the given section replaced by rewritten text.

    Unknown section names leave the content unchanged (a fresh shallow copy is
    still returned).
    """
    updated = dict(resume)
    if section == "summary":
        updated["career_summary"] = rewritten
    elif section == "skills":
        # LLM returns comma-separated or newline-separated skills
        skills = [s.strip() for s in re.split(r"[,\n•·]+", rewritten) if s.strip()]
        updated["skills"] = skills
    elif section == "experience":
        # Keep the structured entries but replace the bullets: the LLM rewrites
        # the whole section as plain text, so bullets are re-parsed out of it.
        # FIX: resume["experience"] raised KeyError when the resume had no
        # experience section; default to an empty list instead.
        updated["experience"] = _reparse_experience_bullets(resume.get("experience") or [], rewritten)
    return updated
def _reparse_experience_bullets(
    original_entries: list[dict],
    rewritten_text: str,
) -> list[dict]:
    """Re-associate rewritten bullet text with the original experience entries.

    The LLM rewrites the section as one block of text. We slice that block at
    each subsequent entry's header (matched by job title, case-insensitive) so
    bullets re-bind to the entry they belong to. An entry whose slice yields
    no bullet lines keeps its original bullets.

    Args:
        original_entries: Structured experience entries before the rewrite.
        rewritten_text: Plain-text section rewrite returned by the LLM.

    Returns:
        New list of entry dicts; only the "bullets" field may differ.
    """
    if not original_entries:
        return original_entries

    # Lines starting with a common bullet glyph are treated as bullet points.
    bullet_pat = re.compile(r"^[•\-–—*◦▪▸►]\s*")

    result: list[dict] = []
    remaining = rewritten_text

    for i, entry in enumerate(original_entries):
        # Find where the next entry starts so we can slice out this entry's bullets
        if i + 1 < len(original_entries):
            # FIX: tolerate a missing/empty next title. The original indexed
            # ["title"] directly (KeyError on partial entries), and an empty
            # title made re.search match at offset 0, truncating this entry's
            # chunk to "". Treat empty as "no split point found".
            next_title = original_entries[i + 1].get("title") or ""
            m = re.search(re.escape(next_title), remaining, re.IGNORECASE) if next_title else None
            chunk = remaining[:m.start()] if m else remaining
            remaining = remaining[m.start():] if m else ""
        else:
            chunk = remaining

        bullets: list[str] = []
        for line in chunk.splitlines():
            stripped = line.strip()
            if bullet_pat.match(stripped):
                # FIX: strip BEFORE removing the glyph. The original applied
                # re.sub to the raw (indented) line, where "^" never matched,
                # so indented bullets kept their leading "•".
                bullets.append(bullet_pat.sub("", stripped).strip())

        new_entry = dict(entry)
        # FIX: .get() fallback — entries without a "bullets" key raised
        # KeyError whenever the rewrite produced no bullet lines for them.
        new_entry["bullets"] = bullets if bullets else entry.get("bullets", [])
        result.append(new_entry)

    return result
# ── Hallucination guard ───────────────────────────────────────────────────────

def hallucination_check(original: dict[str, Any], rewritten: dict[str, Any]) -> bool:
    """Return True if the rewrite is safe (no fabricated facts detected).

    Compares the factual anchors (employers, job titles, date ranges,
    institutions) of the rewritten resume against the original's: every anchor
    in the rewrite must already exist in the original. Any anchor appearing
    only in the rewrite signals fabrication.

    Args:
        original: Structured resume dict before rewrite.
        rewritten: Structured resume dict after rewrite.

    Returns:
        True  → rewrite is safe to use
        False → hallucination detected; caller should fall back to original
    """
    fabricated = _extract_anchors(rewritten) - _extract_anchors(original)
    if not fabricated:
        return True

    log.warning(
        "[resume_optimizer] hallucination_check FAILED — new anchors in rewrite: %s",
        fabricated,
    )
    return False
def _extract_anchors(resume: dict[str, Any]) -> frozenset[str]:
    """Extract stable factual anchors (company, title, dates) from experience entries."""
    # Normalized (stripped, lowercased) non-empty values only.
    experience_anchors = {
        anchor
        for job in resume.get("experience", [])
        for field in ("company", "title", "start_date", "end_date")
        if (anchor := (job.get(field) or "").strip().lower())
    }
    education_anchors = {
        anchor
        for school in resume.get("education", [])
        if (anchor := (school.get("institution") or "").strip().lower())
    }
    return frozenset(experience_anchors | education_anchors)
# ── Resume → plain text renderer ─────────────────────────────────────────────

def render_resume_text(resume: dict[str, Any]) -> str:
    """Render a structured resume dict back to formatted plain text for PDF export."""
    out: list[str] = []

    # Contact line: name / email / phone, skipping whichever are blank.
    contact = [resume.get("name", ""), resume.get("email", ""), resume.get("phone", "")]
    out.append(" ".join(field for field in contact if field))
    out.append("")

    if resume.get("career_summary"):
        out.extend(["SUMMARY", resume["career_summary"], ""])

    if resume.get("experience"):
        out.append("EXPERIENCE")
        for role in resume["experience"]:
            out.append(
                f"{role.get('title', '')} | {role.get('company', '')} "
                f"({role.get('start_date', '')}–{role.get('end_date', '')})"
            )
            out.extend(f"  • {item}" for item in role.get("bullets", []))
        out.append("")

    if resume.get("education"):
        out.append("EDUCATION")
        for school in resume["education"]:
            out.append(
                f"{school.get('degree', '')} {school.get('field', '')} | "
                f"{school.get('institution', '')} {school.get('graduation_year', '')}"
            )
        out.append("")

    if resume.get("skills"):
        out.extend(["SKILLS", ", ".join(resume["skills"]), ""])

    if resume.get("achievements"):
        out.append("ACHIEVEMENTS")
        out.extend(f"  • {note}" for note in resume["achievements"])
        out.append("")

    return "\n".join(out)