# peregrine/scripts/resume_optimizer.py

"""
ATS Resume Optimizer — rewrite a candidate's resume to maximize keyword match
for a specific job description without fabricating experience.

Tier behaviour:
  Free   → gap report only  (extract_jd_signals + prioritize_gaps, no LLM rewrite)
  Paid   → full LLM rewrite targeting the JD (rewrite_for_ats)
  Premium → same as paid for now; fine-tuned voice model is a future enhancement

Pipeline:
  job.description
      → extract_jd_signals()    # TF-IDF gaps + LLM-extracted ATS signals
      → prioritize_gaps()       # rank by impact, map to resume sections
      → rewrite_for_ats()       # per-section LLM rewrite (paid+)
      → hallucination_check()   # reject rewrites that invent new experience
"""
from __future__ import annotations

import json
import logging
import re
from pathlib import Path
from typing import Any

# Module-level logger; every message emitted in this file is prefixed with
# "[resume_optimizer]" so it can be filtered in shared log output.
log = logging.getLogger(__name__)

# ── Signal extraction ─────────────────────────────────────────────────────────

def extract_jd_signals(description: str, resume_text: str = "") -> list[str]:
    """Collect ATS keyword signals for a job description.

    Signals are gathered from two independent, best-effort sources:

      * ``scripts.match.match_score`` — the second element of its return value
        is taken as the TF-IDF keyword-gap list. Only consulted when
        ``resume_text`` is non-empty.
      * ``LLMRouter().complete`` — prompted to return a JSON array of ATS
        keywords/phrases for the first 3000 characters of the description.

    Any failure inside a source (import error, call error, unparsable JSON)
    is logged as a warning and that source contributes nothing, so the
    function degrades to whichever source still works.

    Args:
        description: Raw job description text.
        resume_text: Candidate's resume text; when empty, the TF-IDF gap step
            is skipped.

    Returns:
        Signals with case-insensitive duplicates removed. LLM signals are
        listed first, then TF-IDF gaps not already seen; the first spelling
        encountered for a given lowercase key is the one kept.
    """
    # Source 1: deterministic keyword gaps — needs resume text to diff against.
    gap_terms: list[str] = []
    if resume_text:
        try:
            from scripts.match import match_score
            _, gap_terms = match_score(resume_text, description)
        except Exception:
            log.warning("[resume_optimizer] TF-IDF gap extraction failed", exc_info=True)

    # Source 2: LLM extraction, which can surface phrasing/qualifier nuance.
    llm_terms: list[str] = []
    try:
        from scripts.llm_router import LLMRouter
        instructions = (
            "Extract the most important ATS (applicant tracking system) keywords and "
            "phrases from this job description. Focus on:\n"
            "- Required skills and technologies (exact phrasing matters)\n"
            "- Action verbs used to describe responsibilities\n"
            "- Qualification signals ('required', 'must have', 'preferred')\n"
            "- Industry-specific terminology\n\n"
            "Return a JSON array of strings only. No explanation.\n\n"
            f"Job description:\n{description[:3000]}"
        )
        reply = LLMRouter().complete(instructions)
        # The model may wrap the array in markdown or prose; grab the widest
        # "[ ... ]" span and let json.loads decide whether it is valid.
        array_span = re.search(r"\[.*\]", reply, re.DOTALL)
        if array_span is not None:
            decoded = json.loads(array_span.group(0))
            llm_terms = [
                item.strip()
                for item in decoded
                if isinstance(item, str) and item.strip()
            ]
    except Exception:
        log.warning("[resume_optimizer] LLM signal extraction failed", exc_info=True)

    # Merge with LLM signals first; dicts keep insertion order, and setdefault
    # keeps the first spelling seen for each lowercase key.
    first_seen: dict[str, str] = {}
    for signal in llm_terms + gap_terms:
        first_seen.setdefault(signal.lower(), signal)
    return list(first_seen.values())


# ── Gap prioritization ────────────────────────────────────────────────────────

# Keyword lists, keyed by resume section, that prioritize_gaps() uses to decide
# where a missing term would have the most ATS impact. The author's rationale
# for the section weighting:
#   skills     — direct keyword match, highest density, indexed first
#   summary    — executive summary keywords often boost overall relevance score
#   experience — verbs + outcomes in bullet points; adds context weight
# Only "skills" and "summary" have lists here; prioritize_gaps() routes a term
# that matches neither list to "experience". Entries are lowercase because they
# are compared against lowercased gap terms.
_SECTION_KEYWORDS: dict[str, list[str]] = {
    "skills": [
        "python", "sql", "java", "typescript", "react", "vue", "docker",
        "kubernetes", "aws", "gcp", "azure", "terraform", "ci/cd", "git",
        "postgresql", "redis", "kafka", "spark", "tableau", "salesforce",
        "jira", "figma", "excel", "powerpoint", "machine learning", "llm",
        "deep learning", "pytorch", "tensorflow", "scikit-learn",
    ],
    "summary": [
        "leadership", "strategy", "vision", "executive", "director", "vp",
        "growth", "transformation", "stakeholder", "cross-functional",
        "p&l", "revenue", "budget", "board", "c-suite",
    ],
}


def prioritize_gaps(gaps: list[str], resume_sections: dict[str, Any]) -> list[dict]:
    """Assign each missing keyword a target resume section and a priority.

    A gap is dropped when its lowercase form already occurs, as a substring,
    anywhere in the flattened resume text. Each remaining gap is classified by
    the first rule that applies:

      1. overlaps an entry of ``_SECTION_KEYWORDS["skills"]``  → "skills", priority 1
      2. overlaps an entry of ``_SECTION_KEYWORDS["summary"]`` → "summary", priority 1
      3. has more than one whitespace-separated word           → "experience", priority 2
      4. anything else                                         → "experience", priority 3

    "Overlaps" is a case-insensitive substring test in either direction
    (keyword inside term, or term inside keyword).

    Args:
        gaps: Missing keyword signals, e.g. from extract_jd_signals().
        resume_sections: Structured resume dict from resume_parser.parse_resume().

    Returns:
        List of dicts ordered from priority 1 to priority 3; gaps with equal
        priority keep their input order:
            {
              "term": str,          # the keyword/phrase to inject
              "section": str,       # target resume section ("skills", "summary", "experience")
              "priority": int,      # 1=high, 2=medium, 3=low
              "rationale": str,     # why this section was chosen
            }
    """
    resume_blob = _flatten_resume_text(resume_sections).lower()

    def _overlaps(needle: str, bucket: str) -> bool:
        # Substring match both ways: handles "AWS Lambda" vs "aws" as well as
        # a short term contained in a longer keyword.
        # NOTE(review): plain substring matching can also hit unrelated words
        # (e.g. "excel" inside "excellent") — consider token-boundary matching.
        return any(kw in needle or needle in kw for kw in _SECTION_KEYWORDS[bucket])

    ranked: list[dict] = []
    for term in gaps:
        needle = term.lower()

        # Already present somewhere in the resume — nothing to inject.
        if needle in resume_blob:
            continue

        # REVIEW: _SECTION_KEYWORDS lists are tech-centric; domain-specific roles
        # (creative, healthcare, operations) may over-route to experience.
        # Consider expanding the lists or making them config-driven.
        if _overlaps(needle, "skills"):
            placement = (
                "skills",
                1,
                "matched technical skills list — highest ATS keyword density",
            )
        elif _overlaps(needle, "summary"):
            placement = (
                "summary",
                1,
                "matched leadership/executive signals — boosts overall relevance score",
            )
        elif len(term.split()) > 1:
            placement = (
                "experience",
                2,
                "multi-word phrase — more specific than single keywords, context weight in bullets",
            )
        else:
            placement = (
                "experience",
                3,
                "single generic term — lowest ATS impact, added to experience for coverage",
            )

        target_section, rank, reason = placement
        ranked.append({
            "term":      term,
            "section":   target_section,
            "priority":  rank,
            "rationale": reason,
        })

    # sorted() is stable, so equal-priority gaps keep their incoming order.
    return sorted(ranked, key=lambda item: item["priority"])


def _flatten_resume_text(resume: dict[str, Any]) -> str:
    """Concatenate all text from a structured resume dict into one searchable string."""
    parts: list[str] = []
    parts.append(resume.get("career_summary", "") or "")
    parts.extend(resume.get("skills", []))
    for exp in resume.get("experience", []):
        parts.append(exp.get("title", ""))
        parts.append(exp.get("company", ""))
        parts.extend(exp.get("bullets", []))
    for edu in resume.get("education", []):
        parts.append(edu.get("degree", ""))
        parts.append(edu.get("field", ""))
        parts.append(edu.get("institution", ""))
    parts.extend(resume.get("achievements", []))
    return " ".join(parts)


# ── LLM rewrite ───────────────────────────────────────────────────────────────

def rewrite_for_ats(
    resume: dict[str, Any],
    prioritized_gaps: list[dict],
    job: dict[str, Any],
    candidate_voice: str = "",
) -> dict[str, Any]:
    """Rewrite resume sections to naturally incorporate ATS keyword gaps.

    Operates section-by-section. For each target section in prioritized_gaps,
    builds a focused prompt that injects only the gaps destined for that section.
    The hallucination constraint is enforced in the prompt itself; callers are
    expected to verify the result post-hoc with hallucination_check().

    A section is left untouched when:
      * it has no original content or is not a supported section — there is
        nothing to rephrase, and asking the model to fill an empty section
        would invite invented content;
      * the model returns a blank response;
      * anything raises while rendering, calling the model, or applying the
        rewrite (logged as a warning).

    Args:
        resume: Structured resume dict (from resume_parser.parse_resume).
        prioritized_gaps: Output of prioritize_gaps(); each item needs
            "section" and "term" keys.
        job: Job dict with at minimum {"title": str, "company": str, "description": str}.
        candidate_voice: Free-text personality/style note from user.yaml (may be empty).

    Returns:
        New resume dict (same structure as input) with rewritten sections.
        Sections with no relevant gaps are copied through unchanged.

    Raises:
        ImportError: if ``scripts.llm_router`` cannot be imported.
    """
    from scripts.llm_router import LLMRouter
    router = LLMRouter()

    # Placeholders _section_text_for_prompt() returns when there is nothing to rewrite.
    nothing_to_rewrite = ("(empty)", "(unsupported section)")

    # Group gaps by target section
    by_section: dict[str, list[str]] = {}
    for gap in prioritized_gaps:
        by_section.setdefault(gap["section"], []).append(gap["term"])

    # The voice note does not depend on the section, so build it once.
    voice_note = (
        f'\n\nCandidate voice/style: "{candidate_voice}". '
        "Preserve this authentic tone — do not write generically."
    ) if candidate_voice else ""

    rewritten = dict(resume)  # shallow copy — sections replaced below

    for section, terms in by_section.items():
        try:
            # Rendering is inside the try so one malformed entry only skips
            # this section instead of aborting the whole rewrite.
            original_content = _section_text_for_prompt(resume, section)
            if original_content in nothing_to_rewrite:
                log.info(
                    "[resume_optimizer] skipping section %r — no original content to rephrase",
                    section,
                )
                continue

            terms_str = ", ".join(f'"{t}"' for t in terms)
            prompt = (
                f"You are rewriting the **{section}** section of a resume to help it pass "
                f"ATS (applicant tracking system) screening for this role:\n"
                f"  Job title: {job.get('title', 'Unknown')}\n"
                f"  Company: {job.get('company', 'Unknown')}\n\n"
                f"Inject these missing ATS keywords naturally into the section:\n"
                f"  {terms_str}\n\n"
                f"CRITICAL RULES — violating any of these invalidates the rewrite:\n"
                f"1. Do NOT invent new employers, job titles, dates, or education.\n"
                f"2. Do NOT add skills the candidate did not already demonstrate.\n"
                f"3. Only rephrase existing content — replace vague verbs/nouns with the "
                f"   ATS-preferred equivalents listed above.\n"
                f"4. Keep the same number of bullet points in experience entries.\n"
                f"5. Return ONLY the rewritten section content, no labels or explanation."
                f"{voice_note}\n\n"
                f"Original {section} section:\n{original_content}"
            )

            result = (router.complete(prompt) or "").strip()
            if not result:
                # A blank reply must not overwrite real content.
                log.warning(
                    "[resume_optimizer] empty rewrite for section %r — keeping original",
                    section,
                )
                continue
            rewritten = _apply_section_rewrite(rewritten, section, result)
        except Exception:
            log.warning("[resume_optimizer] rewrite failed for section %r", section, exc_info=True)
            # Leave section unchanged on failure

    return rewritten


def _section_text_for_prompt(resume: dict[str, Any], section: str) -> str:
    """Render a resume section as plain text suitable for an LLM prompt."""
    if section == "summary":
        return resume.get("career_summary", "") or "(empty)"
    if section == "skills":
        skills = resume.get("skills", [])
        return ", ".join(skills) if skills else "(empty)"
    if section == "experience":
        lines: list[str] = []
        for exp in resume.get("experience", []):
            lines.append(f"{exp['title']} at {exp['company']} ({exp['start_date']}–{exp['end_date']})")
            for b in exp.get("bullets", []):
                lines.append(f"  • {b}")
        return "\n".join(lines) if lines else "(empty)"
    return "(unsupported section)"


def _apply_section_rewrite(resume: dict[str, Any], section: str, rewritten: str) -> dict[str, Any]:
    """Return a new resume dict with the given section replaced by rewritten text."""
    updated = dict(resume)
    if section == "summary":
        updated["career_summary"] = rewritten
    elif section == "skills":
        # LLM returns comma-separated or newline-separated skills
        skills = [s.strip() for s in re.split(r"[,\n•·]+", rewritten) if s.strip()]
        updated["skills"] = skills
    elif section == "experience":
        # For experience, we keep the structured entries but replace the bullets.
        # The LLM rewrites the whole section as plain text; we re-parse the bullets.
        updated["experience"] = _reparse_experience_bullets(resume["experience"], rewritten)
    return updated


def _reparse_experience_bullets(
    original_entries: list[dict],
    rewritten_text: str,
) -> list[dict]:
    """Re-associate rewritten bullet text with the original experience entries.

    The LLM rewrites the section as a block of text. We split on the original
    entry headers (title + company) to re-bind bullets to entries. Falls back
    to the original entries if splitting fails.
    """
    if not original_entries:
        return original_entries

    result: list[dict] = []
    remaining = rewritten_text

    for i, entry in enumerate(original_entries):
        # Find where the next entry starts so we can slice out this entry's bullets
        if i + 1 < len(original_entries):
            next_title = original_entries[i + 1]["title"]
            # Look for the next entry header in the remaining text
            split_pat = re.escape(next_title)
            m = re.search(split_pat, remaining, re.IGNORECASE)
            chunk = remaining[:m.start()] if m else remaining
            remaining = remaining[m.start():] if m else ""
        else:
            chunk = remaining

        bullets = [
            re.sub(r"^[•\-–—*◦▪▸►]\s*", "", line).strip()
            for line in chunk.splitlines()
            if re.match(r"^[•\-–—*◦▪▸►]\s*", line.strip())
        ]
        new_entry = dict(entry)
        new_entry["bullets"] = bullets if bullets else entry["bullets"]
        result.append(new_entry)

    return result


# ── Hallucination guard ───────────────────────────────────────────────────────

def hallucination_check(original: dict[str, Any], rewritten: dict[str, Any]) -> bool:
    """Report whether a rewritten resume introduces no new factual anchors.

    Anchors are the values collected by _extract_anchors(): experience
    company, title, start/end dates and education institution. The rewrite is
    considered safe when every anchor it contains also exists in the original;
    anchors that disappear are not treated as a problem.

    NOTE(review): only those anchor fields are compared — changes to skills,
    summary or bullet text are not inspected by this check.

    Args:
        original: Structured resume dict before rewrite.
        rewritten: Structured resume dict after rewrite.

    Returns:
        True  → no new anchors; rewrite is safe to use
        False → at least one new anchor (logged as a warning); caller should
                fall back to the original
    """
    baseline = _extract_anchors(original)
    fabricated = _extract_anchors(rewritten) - baseline

    if not fabricated:
        return True

    log.warning(
        "[resume_optimizer] hallucination_check FAILED — new anchors in rewrite: %s",
        fabricated,
    )
    return False


def _extract_anchors(resume: dict[str, Any]) -> frozenset[str]:
    """Extract stable factual anchors (company, title, dates) from experience entries."""
    anchors: set[str] = set()
    for exp in resume.get("experience", []):
        for field in ("company", "title", "start_date", "end_date"):
            val = (exp.get(field) or "").strip().lower()
            if val:
                anchors.add(val)
    for edu in resume.get("education", []):
        val = (edu.get("institution") or "").strip().lower()
        if val:
            anchors.add(val)
    return frozenset(anchors)


# ── Resume → plain text renderer ─────────────────────────────────────────────

def render_resume_text(resume: dict[str, Any]) -> str:
    """Turn a structured resume dict into formatted plain text.

    Output layout, top to bottom: a contact line (non-empty name / email /
    phone joined by two spaces) and a blank line, then the SUMMARY,
    EXPERIENCE, EDUCATION, SKILLS and ACHIEVEMENTS blocks. A block is emitted
    only when its section is non-empty, and each emitted block ends with a
    blank line.
    """
    out: list[str] = []

    contact_fields = (resume.get("name", ""), resume.get("email", ""), resume.get("phone", ""))
    out += ["  ".join(value for value in contact_fields if value), ""]

    summary = resume.get("career_summary")
    if summary:
        out += ["SUMMARY", summary, ""]

    jobs = resume.get("experience")
    if jobs:
        out.append("EXPERIENCE")
        for job in jobs:
            title = job.get("title", "")
            company = job.get("company", "")
            start = job.get("start_date", "")
            end = job.get("end_date", "")
            out.append(f"{title}  |  {company}  ({start}–{end})")
            out.extend(f"  • {bullet}" for bullet in job.get("bullets", []))
        out.append("")

    schools = resume.get("education")
    if schools:
        out.append("EDUCATION")
        for school in schools:
            degree = school.get("degree", "")
            field = school.get("field", "")
            institution = school.get("institution", "")
            year = school.get("graduation_year", "")
            out.append(f"{degree} {field}  |  {institution}  {year}")
        out.append("")

    skills = resume.get("skills")
    if skills:
        out += ["SKILLS", ", ".join(skills), ""]

    achievements = resume.get("achievements")
    if achievements:
        out.append("ACHIEVEMENTS")
        out.extend(f"  • {item}" for item in achievements)
        out.append("")

    return "\n".join(out)