# peregrine/scripts/resume_parser.py

"""
Resume parser — extract text from PDF/DOCX and structure via LLM.

Fast path: file bytes → raw text → LLM structures into resume dict.
Result dict keys mirror plain_text_resume.yaml sections.

Falls back to empty dict on any LLM/parsing error — caller should
then show the guided form builder.
"""
from __future__ import annotations

import io
import json
import logging
import re

import pdfplumber
from docx import Document


def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    The bytes are opened in memory with pdfplumber. A page whose
    ``extract_text()`` result is falsy (``None`` or ``""``) contributes an
    empty string, so the output still has one entry per page.
    """
    page_texts = []
    stream = io.BytesIO(file_bytes)
    with pdfplumber.open(stream) as document:
        for page in document.pages:
            extracted = page.extract_text()
            # Keep a placeholder for pages that produced no text.
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)


def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the non-blank paragraphs of a DOCX file, joined by newlines.

    The bytes are loaded in memory with python-docx. Text is taken from
    ``Document.paragraphs`` only; paragraphs that are empty or contain just
    whitespace are dropped.
    """
    document = Document(io.BytesIO(file_bytes))
    kept = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            kept.append(text)
    return "\n".join(kept)


def _llm_structure(raw_text: str) -> str:
    """Ask the LLM to turn raw resume text into JSON.

    Builds a fixed instruction prompt listing the required JSON keys,
    appends at most the first 6000 characters of ``raw_text``, and sends it
    through ``LLMRouter.complete``. The router's output is returned as-is;
    no parsing or validation happens here.
    """
    # Imported lazily, inside the function, as in the original code.
    from scripts.llm_router import LLMRouter

    instruction_lines = [
        "You are a resume parser. Convert the following resume text into a JSON object.\n\n",
        "Required JSON keys:\n",
        "- name (string)\n",
        "- email (string, may be empty)\n",
        "- phone (string, may be empty)\n",
        "- career_summary (string: 2-4 sentence professional summary)\n",
        "- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n",
        "- education (list of objects with: institution, degree, field, graduation_year)\n",
        "- skills (list of strings)\n",
        "- achievements (list of strings, may be empty)\n\n",
        "Return ONLY valid JSON. No markdown, no explanation.\n\n",
    ]
    instructions = "".join(instruction_lines)
    # Only the leading slice of the resume is included in the prompt.
    excerpt = raw_text[:6000]
    prompt = f"{instructions}Resume text:\n{excerpt}"
    return LLMRouter().complete(prompt)


def structure_resume(raw_text: str) -> dict:
    """Convert raw resume text to a structured dict via LLM.

    Args:
        raw_text: Plain text extracted from the resume file.

    Returns:
        The parsed resume as a dict. An empty dict is returned when the
        input is blank, when the LLM call or JSON parsing raises, or when
        the LLM output is valid JSON but not a JSON object — caller should
        fall back to the form builder.
    """
    logger = logging.getLogger(__name__)

    # Nothing to structure (e.g. extraction produced no text); skip the LLM
    # call rather than asking it to parse an empty resume.
    if not raw_text or not raw_text.strip():
        return {}

    try:
        raw = _llm_structure(raw_text)
        # Strip markdown code fences if present
        raw = re.sub(r"^```(?:json)?\s*", "", raw.strip())
        raw = re.sub(r"\s*```$", "", raw)
        parsed = json.loads(raw)
    except Exception:
        # Deliberate best-effort boundary: any LLM/parsing failure degrades
        # to the empty-dict fallback, but is logged so it can be diagnosed.
        logger.exception("Resume structuring failed; returning empty result")
        return {}

    # json.loads can legally return a list, string, number, etc.; only a
    # JSON object satisfies this function's dict contract.
    if not isinstance(parsed, dict):
        logger.warning(
            "LLM returned JSON of type %s instead of an object; returning empty result",
            type(parsed).__name__,
        )
        return {}
    return parsed