refactor: replace LLM-based resume parser with section regex parser

Primary parse path is now fully deterministic — no LLM, no token limits,
no JSON generation. Handles two-column experience headers, institution-before-
or-after-degree education layouts, and header bleed prevention via
looks_like_header detection.

LLM path retained as optional career_summary enhancement only (1500 chars,
falls back silently). structure_resume() now returns tuple[dict, str].
Tests updated to match the new API.
This commit is contained in:
pyr0ball 2026-02-26 07:34:25 -08:00
parent 9297477ba0
commit b9f5dd1fc3
2 changed files with 312 additions and 81 deletions

View file

@ -1,86 +1,306 @@
""" """
Resume parser extract text from PDF/DOCX and structure via LLM. Resume parser extract text from PDF/DOCX and structure via section parsing.
Fast path: file bytes raw text LLM structures into resume dict. Primary path: regex + section detection (no LLM, no token limits).
Result dict keys mirror plain_text_resume.yaml sections. Optional enhancement: LLM-generated career_summary if a capable backend is configured.
Falls back to empty dict on any LLM/parsing error caller should Falls back to empty dict on unrecoverable errors caller shows the form builder.
then show the guided form builder.
""" """
from __future__ import annotations from __future__ import annotations
import io import io
import json import json
import logging import logging
import re import re
from pathlib import Path
import pdfplumber import pdfplumber
from docx import Document from docx import Document
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# ── Section header detection ──────────────────────────────────────────────────
_SECTION_NAMES = {
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I),
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I),
"education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I),
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I),
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I),
}
# Degrees — used to detect education lines
_DEGREE_RE = re.compile(
r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?|bachelor|master|associate|doctorate|diploma)\b",
re.I,
)
# Date patterns for experience entries: "Jan 2020", "2020", "01/2020", "2019 - 2022"
_DATE_RE = re.compile(
r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|"
r"july|august|september|october|november|december)?\s*\d{4}\b"
r"|\b\d{1,2}/\d{4}\b",
re.I,
)
_DATE_RANGE_RE = re.compile(
r"("
r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
r"|\d{1,2}/\d{4}"
r"|\d{4}"
r")"
r"\s*[-—to]+\s*"
r"("
r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
r"|\d{1,2}/\d{4}"
r"|\d{4}"
r"|present|current|now"
r")",
re.I,
)
# Contact info
_EMAIL_RE = re.compile(r"[\w.+\-]+@[\w\-]+\.[\w.\-]+")
_PHONE_RE = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
# ── Text extraction ───────────────────────────────────────────────────────────

def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract raw text from PDF bytes using pdfplumber.

    Pages that yield no text contribute an empty string, so the result is
    always a str (possibly empty for image-only PDFs).
    """
    buffer = io.BytesIO(file_bytes)
    with pdfplumber.open(buffer) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(pages)
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract raw text from DOCX bytes using python-docx.

    Blank paragraphs are skipped; remaining paragraph texts are newline-joined.
    """
    paragraphs = Document(io.BytesIO(file_bytes)).paragraphs
    return "\n".join(p.text for p in paragraphs if p.text.strip())
# ── Section splitter ──────────────────────────────────────────────────────────

def _split_sections(text: str) -> dict[str, list[str]]:
    """Split resume text into named sections.

    Lines preceding any recognized section header accumulate under 'header'
    (assumed to be the contact/name block). A line counts as a header only
    when it matches a known pattern AND is short (<= 5 words), which keeps
    body sentences that merely start with "experience" etc. from bleeding in.
    """
    sections: dict[str, list[str]] = {"header": []}
    bucket = "header"
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        header_hit = next(
            (
                name
                for name, pattern in _SECTION_NAMES.items()
                if pattern.match(line) and len(line.split()) <= 5
            ),
            None,
        )
        if header_hit:
            bucket = header_hit
        else:
            sections.setdefault(bucket, []).append(line)
    return sections
# ── Contact info ──────────────────────────────────────────────────────────────

def _parse_header(lines: list[str]) -> dict:
    """Extract name, email, phone from the top-of-resume block.

    Name heuristic: the first of the top 5 lines that contains no '@', does
    not start with a digit, does not look like a city/state/zip line, and —
    after delimiter cleanup — is 2-5 purely alphabetic words.
    """
    blob = "\n".join(lines)
    email_match = _EMAIL_RE.search(blob)
    phone_match = _PHONE_RE.search(blob)

    name = ""
    for line in lines[:5]:
        if "@" in line or re.match(r"^\d", line.strip()):
            continue
        # Skip lines that look like city/state/zip
        if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line):
            continue
        candidate = re.sub(r"[|•·,]+", " ", line).strip()
        candidate = re.sub(r"\s{2,}", " ", candidate)
        word_count = len(candidate.split())
        if 2 <= word_count <= 5 and candidate.replace(" ", "").isalpha():
            name = candidate
            break

    return {
        "name": name,
        "email": email_match.group(0) if email_match else "",
        "phone": phone_match.group(0) if phone_match else "",
    }
# ── Experience ────────────────────────────────────────────────────────────────

def _parse_experience(lines: list[str]) -> list[dict]:
    """Parse work experience entries from section lines.

    Handles two common layouts:
      (A) "Title | Company" on one line, dates on the next;
      (B) "Title | Company | Dates" all on one line.
    A date-range line always starts a new entry; subsequent lines are bullets
    unless they look like the next entry's "Title | Company" header.
    """
    bullet_prefix = re.compile(r"^[•\-–—*◦▪▸►]\s*")
    jobs: list[dict] = []
    entry: dict | None = None
    pending = ""  # most recent non-date line; layout-A title/company candidate

    for raw in lines:
        dates = _DATE_RANGE_RE.search(raw)
        if dates:
            if entry:
                jobs.append(entry)
            # Title/company may share this line (layout B) or be `pending` (layout A)
            leftover = _DATE_RANGE_RE.sub("", raw).strip(" –—|-•")
            source = leftover if leftover.strip() else pending
            pieces = re.split(r"\s{2,}|[|•·,–—]\s*", source.strip(), maxsplit=1)
            entry = {
                "title": pieces[0].strip() if pieces else "",
                "company": pieces[1].strip() if len(pieces) > 1 else "",
                "start_date": dates.group(1),
                "end_date": dates.group(2),
                "bullets": [],
            }
            pending = ""
            continue
        if entry is not None:
            bulleted = bool(bullet_prefix.match(raw))
            if not bulleted and " | " in raw and not _DATE_RE.search(raw):
                # Likely the title/company of the next entry — hold it as pending
                pending = raw
                continue
            text = bullet_prefix.sub("", raw).strip()
            if text:
                entry["bullets"].append(text)
        pending = raw

    if entry:
        jobs.append(entry)
    return jobs
# ── Education ─────────────────────────────────────────────────────────────────

def _parse_education(lines: list[str]) -> list[dict]:
    """Parse education entries from section lines.

    A line containing a degree keyword starts a new entry. The institution
    may appear on the line BEFORE the degree line (layout A) or on the line
    AFTER it (layout B); both are supported.
    """
    entries: list[dict] = []
    current: dict | None = None
    prev_line = ""
    for line in lines:
        degree_m = _DEGREE_RE.search(line)
        if degree_m:
            if current:
                entries.append(current)
            current = {
                "institution": "",
                "degree": degree_m.group(0).upper(),
                "field": "",
                "graduation_year": "",
            }
            year_m = re.search(r"\b(19|20)\d{2}\b", line)
            if year_m:
                current["graduation_year"] = year_m.group(0)
            # Field of study = the line minus degree, dates, and year tokens.
            remainder = _DEGREE_RE.sub("", _DATE_RE.sub("", line))
            remainder = re.sub(r"\b(19|20)\d{2}\b", "", remainder)
            current["field"] = remainder.strip(" ,–—|•.")
            # Layout A: institution was on the line before the degree line
            if prev_line and not _DEGREE_RE.search(prev_line):
                current["institution"] = prev_line.strip(" ,–—|•")
            prev_line = line.strip()
        elif current is not None and not current["institution"]:
            # Layout B: institution follows the degree line
            clean = line.strip(" ,–—|•")
            if clean:
                current["institution"] = clean
                # Fix: this line is consumed as THIS entry's institution —
                # clear prev_line so the next degree line doesn't also claim
                # it via the layout-A branch above.
                prev_line = ""
            else:
                prev_line = line.strip()
        else:
            prev_line = line.strip()
    if current:
        entries.append(current)
    return entries
# ── Skills ────────────────────────────────────────────────────────────────────
def _parse_skills(lines: list[str]) -> list[str]:
skills: list[str] = []
for line in lines:
# Split on common delimiters
for item in re.split(r"[,|•·/]+", line):
clean = item.strip(" -–—*◦▪▸►()")
if 1 < len(clean) <= 50:
skills.append(clean)
return skills
# ── Main parser ───────────────────────────────────────────────────────────────

def parse_resume(raw_text: str) -> tuple[dict, str]:
    """Parse resume text into a structured dict using section detection + regex.

    Returns (result_dict, error_message). result_dict is empty on failure.
    """
    if not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."
    try:
        sections = _split_sections(raw_text)
        parsed = dict(_parse_header(sections.get("header", [])))
        parsed["career_summary"] = " ".join(sections.get("summary", []))
        parsed["experience"] = _parse_experience(sections.get("experience", []))
        parsed["education"] = _parse_education(sections.get("education", []))
        parsed["skills"] = _parse_skills(sections.get("skills", []))
        parsed["achievements"] = sections.get("achievements", [])
        return parsed, ""
    except Exception as exc:
        import traceback

        log.error("[resume_parser] parse_resume error:\n%s", traceback.format_exc())
        return {}, str(exc)
# ── LLM enhancement (career summary only, optional) ──────────────────────────
def _llm_career_summary(raw_text: str) -> str:
"""Use LLM to generate a career summary. Returns empty string on any failure."""
try:
from scripts.llm_router import LLMRouter
prompt = (
"Write a 2-3 sentence professional career summary for this candidate "
"based on their resume. Return only the summary text, no labels.\n\n"
f"Resume:\n{raw_text[:1500]}"
)
return LLMRouter().complete(prompt)
except Exception:
return ""
# ── Public entry point ────────────────────────────────────────────────────────

def structure_resume(raw_text: str) -> tuple[dict, str]:
    """Parse resume and optionally enhance career_summary via LLM.

    Returns (result_dict, error_message). The LLM is consulted only when the
    document itself had no summary section, and any LLM failure is swallowed.
    """
    parsed, error = parse_resume(raw_text)
    if not parsed:
        return parsed, error
    # Enhance career summary via LLM if the section wasn't found in the document
    if not parsed.get("career_summary"):
        try:
            generated = _llm_career_summary(raw_text)
        except Exception:
            generated = ""
        if generated:
            parsed["career_summary"] = generated.strip()
    return parsed, ""

View file

@ -41,51 +41,62 @@ def test_extract_docx_returns_string():
assert "Senior Developer" in result assert "Senior Developer" in result
def test_structure_resume_returns_tuple_with_keys():
    """structure_resume returns (dict, str) tuple with expected keys from plain text."""
    raw_text = (
        "Jane Doe\njane@example.com\n\n"
        "Experience\nSoftware Engineer | Acme Corp\nJan 2020 - Dec 2023\n• Built things\n\n"
        "Skills\nPython, SQL"
    )
    from scripts.resume_parser import structure_resume

    result, err = structure_resume(raw_text)

    assert err == ""
    assert isinstance(result, dict)
    assert "experience" in result
    assert isinstance(result["experience"], list)
    assert result["name"] == "Jane Doe"
    assert result["email"] == "jane@example.com"
def test_structure_resume_empty_text_returns_error():
    """structure_resume returns empty dict + error message for empty input."""
    from scripts.resume_parser import structure_resume

    result, err = structure_resume("   ")

    assert result == {}
    assert err != ""
def test_parse_resume_contact_extraction():
    """parse_resume correctly extracts name, email, and phone from header block."""
    raw_text = (
        "Alice Smith\nalice.smith@email.com | (206) 555-9999\n\n"
        "Skills\nLeadership, Communication"
    )
    from scripts.resume_parser import parse_resume

    result, err = parse_resume(raw_text)

    assert err == ""
    assert result["name"] == "Alice Smith"
    assert result["email"] == "alice.smith@email.com"
    assert "555-9999" in result["phone"]
def test_structure_resume_llm_failure_still_returns_result():
    """structure_resume returns usable result even when LLM career summary fails."""
    raw_text = (
        "Bob Jones\nbob@test.com\n\n"
        "Skills\nProject Management, Agile"
    )
    with patch("scripts.resume_parser._llm_career_summary", side_effect=Exception("LLM down")):
        from scripts.resume_parser import structure_resume

        result, err = structure_resume(raw_text)

    # Regex parse should still succeed even if LLM summary enhancement fails
    assert err == ""
    assert result["name"] == "Bob Jones"
    assert "Project Management" in result["skills"]
def test_extract_pdf_empty_page_returns_string(): def test_extract_pdf_empty_page_returns_string():