refactor: replace LLM-based resume parser with section regex parser

Primary parse path is now fully deterministic — no LLM, no token limits, no JSON generation. Handles two-column experience headers, institution-before-or-after-degree education layouts, and header bleed prevention via looks_like_header detection. LLM path retained as optional career_summary enhancement only (1500 chars, falls back silently). structure_resume() now returns tuple[dict, str]. Tests updated to match the new API.
2026-02-26 07:34:25 -08:00 · 2026-02-26 07:34:25 -08:00 · 26563a0990
commit 26563a0990
parent c8d8434371
2 changed files with 312 additions and 81 deletions
--- a/scripts/resume_parser.py
+++ b/scripts/resume_parser.py
@ -1,86 +1,306 @@
 """
-Resume parser — extract text from PDF/DOCX and structure via LLM.
+Resume parser — extract text from PDF/DOCX and structure via section parsing.

-Fast path: file bytes → raw text → LLM structures into resume dict.
-Result dict keys mirror plain_text_resume.yaml sections.
+Primary path: regex + section detection (no LLM, no token limits).
+Optional enhancement: LLM-generated career_summary if a capable backend is configured.

-Falls back to empty dict on any LLM/parsing error — caller should
-then show the guided form builder.
+Falls back to empty dict on unrecoverable errors — caller shows the form builder.
 """
 from __future__ import annotations
+
 import io
 import json
 import logging
 import re
+from pathlib import Path

 import pdfplumber
 from docx import Document

 log = logging.getLogger(__name__)

+# ── Section header detection ──────────────────────────────────────────────────
+
+_SECTION_NAMES = {
+    "summary":    re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I),
+    "experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I),
+    "education":  re.compile(r"^(education|academic|qualifications|degrees?)", re.I),
+    "skills":     re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I),
+    "achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I),
+}
+
+# Degrees — used to detect education lines
+_DEGREE_RE = re.compile(
+    r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?|bachelor|master|associate|doctorate|diploma)\b",
+    re.I,
+)
+
+# Date patterns for experience entries: "Jan 2020", "2020", "01/2020", "2019 - 2022"
+_DATE_RE = re.compile(
+    r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|"
+    r"july|august|september|october|november|december)?\s*\d{4}\b"
+    r"|\b\d{1,2}/\d{4}\b",
+    re.I,
+)
+_DATE_RANGE_RE = re.compile(
+    r"("
+    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
+    r"|\d{1,2}/\d{4}"
+    r"|\d{4}"
+    r")"
+    r"\s*[-–—to]+\s*"
+    r"("
+    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
+    r"|\d{1,2}/\d{4}"
+    r"|\d{4}"
+    r"|present|current|now"
+    r")",
+    re.I,
+)
+
+# Contact info
+_EMAIL_RE    = re.compile(r"[\w.+\-]+@[\w\-]+\.[\w.\-]+")
+_PHONE_RE    = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
+_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
+
+
+# ── Text extraction ───────────────────────────────────────────────────────────

 def extract_text_from_pdf(file_bytes: bytes) -> str:
-    """Extract raw text from PDF bytes using pdfplumber.
-
-    Returns empty string if extraction fails for any page.
-    """
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(pages)


 def extract_text_from_docx(file_bytes: bytes) -> str:
-    """Extract raw text from DOCX bytes using python-docx."""
    doc = Document(io.BytesIO(file_bytes))
    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())


-def _llm_structure(raw_text: str) -> str:
-    """Call LLM to convert raw resume text to JSON. Returns raw LLM output string."""
-    from scripts.llm_router import LLMRouter
-    prompt = (
-        "You are a resume parser. Convert the following resume text into a JSON object.\n\n"
-        "Required JSON keys:\n"
-        "- name (string)\n"
-        "- email (string, may be empty)\n"
-        "- phone (string, may be empty)\n"
-        "- career_summary (string: 2-4 sentence professional summary)\n"
-        "- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n"
-        "- education (list of objects with: institution, degree, field, graduation_year)\n"
-        "- skills (list of strings)\n"
-        "- achievements (list of strings, may be empty)\n\n"
-        "Return ONLY valid JSON. No markdown, no explanation.\n\n"
-        f"Resume text:\n{raw_text[:4000]}"
+# ── Section splitter ──────────────────────────────────────────────────────────
+
+def _split_sections(text: str) -> dict[str, list[str]]:
+    """Split resume text into named sections. Lines before the first recognized
+    section header go into 'header' (assumed to be contact/name block); every
+    later line is appended to the most recently matched section."""
+    sections: dict[str, list[str]] = {"header": []}
+    current = "header"
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        matched = False
+        for section, pattern in _SECTION_NAMES.items():
+            # Match if the line IS a section header (short + matches pattern)
+            if pattern.match(stripped) and len(stripped.split()) <= 5:
+                current = section
+                matched = True
+                break
+        if not matched:
+            sections.setdefault(current, []).append(stripped)
+    return sections
+
+
+# ── Contact info ──────────────────────────────────────────────────────────────
+
+def _parse_header(lines: list[str]) -> dict:
+    """Extract name, email, phone from the top-of-resume block."""
+    full_text = "\n".join(lines)
+    email_m   = _EMAIL_RE.search(full_text)
+    phone_m   = _PHONE_RE.search(full_text)
+
+    # Name heuristic: first of the top five lines with no @, no leading digit, and 2-5 purely alphabetic words
+    name = ""
+    for line in lines[:5]:
+        if "@" in line or re.match(r"^\d", line.strip()):
+            continue
+        # Skip lines that look like city/state/zip
+        if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line):
+            continue
+        candidate = re.sub(r"[|•·,]+", " ", line).strip()
+        candidate = re.sub(r"\s{2,}", " ", candidate)
+        if 2 <= len(candidate.split()) <= 5 and candidate.replace(" ", "").isalpha():
+            name = candidate
+            break
+
+    return {
+        "name":  name,
+        "email": email_m.group(0) if email_m else "",
+        "phone": phone_m.group(0) if phone_m else "",
+    }
+
+
+# ── Experience ────────────────────────────────────────────────────────────────
+
+def _parse_experience(lines: list[str]) -> list[dict]:
+    """Parse work experience entries from section lines.
+
+    Handles two common layouts:
+      (A) Title | Company          (B) Title | Company | Dates
+          Dates                        • bullet
+          • bullet
+    """
+    entries: list[dict] = []
+    current: dict | None = None
+    prev_line = ""
+
+    for line in lines:
+        date_match = _DATE_RANGE_RE.search(line)
+        if date_match:
+            if current:
+                entries.append(current)
+            # Title/company may be on this line (layout B) or the previous line (layout A)
+            same_line = _DATE_RANGE_RE.sub("", line).strip(" –—|-•")
+            header = same_line if same_line.strip() else prev_line
+            parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1)
+            current = {
+                "title":      parts[0].strip() if parts else "",
+                "company":    parts[1].strip() if len(parts) > 1 else "",
+                "start_date": date_match.group(1),
+                "end_date":   date_match.group(2),
+                "bullets":    [],
+            }
+            prev_line = ""
+        elif current is not None:
+            is_bullet = bool(re.match(r"^[•\-–—*◦▪▸►]\s*", line))
+            looks_like_header = (
+                not is_bullet
+                and " | " in line
+                and not _DATE_RE.search(line)
            )
-    router = LLMRouter()
-    return router.complete(prompt, max_tokens=2048)
+            if looks_like_header:
+                # Likely the title/company of the next entry — hold it as prev_line
+                prev_line = line
+            else:
+                clean = re.sub(r"^[•\-–—*◦▪▸►]\s*", "", line).strip()
+                if clean:
+                    current["bullets"].append(clean)
+                prev_line = line
+        else:
+            prev_line = line
+
+    if current:
+        entries.append(current)
+
+    return entries


-def structure_resume(raw_text: str) -> tuple[dict, str]:
-    """Convert raw resume text to a structured dict via LLM.
+# ── Education ─────────────────────────────────────────────────────────────────
+
+def _parse_education(lines: list[str]) -> list[dict]:
+    entries: list[dict] = []
+    current: dict | None = None
+    prev_line = ""
+
+    for line in lines:
+        if _DEGREE_RE.search(line):
+            if current:
+                entries.append(current)
+            current = {
+                "institution":      "",
+                "degree":           "",
+                "field":            "",
+                "graduation_year":  "",
+            }
+            year_m = re.search(r"\b(19|20)\d{2}\b", line)
+            if year_m:
+                current["graduation_year"] = year_m.group(0)
+            degree_m = _DEGREE_RE.search(line)
+            if degree_m:
+                current["degree"] = degree_m.group(0).upper()
+            remainder = _DEGREE_RE.sub("", _DATE_RE.sub("", line))
+            remainder = re.sub(r"\b(19|20)\d{2}\b", "", remainder)
+            current["field"] = remainder.strip(" ,–—|•.")
+            # Layout A: institution was on the line before the degree line
+            if prev_line and not _DEGREE_RE.search(prev_line):
+                current["institution"] = prev_line.strip(" ,–—|•")
+        elif current is not None and not current["institution"]:
+            # Layout B: institution follows the degree line
+            clean = line.strip(" ,–—|•")
+            if clean:
+                current["institution"] = clean
+        prev_line = line.strip()
+
+    if current:
+        entries.append(current)
+
+    return entries
+
+
+# ── Skills ────────────────────────────────────────────────────────────────────
+
+def _parse_skills(lines: list[str]) -> list[str]:
+    skills: list[str] = []
+    for line in lines:
+        # Split on common delimiters
+        for item in re.split(r"[,|•·/]+", line):
+            clean = item.strip(" -–—*◦▪▸►()")
+            if 1 < len(clean) <= 50:
+                skills.append(clean)
+    return skills
+
+
+# ── Main parser ───────────────────────────────────────────────────────────────
+
+def parse_resume(raw_text: str) -> tuple[dict, str]:
+    """Parse resume text into a structured dict using section detection + regex.

    Returns (result_dict, error_message). result_dict is empty on failure.
    """
-    import traceback
    if not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."
-    raw = ""
+
    try:
-        raw = _llm_structure(raw_text)
-        cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
-        cleaned = re.sub(r"\s*```$", "", cleaned)
-        try:
-            return json.loads(cleaned), ""
-        except json.JSONDecodeError:
-            # Try json-repair before giving up — handles truncation and minor malformations
-            from json_repair import repair_json
-            repaired = repair_json(cleaned)
-            result = json.loads(repaired)
-            log.warning("[resume_parser] Used json-repair to recover malformed output")
+        sections = _split_sections(raw_text)
+        contact  = _parse_header(sections.get("header", []))
+        result = {
+            **contact,
+            "career_summary": " ".join(sections.get("summary", [])),
+            "experience":     _parse_experience(sections.get("experience", [])),
+            "education":      _parse_education(sections.get("education", [])),
+            "skills":         _parse_skills(sections.get("skills", [])),
+            "achievements":   sections.get("achievements", []),
+        }
        return result, ""
-    except json.JSONDecodeError as e:
-        log.error("[resume_parser] JSON parse error (even after repair): %s\nRaw output:\n%s", e, raw[:500])
-        return {}, f"LLM returned invalid JSON: {e}"
    except Exception as e:
-        log.error("[resume_parser] Error:\n%s", traceback.format_exc())
+        import traceback
+        log.error("[resume_parser] parse_resume error:\n%s", traceback.format_exc())
        return {}, str(e)
+
+
+# ── LLM enhancement (career summary only, optional) ──────────────────────────
+
+def _llm_career_summary(raw_text: str) -> str:
+    """Use LLM to generate a career summary. Returns empty string on any failure."""
+    try:
+        from scripts.llm_router import LLMRouter
+        prompt = (
+            "Write a 2-3 sentence professional career summary for this candidate "
+            "based on their resume. Return only the summary text, no labels.\n\n"
+            f"Resume:\n{raw_text[:1500]}"
+        )
+        return LLMRouter().complete(prompt)
+    except Exception:
+        return ""
+
+
+# ── Public entry point ────────────────────────────────────────────────────────
+
+def structure_resume(raw_text: str) -> tuple[dict, str]:
+    """Parse resume and optionally enhance career_summary via LLM.
+
+    Returns (result_dict, error_message).
+    """
+    result, err = parse_resume(raw_text)
+    if not result:
+        return result, err
+
+    # Enhance career summary via LLM if the section wasn't found in the document
+    if not result.get("career_summary"):
+        try:
+            summary = _llm_career_summary(raw_text)
+        except Exception:
+            summary = ""
+        if summary:
+            result["career_summary"] = summary.strip()
+
+    return result, ""
--- a/tests/test_resume_parser.py
+++ b/tests/test_resume_parser.py
@ -41,51 +41,62 @@ def test_extract_docx_returns_string():
    assert "Senior Developer" in result


-def test_structure_resume_returns_dict():
-    """structure_resume returns a dict with expected keys when LLM returns valid JSON."""
-    raw_text = "Jane Doe\nSoftware Engineer at Acme 2020-2023"
-    llm_response = '{"name": "Jane Doe", "experience": [{"company": "Acme", "title": "Engineer", "bullets": []}], "skills": [], "education": []}'
-
-    with patch("scripts.resume_parser._llm_structure", return_value=llm_response):
+def test_structure_resume_returns_tuple_with_keys():
+    """structure_resume returns (dict, str) tuple with expected keys from plain text."""
+    raw_text = (
+        "Jane Doe\njane@example.com\n\n"
+        "Experience\nSoftware Engineer | Acme Corp\nJan 2020 - Dec 2023\n• Built things\n\n"
+        "Skills\nPython, SQL"
+    )
    from scripts.resume_parser import structure_resume
-        result = structure_resume(raw_text)
+    result, err = structure_resume(raw_text)

+    assert err == ""
    assert isinstance(result, dict)
    assert "experience" in result
    assert isinstance(result["experience"], list)
    assert result["name"] == "Jane Doe"
+    assert result["email"] == "jane@example.com"


-def test_structure_resume_strips_markdown_fences():
-    """structure_resume handles LLM output wrapped in ```json ... ``` fences."""
-    raw_text = "Some resume"
-    llm_response = '```json\n{"name": "Bob", "experience": []}\n```'
-
-    with patch("scripts.resume_parser._llm_structure", return_value=llm_response):
+def test_structure_resume_empty_text_returns_error():
+    """structure_resume returns empty dict + error message for empty input."""
    from scripts.resume_parser import structure_resume
-        result = structure_resume(raw_text)
+    result, err = structure_resume("   ")

-    assert result.get("name") == "Bob"
-
-
-def test_structure_resume_invalid_json_returns_empty():
-    """structure_resume returns {} on invalid JSON instead of crashing."""
-    with patch("scripts.resume_parser._llm_structure", return_value="not json at all"):
-        from scripts.resume_parser import structure_resume
-        result = structure_resume("some text")
-
-    assert isinstance(result, dict)
    assert result == {}
+    assert err != ""


-def test_structure_resume_llm_exception_returns_empty():
-    """structure_resume returns {} when LLM raises an exception."""
-    with patch("scripts.resume_parser._llm_structure", side_effect=Exception("LLM down")):
+def test_parse_resume_contact_extraction():
+    """parse_resume correctly extracts name, email, and phone from header block."""
+    raw_text = (
+        "Alice Smith\nalice.smith@email.com | (206) 555-9999\n\n"
+        "Skills\nLeadership, Communication"
+    )
+    from scripts.resume_parser import parse_resume
+    result, err = parse_resume(raw_text)
+
+    assert err == ""
+    assert result["name"] == "Alice Smith"
+    assert result["email"] == "alice.smith@email.com"
+    assert "555-9999" in result["phone"]
+
+
+def test_structure_resume_llm_failure_still_returns_result():
+    """structure_resume returns usable result even when LLM career summary fails."""
+    raw_text = (
+        "Bob Jones\nbob@test.com\n\n"
+        "Skills\nProject Management, Agile"
+    )
+    with patch("scripts.resume_parser._llm_career_summary", side_effect=Exception("LLM down")):
        from scripts.resume_parser import structure_resume
-        result = structure_resume("some text")
+        result, err = structure_resume(raw_text)

-    assert isinstance(result, dict)
-    assert result == {}
+    # Regex parse should still succeed even if LLM summary enhancement fails
+    assert err == ""
+    assert result["name"] == "Bob Jones"
+    assert "Project Management" in result["skills"]


 def test_extract_pdf_empty_page_returns_string():