refactor: replace LLM-based resume parser with section regex parser

Primary parse path is now fully deterministic — no LLM, no token limits,
no JSON generation. Handles two-column experience headers, institution-before-
or-after-degree education layouts, and header bleed prevention via
looks_like_header detection.

LLM path retained as optional career_summary enhancement only (1500 chars,
falls back silently). structure_resume() now returns tuple[dict, str].
Tests updated to match the new API.
This commit is contained in:
pyr0ball 2026-02-26 07:34:25 -08:00
parent 9297477ba0
commit b9f5dd1fc3
2 changed files with 312 additions and 81 deletions

View file

@ -1,86 +1,306 @@
""" """
Resume parser extract text from PDF/DOCX and structure via LLM. Resume parser extract text from PDF/DOCX and structure via section parsing.
Fast path: file bytes raw text LLM structures into resume dict. Primary path: regex + section detection (no LLM, no token limits).
Result dict keys mirror plain_text_resume.yaml sections. Optional enhancement: LLM-generated career_summary if a capable backend is configured.
Falls back to empty dict on any LLM/parsing error caller should Falls back to empty dict on unrecoverable errors caller shows the form builder.
then show the guided form builder.
""" """
from __future__ import annotations from __future__ import annotations
import io import io
import json import json
import logging import logging
import re import re
from pathlib import Path
import pdfplumber import pdfplumber
from docx import Document from docx import Document
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# ── Section header detection ──────────────────────────────────────────────────
_SECTION_NAMES = {
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I),
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I),
"education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I),
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I),
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I),
}
# Degrees — used to detect education lines
_DEGREE_RE = re.compile(
r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?|bachelor|master|associate|doctorate|diploma)\b",
re.I,
)
# Date patterns for experience entries: "Jan 2020", "2020", "01/2020", "2019 - 2022"
_DATE_RE = re.compile(
r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|"
r"july|august|september|october|november|december)?\s*\d{4}\b"
r"|\b\d{1,2}/\d{4}\b",
re.I,
)
_DATE_RANGE_RE = re.compile(
r"("
r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
r"|\d{1,2}/\d{4}"
r"|\d{4}"
r")"
r"\s*[-—to]+\s*"
r"("
r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
r"|\d{1,2}/\d{4}"
r"|\d{4}"
r"|present|current|now"
r")",
re.I,
)
# Contact info
_EMAIL_RE = re.compile(r"[\w.+\-]+@[\w\-]+\.[\w.\-]+")
_PHONE_RE = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
# ── Text extraction ───────────────────────────────────────────────────────────

def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract raw text from PDF bytes using pdfplumber.

    Pages that yield no text contribute an empty string, so the result is
    always a str (possibly empty for image-only PDFs).
    """
    buffer = io.BytesIO(file_bytes)
    with pdfplumber.open(buffer) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
        return "\n".join(pages)
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract raw text from DOCX bytes using python-docx.

    Blank paragraphs are skipped; remaining paragraph texts are newline-joined.
    """
    paragraphs = Document(io.BytesIO(file_bytes)).paragraphs
    return "\n".join(p.text for p in paragraphs if p.text.strip())
# ── Section splitter ──────────────────────────────────────────────────────────

def _split_sections(text: str) -> dict[str, list[str]]:
    """Split resume text into named sections.

    Lines preceding any recognized section header accumulate under 'header'
    (assumed to be the contact/name block). A line counts as a header only
    when it matches a known pattern AND is short (<= 5 words), which keeps
    body sentences that merely start with "experience" etc. from bleeding in.
    """
    sections: dict[str, list[str]] = {"header": []}
    bucket = "header"
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        header_hit = next(
            (
                name
                for name, pattern in _SECTION_NAMES.items()
                if pattern.match(line) and len(line.split()) <= 5
            ),
            None,
        )
        if header_hit:
            bucket = header_hit
        else:
            sections.setdefault(bucket, []).append(line)
    return sections
# ── Contact info ──────────────────────────────────────────────────────────────

def _parse_header(lines: list[str]) -> dict:
    """Extract name, email, phone from the top-of-resume block.

    Name heuristic: the first of the top 5 lines that contains no '@', does
    not start with a digit, does not look like a city/state/zip line, and —
    after delimiter cleanup — is 2-5 purely alphabetic words.
    """
    blob = "\n".join(lines)
    email_match = _EMAIL_RE.search(blob)
    phone_match = _PHONE_RE.search(blob)

    name = ""
    for line in lines[:5]:
        if "@" in line or re.match(r"^\d", line.strip()):
            continue
        # Skip lines that look like city/state/zip
        if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line):
            continue
        candidate = re.sub(r"[|•·,]+", " ", line).strip()
        candidate = re.sub(r"\s{2,}", " ", candidate)
        word_count = len(candidate.split())
        if 2 <= word_count <= 5 and candidate.replace(" ", "").isalpha():
            name = candidate
            break

    return {
        "name": name,
        "email": email_match.group(0) if email_match else "",
        "phone": phone_match.group(0) if phone_match else "",
    }
# ── Experience ────────────────────────────────────────────────────────────────

def _parse_experience(lines: list[str]) -> list[dict]:
    """Parse work experience entries from section lines.

    Handles two common layouts:
      (A) "Title | Company" on one line, dates on the next;
      (B) "Title | Company | Dates" all on one line.
    A date-range line always starts a new entry; subsequent lines are bullets
    unless they look like the next entry's "Title | Company" header.
    """
    bullet_prefix = re.compile(r"^[•\-–—*◦▪▸►]\s*")
    jobs: list[dict] = []
    entry: dict | None = None
    pending = ""  # most recent non-date line; layout-A title/company candidate

    for raw in lines:
        dates = _DATE_RANGE_RE.search(raw)
        if dates:
            if entry:
                jobs.append(entry)
            # Title/company may share this line (layout B) or be `pending` (layout A)
            leftover = _DATE_RANGE_RE.sub("", raw).strip(" –—|-•")
            source = leftover if leftover.strip() else pending
            pieces = re.split(r"\s{2,}|[|•·,–—]\s*", source.strip(), maxsplit=1)
            entry = {
                "title": pieces[0].strip() if pieces else "",
                "company": pieces[1].strip() if len(pieces) > 1 else "",
                "start_date": dates.group(1),
                "end_date": dates.group(2),
                "bullets": [],
            }
            pending = ""
            continue
        if entry is not None:
            bulleted = bool(bullet_prefix.match(raw))
            if not bulleted and " | " in raw and not _DATE_RE.search(raw):
                # Likely the title/company of the next entry — hold it as pending
                pending = raw
                continue
            text = bullet_prefix.sub("", raw).strip()
            if text:
                entry["bullets"].append(text)
        pending = raw

    if entry:
        jobs.append(entry)
    return jobs
# ── Education ─────────────────────────────────────────────────────────────────

def _parse_education(lines: list[str]) -> list[dict]:
    """Parse education entries from section lines.

    A line containing a degree keyword starts a new entry. The institution
    may appear on the line BEFORE the degree line (layout A) or on the line
    AFTER it (layout B); both are supported.
    """
    entries: list[dict] = []
    current: dict | None = None
    prev_line = ""
    for line in lines:
        degree_m = _DEGREE_RE.search(line)
        if degree_m:
            if current:
                entries.append(current)
            current = {
                "institution": "",
                "degree": degree_m.group(0).upper(),
                "field": "",
                "graduation_year": "",
            }
            year_m = re.search(r"\b(19|20)\d{2}\b", line)
            if year_m:
                current["graduation_year"] = year_m.group(0)
            # Field of study = the line minus degree, dates, and year tokens.
            remainder = _DEGREE_RE.sub("", _DATE_RE.sub("", line))
            remainder = re.sub(r"\b(19|20)\d{2}\b", "", remainder)
            current["field"] = remainder.strip(" ,–—|•.")
            # Layout A: institution was on the line before the degree line
            if prev_line and not _DEGREE_RE.search(prev_line):
                current["institution"] = prev_line.strip(" ,–—|•")
            prev_line = line.strip()
        elif current is not None and not current["institution"]:
            # Layout B: institution follows the degree line
            clean = line.strip(" ,–—|•")
            if clean:
                current["institution"] = clean
                # Fix: this line is consumed as THIS entry's institution —
                # clear prev_line so the next degree line doesn't also claim
                # it via the layout-A branch above.
                prev_line = ""
            else:
                prev_line = line.strip()
        else:
            prev_line = line.strip()
    if current:
        entries.append(current)
    return entries
# ── Skills ────────────────────────────────────────────────────────────────────
def _parse_skills(lines: list[str]) -> list[str]:
skills: list[str] = []
for line in lines:
# Split on common delimiters
for item in re.split(r"[,|•·/]+", line):
clean = item.strip(" -–—*◦▪▸►()")
if 1 < len(clean) <= 50:
skills.append(clean)
return skills
# ── Main parser ───────────────────────────────────────────────────────────────

def parse_resume(raw_text: str) -> tuple[dict, str]:
    """Parse resume text into a structured dict using section detection + regex.

    Returns (result_dict, error_message). result_dict is empty on failure.
    """
    if not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."
    try:
        sections = _split_sections(raw_text)
        parsed = dict(_parse_header(sections.get("header", [])))
        parsed["career_summary"] = " ".join(sections.get("summary", []))
        parsed["experience"] = _parse_experience(sections.get("experience", []))
        parsed["education"] = _parse_education(sections.get("education", []))
        parsed["skills"] = _parse_skills(sections.get("skills", []))
        parsed["achievements"] = sections.get("achievements", [])
        return parsed, ""
    except Exception as exc:
        import traceback

        log.error("[resume_parser] parse_resume error:\n%s", traceback.format_exc())
        return {}, str(exc)
# ── LLM enhancement (career summary only, optional) ──────────────────────────
def _llm_career_summary(raw_text: str) -> str:
"""Use LLM to generate a career summary. Returns empty string on any failure."""
try:
from scripts.llm_router import LLMRouter
prompt = (
"Write a 2-3 sentence professional career summary for this candidate "
"based on their resume. Return only the summary text, no labels.\n\n"
f"Resume:\n{raw_text[:1500]}"
)
return LLMRouter().complete(prompt)
except Exception:
return ""
# ── Public entry point ────────────────────────────────────────────────────────

def structure_resume(raw_text: str) -> tuple[dict, str]:
    """Parse resume and optionally enhance career_summary via LLM.

    Returns (result_dict, error_message). The LLM is consulted only when the
    document itself had no summary section, and any LLM failure is swallowed.
    """
    parsed, error = parse_resume(raw_text)
    if not parsed:
        return parsed, error
    # Enhance career summary via LLM if the section wasn't found in the document
    if not parsed.get("career_summary"):
        try:
            generated = _llm_career_summary(raw_text)
        except Exception:
            generated = ""
        if generated:
            parsed["career_summary"] = generated.strip()
    return parsed, ""

View file

@ -41,51 +41,62 @@ def test_extract_docx_returns_string():
assert "Senior Developer" in result assert "Senior Developer" in result
def test_structure_resume_returns_tuple_with_keys():
    """structure_resume returns (dict, str) tuple with expected keys from plain text."""
    raw_text = (
        "Jane Doe\njane@example.com\n\n"
        "Experience\nSoftware Engineer | Acme Corp\nJan 2020 - Dec 2023\n• Built things\n\n"
        "Skills\nPython, SQL"
    )
    from scripts.resume_parser import structure_resume

    result, err = structure_resume(raw_text)

    assert err == ""
    assert isinstance(result, dict)
    assert "experience" in result
    assert isinstance(result["experience"], list)
    assert result["name"] == "Jane Doe"
    assert result["email"] == "jane@example.com"
def test_structure_resume_empty_text_returns_error():
    """structure_resume returns empty dict + error message for empty input."""
    from scripts.resume_parser import structure_resume

    result, err = structure_resume("   ")

    assert result == {}
    assert err != ""
def test_parse_resume_contact_extraction():
    """parse_resume correctly extracts name, email, and phone from header block."""
    raw_text = (
        "Alice Smith\nalice.smith@email.com | (206) 555-9999\n\n"
        "Skills\nLeadership, Communication"
    )
    from scripts.resume_parser import parse_resume

    result, err = parse_resume(raw_text)

    assert err == ""
    assert result["name"] == "Alice Smith"
    assert result["email"] == "alice.smith@email.com"
    assert "555-9999" in result["phone"]
def test_structure_resume_llm_failure_still_returns_result():
    """structure_resume returns usable result even when LLM career summary fails."""
    raw_text = (
        "Bob Jones\nbob@test.com\n\n"
        "Skills\nProject Management, Agile"
    )
    with patch("scripts.resume_parser._llm_career_summary", side_effect=Exception("LLM down")):
        from scripts.resume_parser import structure_resume

        result, err = structure_resume(raw_text)

    # Regex parse should still succeed even if LLM summary enhancement fails
    assert err == ""
    assert result["name"] == "Bob Jones"
    assert "Project Management" in result["skills"]
def test_extract_pdf_empty_page_returns_string(): def test_extract_pdf_empty_page_returns_string():