refactor: replace LLM-based resume parser with section regex parser

Primary parse path is now fully deterministic — no LLM, no token limits,
no JSON generation. Handles two-column experience headers, institution-before-
or-after-degree education layouts, and header bleed prevention via
looks_like_header detection.

LLM path retained as optional career_summary enhancement only (1500 chars,
falls back silently). structure_resume() now returns tuple[dict, str].
Tests updated to match the new API.
This commit is contained in:
pyr0ball 2026-02-26 07:34:25 -08:00
parent c8d8434371
commit 26563a0990
2 changed files with 312 additions and 81 deletions

View file

@ -1,86 +1,306 @@
"""
Resume parser extract text from PDF/DOCX and structure via LLM.
Resume parser — extract text from PDF/DOCX and structure it via section parsing.
Fast path: file bytes raw text LLM structures into resume dict.
Result dict keys mirror plain_text_resume.yaml sections.
Primary path: regex + section detection (no LLM, no token limits).
Optional enhancement: LLM-generated career_summary if a capable backend is configured.
Falls back to empty dict on any LLM/parsing error caller should
then show the guided form builder.
Falls back to an empty dict on unrecoverable errors; the caller then shows the form builder.
"""
from __future__ import annotations
import io
import json
import logging
import re
from pathlib import Path
import pdfplumber
from docx import Document
log = logging.getLogger(__name__)
# ── Section header detection ──────────────────────────────────────────────────
_SECTION_NAMES = {
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I),
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I),
"education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I),
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I),
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I),
}
# Degrees — used to detect education lines
_DEGREE_RE = re.compile(
r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?|bachelor|master|associate|doctorate|diploma)\b",
re.I,
)
# Date patterns for experience entries: "Jan 2020", "2020", "01/2020", "2019 - 2022"
_DATE_RE = re.compile(
r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|"
r"july|august|september|october|november|december)?\s*\d{4}\b"
r"|\b\d{1,2}/\d{4}\b",
re.I,
)
_DATE_RANGE_RE = re.compile(
r"("
r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
r"|\d{1,2}/\d{4}"
r"|\d{4}"
r")"
r"\s*[-—to]+\s*"
r"("
r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
r"|\d{1,2}/\d{4}"
r"|\d{4}"
r"|present|current|now"
r")",
re.I,
)
# Contact info
_EMAIL_RE = re.compile(r"[\w.+\-]+@[\w\-]+\.[\w.\-]+")
_PHONE_RE = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
# ── Text extraction ───────────────────────────────────────────────────────────
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract raw text from PDF bytes using pdfplumber.

    The text of each page is joined with newlines. A page for which
    ``extract_text()`` yields a falsy value contributes an empty string
    instead of aborting the whole extraction.
    """
    page_texts: list[str] = []
    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
        for page in pdf.pages:
            # extract_text() may return None; normalise that to "".
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract raw text from DOCX bytes using python-docx.

    Paragraphs whose text is empty or whitespace-only are skipped; the
    remaining paragraph texts are joined with newlines.
    """
    document = Document(io.BytesIO(file_bytes))
    kept: list[str] = []
    for paragraph in document.paragraphs:
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n".join(kept)
def _llm_structure(raw_text: str) -> str:
"""Call LLM to convert raw resume text to JSON. Returns raw LLM output string."""
from scripts.llm_router import LLMRouter
prompt = (
"You are a resume parser. Convert the following resume text into a JSON object.\n\n"
"Required JSON keys:\n"
"- name (string)\n"
"- email (string, may be empty)\n"
"- phone (string, may be empty)\n"
"- career_summary (string: 2-4 sentence professional summary)\n"
"- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n"
"- education (list of objects with: institution, degree, field, graduation_year)\n"
"- skills (list of strings)\n"
"- achievements (list of strings, may be empty)\n\n"
"Return ONLY valid JSON. No markdown, no explanation.\n\n"
f"Resume text:\n{raw_text[:4000]}"
# ── Section splitter ──────────────────────────────────────────────────────────
def _split_sections(text: str) -> dict[str, list[str]]:
    """Split resume text into named sections.

    Every non-blank line is stripped and appended to the section that is
    current at that point. Lines seen before any heading go into ``"header"``
    (assumed to be the contact/name block).

    A line is treated as a section heading when it has at most five
    whitespace-separated words and starts with one of the ``_SECTION_NAMES``
    keywords *as a whole word* (an optional plural "s" or adjectival "al"
    suffix is tolerated, e.g. "Experiences", "Educational Background").
    Requiring a whole word keeps taglines such as "Experienced Project
    Manager" or "Skilled Communicator" from being mistaken for headings.

    If a heading line carries inline content after a colon
    ("Skills: Python, SQL"), that content is kept as the first line of the
    section instead of being discarded with the heading.

    Returns a dict mapping section key -> list of stripped lines. A section
    key is only present if at least one line was stored for it (``"header"``
    is always present).
    """
    sections: dict[str, list[str]] = {"header": []}
    current = "header"

    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue

        heading = None
        rest = ""
        # Headings are short; long lines are always content.
        if len(stripped.split()) <= 5:
            for section, pattern in _SECTION_NAMES.items():
                m = pattern.match(stripped)
                if m is None:
                    continue
                tail = stripped[m.end():]
                # The keyword must end at a word boundary (allowing "s"/"al").
                if re.match(r"(?:s|al)?(?!\w)", tail):
                    heading = section
                    rest = tail
                    break

        if heading is None:
            sections.setdefault(current, []).append(stripped)
            continue

        current = heading
        # Preserve "Heading: inline content" instead of dropping the content.
        _, colon, inline = rest.partition(":")
        inline = inline.strip()
        if colon and inline:
            sections.setdefault(current, []).append(inline)

    return sections
# ── Contact info ──────────────────────────────────────────────────────────────
def _parse_header(lines: list[str]) -> dict:
    """Pull name, email and phone out of the top-of-resume block.

    Email and phone are the first regex hits anywhere in the block. The name
    is the first of the leading five lines that survives the heuristics in
    ``_name_candidate``; it is "" when no line qualifies.
    """
    blob = "\n".join(lines)
    email_hit = _EMAIL_RE.search(blob)
    phone_hit = _PHONE_RE.search(blob)

    def _name_candidate(raw: str) -> str:
        """Return the cleaned line if it looks like a person's name, else ""."""
        # Lines with an email address, or starting with a digit, are not names.
        if "@" in raw or re.match(r"^\d", raw.strip()):
            return ""
        # Skip lines that look like city/state/zip
        if re.search(r"\b[A-Z]{2}\b\s*\d{5}", raw):
            return ""
        cleaned = re.sub(r"[|•·,]+", " ", raw).strip()
        cleaned = re.sub(r"\s{2,}", " ", cleaned)
        word_count = len(cleaned.split())
        if word_count < 2 or word_count > 5:
            return ""
        if not cleaned.replace(" ", "").isalpha():
            return ""
        return cleaned

    # First non-empty candidate wins; evaluation stops at the first hit.
    name = next(filter(None, map(_name_candidate, lines[:5])), "")

    email = email_hit.group(0) if email_hit else ""
    phone = phone_hit.group(0) if phone_hit else ""
    return {
        "name": name,
        "email": email,
        "phone": phone,
    }
# ── Experience ────────────────────────────────────────────────────────────────
def _parse_experience(lines: list[str]) -> list[dict]:
    """Parse work experience entries from section lines.

    Handles two common layouts:
    (A) Title | Company        (B) Title | Company | Dates
        Dates                       • bullet
        • bullet

    A line containing a date range starts a new entry. Its title/company is
    taken from the rest of that line (layout B) or, if nothing else is on the
    line, from the preceding line (layout A). All other lines are collected
    as bullets of the current entry.

    Header-bleed handling for layout A:
    * a non-bullet line containing " | " and no date is held back as the
      probable header of the next entry instead of being stored as a bullet;
    * a plain line that was stored as a bullet but turns out to be the header
      of the next entry (a date-only line follows it) is removed from the
      previous entry's bullets again, so it is not reported twice;
    * a bullet-marked line is never used as an entry header.

    Returns a list of dicts with keys ``title``, ``company``, ``start_date``,
    ``end_date`` and ``bullets``.
    """
    bullet_re = re.compile(r"^[•\-–—*◦▪▸►]\s*")

    entries: list[dict] = []
    current: dict | None = None
    prev_line = ""                # candidate header for a following date-only line
    prev_stored_as_bullet = False  # True if prev_line is also current["bullets"][-1]

    for line in lines:
        date_match = _DATE_RANGE_RE.search(line)
        if date_match:
            # Title/company may be on this line (layout B) or the previous line (layout A)
            same_line = _DATE_RANGE_RE.sub("", line).strip(" –—|-•")
            if same_line.strip():
                header = same_line
            else:
                header = prev_line
                if header and prev_stored_as_bullet and current is not None:
                    # The line was really this entry's header, not a bullet
                    # of the previous entry.
                    current["bullets"].pop()

            if current is not None:
                entries.append(current)

            parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1)
            current = {
                "title": parts[0].strip(),
                "company": parts[1].strip() if len(parts) > 1 else "",
                "start_date": date_match.group(1),
                "end_date": date_match.group(2),
                "bullets": [],
            }
            prev_line = ""
            prev_stored_as_bullet = False
        elif current is not None:
            is_bullet = bool(bullet_re.match(line))
            looks_like_header = (
                not is_bullet
                and " | " in line
                and not _DATE_RE.search(line)
            )
            if looks_like_header:
                # Likely the title/company of the next entry — hold it as prev_line
                prev_line = line
                prev_stored_as_bullet = False
            else:
                clean = bullet_re.sub("", line).strip()
                if clean:
                    current["bullets"].append(clean)
                # A bullet-marked line cannot be the header of the next entry.
                prev_line = "" if is_bullet else line
                prev_stored_as_bullet = bool(clean) and not is_bullet
        else:
            # Before the first entry: remember the line as a possible header.
            prev_line = line

    if current is not None:
        entries.append(current)

    return entries
def structure_resume(raw_text: str) -> tuple[dict, str]:
"""Convert raw resume text to a structured dict via LLM.
# ── Education ─────────────────────────────────────────────────────────────────
def _parse_education(lines: list[str]) -> list[dict]:
    """Parse education entries from section lines.

    A line containing a degree keyword (``_DEGREE_RE``) starts a new entry.
    The degree keyword (upper-cased), a four-digit 19xx/20xx year and the
    remaining text of that line become ``degree``, ``graduation_year`` and
    ``field``. The institution is taken from:

    * Layout A — the line directly before the degree line, or
    * Layout B — the first non-empty line after the degree line, when the
      entry has no institution yet.

    A line consumed as the institution of one entry (layout B) is never
    reused as the "preceding line" institution of the next entry; otherwise
    every entry after the first would inherit its predecessor's institution.

    Returns a list of dicts with keys ``institution``, ``degree``, ``field``
    and ``graduation_year``.
    """
    entries: list[dict] = []
    current: dict | None = None
    prev_line = ""

    for line in lines:
        consumed_as_institution = False
        degree_m = _DEGREE_RE.search(line)

        if degree_m:
            if current is not None:
                entries.append(current)

            year_m = re.search(r"\b(19|20)\d{2}\b", line)
            # Whatever is left after removing dates and the degree keyword
            # is treated as the field of study.
            remainder = _DEGREE_RE.sub("", _DATE_RE.sub("", line))
            remainder = re.sub(r"\b(19|20)\d{2}\b", "", remainder)

            current = {
                "institution": "",
                "degree": degree_m.group(0).upper(),
                "field": remainder.strip(" ,–—|•."),
                "graduation_year": year_m.group(0) if year_m else "",
            }
            # Layout A: institution was on the line before the degree line
            if prev_line and not _DEGREE_RE.search(prev_line):
                current["institution"] = prev_line.strip(" ,–—|•")
        elif current is not None and not current["institution"]:
            # Layout B: institution follows the degree line
            clean = line.strip(" ,–—|•")
            if clean:
                current["institution"] = clean
                consumed_as_institution = True

        # A line already used as an institution must not be picked up again
        # by the next degree line.
        prev_line = "" if consumed_as_institution else line.strip()

    if current is not None:
        entries.append(current)

    return entries
# ── Skills ────────────────────────────────────────────────────────────────────
def _parse_skills(lines: list[str]) -> list[str]:
skills: list[str] = []
for line in lines:
# Split on common delimiters
for item in re.split(r"[,|•·/]+", line):
clean = item.strip(" -–—*◦▪▸►()")
if 1 < len(clean) <= 50:
skills.append(clean)
return skills
# ── Main parser ───────────────────────────────────────────────────────────────
def parse_resume(raw_text: str) -> tuple[dict, str]:
    """Parse resume text into a structured dict using section detection + regex.

    This path is fully deterministic: it makes no LLM call and produces no
    JSON. The text is split into sections, then each section is handed to
    its dedicated parser.

    Returns (result_dict, error_message). result_dict is empty on failure,
    and error_message is "" on success. The result has the keys ``name``,
    ``email``, ``phone``, ``career_summary``, ``experience``, ``education``,
    ``skills`` and ``achievements``.
    """
    # Falsy input (None or "") and whitespace-only text are treated alike.
    if not raw_text or not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."

    try:
        sections = _split_sections(raw_text)
        contact = _parse_header(sections.get("header", []))
        result = {
            **contact,
            "career_summary": " ".join(sections.get("summary", [])),
            "experience": _parse_experience(sections.get("experience", [])),
            "education": _parse_education(sections.get("education", [])),
            "skills": _parse_skills(sections.get("skills", [])),
            "achievements": sections.get("achievements", []),
        }
        return result, ""
    except Exception as e:
        # Top-level boundary: never let a parsing bug escape to the caller,
        # which falls back to the form builder on an empty result.
        import traceback
        log.error("[resume_parser] parse_resume error:\n%s", traceback.format_exc())
        return {}, str(e)
# ── LLM enhancement (career summary only, optional) ──────────────────────────
def _llm_career_summary(raw_text: str) -> str:
    """Use the LLM to generate a career summary from the resume text.

    Only the first 1500 characters of ``raw_text`` are sent. This is a
    best-effort enhancement: any failure (router import, backend error, ...)
    is logged at debug level and an empty string is returned, so callers
    never have to handle an exception from this function.
    """
    try:
        # Imported lazily so the regex parser works without an LLM backend.
        from scripts.llm_router import LLMRouter
        prompt = (
            "Write a 2-3 sentence professional career summary for this candidate "
            "based on their resume. Return only the summary text, no labels.\n\n"
            f"Resume:\n{raw_text[:1500]}"
        )
        # `or ""` keeps the declared str contract if the backend yields None.
        return LLMRouter().complete(prompt) or ""
    except Exception:
        log.debug("[resume_parser] career summary generation failed", exc_info=True)
        return ""
# ── Public entry point ────────────────────────────────────────────────────────
def structure_resume(raw_text: str) -> tuple[dict, str]:
    """Public entry point: regex-parse the resume, then optionally add an
    LLM-written career summary.

    Returns (result_dict, error_message). When parsing yields nothing, the
    empty result and the parser's error message are passed through unchanged.
    The LLM is consulted only when the parsed result has no career summary,
    and a failure there never affects the returned result.
    """
    parsed, error = parse_resume(raw_text)
    if not parsed:
        return parsed, error

    # A summary found in the document itself always wins.
    if parsed.get("career_summary"):
        return parsed, ""

    try:
        generated = _llm_career_summary(raw_text)
    except Exception:
        generated = ""

    if generated:
        parsed["career_summary"] = generated.strip()
    return parsed, ""

View file

@ -41,51 +41,62 @@ def test_extract_docx_returns_string():
assert "Senior Developer" in result
def test_structure_resume_returns_dict():
"""structure_resume returns a dict with expected keys when LLM returns valid JSON."""
raw_text = "Jane Doe\nSoftware Engineer at Acme 2020-2023"
llm_response = '{"name": "Jane Doe", "experience": [{"company": "Acme", "title": "Engineer", "bullets": []}], "skills": [], "education": []}'
with patch("scripts.resume_parser._llm_structure", return_value=llm_response):
def test_structure_resume_returns_tuple_with_keys():
    """structure_resume returns (dict, str) tuple with expected keys from plain text."""
    raw_text = (
        "Jane Doe\njane@example.com\n\n"
        "Experience\nSoftware Engineer | Acme Corp\nJan 2020 - Dec 2023\n• Built things\n\n"
        "Skills\nPython, SQL"
    )
    # The text has no summary section, so structure_resume would call the
    # LLM enhancement; stub it out to keep the test offline and deterministic.
    with patch("scripts.resume_parser._llm_career_summary", return_value=""):
        from scripts.resume_parser import structure_resume
        result, err = structure_resume(raw_text)
    assert err == ""
    assert isinstance(result, dict)
    assert "experience" in result
    assert isinstance(result["experience"], list)
    assert result["name"] == "Jane Doe"
    assert result["email"] == "jane@example.com"
def test_structure_resume_strips_markdown_fences():
"""structure_resume handles LLM output wrapped in ```json ... ``` fences."""
raw_text = "Some resume"
llm_response = '```json\n{"name": "Bob", "experience": []}\n```'
with patch("scripts.resume_parser._llm_structure", return_value=llm_response):
def test_structure_resume_empty_text_returns_error():
    """Whitespace-only input yields an empty dict plus a non-empty error message."""
    from scripts.resume_parser import structure_resume

    blank_input = " "
    parsed, message = structure_resume(blank_input)

    assert parsed == {}
    assert message != ""
def test_structure_resume_llm_exception_returns_empty():
"""structure_resume returns {} when LLM raises an exception."""
with patch("scripts.resume_parser._llm_structure", side_effect=Exception("LLM down")):
def test_parse_resume_contact_extraction():
    """parse_resume pulls name, email and phone out of the header block."""
    from scripts.resume_parser import parse_resume

    header_block = "Alice Smith\nalice.smith@email.com | (206) 555-9999\n\n"
    skills_block = "Skills\nLeadership, Communication"

    parsed, message = parse_resume(header_block + skills_block)

    assert message == ""
    assert parsed["name"] == "Alice Smith"
    assert parsed["email"] == "alice.smith@email.com"
    assert "555-9999" in parsed["phone"]
def test_structure_resume_llm_failure_still_returns_result():
    """A failing LLM career-summary call must not affect the regex-parsed result."""
    resume_text = "Bob Jones\nbob@test.com\n\n" "Skills\nProject Management, Agile"
    llm_failure = Exception("LLM down")

    with patch("scripts.resume_parser._llm_career_summary", side_effect=llm_failure):
        from scripts.resume_parser import structure_resume
        parsed, message = structure_resume(resume_text)

    # Regex parse should still succeed even if LLM summary enhancement fails
    assert message == ""
    assert parsed["name"] == "Bob Jones"
    assert "Project Management" in parsed["skills"]
def test_extract_pdf_empty_page_returns_string():