refactor: replace LLM-based resume parser with section regex parser
Primary parse path is now fully deterministic — no LLM, no token limits, no JSON generation. Handles two-column experience headers, institution-before-or-after-degree education layouts, and header-bleed prevention via looks_like_header detection. LLM path retained as optional career_summary enhancement only (1500 chars, falls back silently). structure_resume() now returns tuple[dict, str]. Tests updated to match the new API.
This commit is contained in:
parent
9297477ba0
commit
b9f5dd1fc3
2 changed files with 312 additions and 81 deletions
|
|
@ -1,86 +1,306 @@
|
||||||
"""
|
"""
|
||||||
Resume parser — extract text from PDF/DOCX and structure via LLM.
|
Resume parser — extract text from PDF/DOCX and structure via section parsing.
|
||||||
|
|
||||||
Fast path: file bytes → raw text → LLM structures into resume dict.
|
Primary path: regex + section detection (no LLM, no token limits).
|
||||||
Result dict keys mirror plain_text_resume.yaml sections.
|
Optional enhancement: LLM-generated career_summary if a capable backend is configured.
|
||||||
|
|
||||||
Falls back to empty dict on any LLM/parsing error — caller should
|
Falls back to empty dict on unrecoverable errors — caller shows the form builder.
|
||||||
then show the guided form builder.
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ── Section header detection ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_SECTION_NAMES = {
|
||||||
|
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I),
|
||||||
|
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I),
|
||||||
|
"education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I),
|
||||||
|
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I),
|
||||||
|
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I),
|
||||||
|
}
|
||||||
|
|
||||||
|
# Degrees — used to detect education lines
|
||||||
|
_DEGREE_RE = re.compile(
|
||||||
|
r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?|bachelor|master|associate|doctorate|diploma)\b",
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Date patterns for experience entries: "Jan 2020", "2020", "01/2020", "2019 - 2022"
|
||||||
|
_DATE_RE = re.compile(
|
||||||
|
r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|"
|
||||||
|
r"july|august|september|october|november|december)?\s*\d{4}\b"
|
||||||
|
r"|\b\d{1,2}/\d{4}\b",
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
_DATE_RANGE_RE = re.compile(
|
||||||
|
r"("
|
||||||
|
r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
|
||||||
|
r"|\d{1,2}/\d{4}"
|
||||||
|
r"|\d{4}"
|
||||||
|
r")"
|
||||||
|
r"\s*[-–—to]+\s*"
|
||||||
|
r"("
|
||||||
|
r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
|
||||||
|
r"|\d{1,2}/\d{4}"
|
||||||
|
r"|\d{4}"
|
||||||
|
r"|present|current|now"
|
||||||
|
r")",
|
||||||
|
re.I,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Contact info
|
||||||
|
_EMAIL_RE = re.compile(r"[\w.+\-]+@[\w\-]+\.[\w.\-]+")
|
||||||
|
_PHONE_RE = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
|
||||||
|
_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Text extraction ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract raw text from PDF bytes using pdfplumber.

    Pages whose text layer is missing contribute an empty string
    (``extract_text`` returns ``None`` for them).
    """
    buffer = io.BytesIO(file_bytes)
    with pdfplumber.open(buffer) as pdf:
        page_texts = []
        for page in pdf.pages:
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Extract raw text from DOCX bytes using python-docx.

    Joins all non-blank paragraphs with newlines.
    """
    document = Document(io.BytesIO(file_bytes))
    non_blank = [para.text for para in document.paragraphs if para.text.strip()]
    return "\n".join(non_blank)
|
||||||
|
|
||||||
|
|
||||||
def _llm_structure(raw_text: str) -> str:
|
# ── Section splitter ──────────────────────────────────────────────────────────


def _split_sections(text: str) -> dict[str, list[str]]:
    """Split resume text into named sections.

    A line is treated as a section header when it matches one of the
    ``_SECTION_NAMES`` patterns AND is short (five words or fewer). Lines
    that don't match a known section header go into 'header' (assumed to be
    the contact/name block).
    """
    buckets: dict[str, list[str]] = {"header": []}
    active = "header"

    for raw_line in text.splitlines():
        content = raw_line.strip()
        if not content:
            continue
        # First matching section pattern wins; None means content line.
        header_hit = next(
            (
                name
                for name, pattern in _SECTION_NAMES.items()
                if pattern.match(content) and len(content.split()) <= 5
            ),
            None,
        )
        if header_hit is not None:
            active = header_hit
        else:
            buckets.setdefault(active, []).append(content)

    return buckets
|
||||||
|
|
||||||
|
|
||||||
def structure_resume(raw_text: str) -> tuple[dict, str]:
|
# ── Contact info ──────────────────────────────────────────────────────────────


def _parse_header(lines: list[str]) -> dict:
    """Extract name, email, phone from the top-of-resume block.

    Returns a dict with keys ``name``, ``email``, ``phone``; each value is
    an empty string when the corresponding field cannot be found.
    """
    full_text = "\n".join(lines)
    email_m = _EMAIL_RE.search(full_text)
    phone_m = _PHONE_RE.search(full_text)

    # Name heuristic: first of the top 5 lines that is 2-5 words of letters,
    # skipping lines with an email, a leading digit, or a state/zip pattern.
    name = ""
    for line in lines[:5]:
        if "@" in line or re.match(r"^\d", line.strip()):
            continue
        # Skip lines that look like city/state/zip
        if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line):
            continue
        candidate = re.sub(r"[|•·,]+", " ", line).strip()
        candidate = re.sub(r"\s{2,}", " ", candidate)
        # FIX: the previous all-letters check rejected legitimate names that
        # contain hyphens, apostrophes, or periods ("Mary-Jane O'Brien Jr.").
        # Strip that punctuation before testing; .isalpha() keeps full
        # Unicode-letter support ("José García").
        letters_only = re.sub(r"[.'\-\u2019]", "", candidate).replace(" ", "")
        if 2 <= len(candidate.split()) <= 5 and letters_only.isalpha():
            name = candidate
            break

    return {
        "name": name,
        "email": email_m.group(0) if email_m else "",
        "phone": phone_m.group(0) if phone_m else "",
    }
|
||||||
|
|
||||||
|
|
||||||
|
# ── Experience ────────────────────────────────────────────────────────────────


def _parse_experience(lines: list[str]) -> list[dict]:
    """Parse work experience entries from section lines.

    Handles two common layouts:
        (A) Title | Company          (B) Title | Company | Dates
            Dates                        • bullet
            • bullet
    """
    bullet_prefix = r"^[•\-–—*◦▪▸►]\s*"
    entries: list[dict] = []
    entry: dict | None = None
    held = ""  # most recent non-date line; candidate title/company header

    for line in lines:
        range_m = _DATE_RANGE_RE.search(line)
        if range_m:
            # A date range starts a new entry; flush the one in progress.
            if entry:
                entries.append(entry)
            # Title/company may share this line (layout B) or be the held
            # previous line (layout A).
            residue = _DATE_RANGE_RE.sub("", line).strip(" –—|-•")
            head = residue if residue.strip() else held
            pieces = re.split(r"\s{2,}|[|•·,–—]\s*", head.strip(), maxsplit=1)
            entry = {
                "title": pieces[0].strip() if pieces else "",
                "company": pieces[1].strip() if len(pieces) > 1 else "",
                "start_date": range_m.group(1),
                "end_date": range_m.group(2),
                "bullets": [],
            }
            held = ""
            continue

        if entry is not None:
            bulleted = bool(re.match(bullet_prefix, line))
            # A pipe-separated, undated, non-bullet line is probably the
            # title/company header of the NEXT entry — hold it instead of
            # recording it as a bullet (prevents header bleed).
            next_header = (not bulleted) and (" | " in line) and not _DATE_RE.search(line)
            if not next_header:
                text = re.sub(bullet_prefix, "", line).strip()
                if text:
                    entry["bullets"].append(text)
        held = line

    if entry:
        entries.append(entry)

    return entries
|
||||||
|
|
||||||
|
|
||||||
|
# ── Education ─────────────────────────────────────────────────────────────────


def _parse_education(lines: list[str]) -> list[dict]:
    """Parse education entries from section lines.

    Handles both orderings:
        (A) Institution                  (B) B.S. Computer Science, 2018
            B.S. Computer Science            State University
    """
    entries: list[dict] = []
    current: dict | None = None
    prev_line = ""

    for line in lines:
        if _DEGREE_RE.search(line):
            if current:
                entries.append(current)
            current = {
                "institution": "",
                "degree": "",
                "field": "",
                "graduation_year": "",
            }
            year_m = re.search(r"\b(19|20)\d{2}\b", line)
            if year_m:
                current["graduation_year"] = year_m.group(0)
            degree_m = _DEGREE_RE.search(line)
            if degree_m:
                current["degree"] = degree_m.group(0).upper()
            # Field of study = whatever remains after removing the degree,
            # any date, and the bare year.
            remainder = _DEGREE_RE.sub("", _DATE_RE.sub("", line))
            remainder = re.sub(r"\b(19|20)\d{2}\b", "", remainder)
            current["field"] = remainder.strip(" ,–—|•.")
            # Layout A: institution was on the line before the degree line
            if prev_line and not _DEGREE_RE.search(prev_line):
                current["institution"] = prev_line.strip(" ,–—|•")
            prev_line = line.strip()
        elif current is not None and not current["institution"]:
            # Layout B: institution follows the degree line
            clean = line.strip(" ,–—|•")
            if clean:
                current["institution"] = clean
            # FIX: clear prev_line once this line is consumed as an
            # institution. Previously it survived into the next iteration
            # and the NEXT entry's layout-A check assigned the same
            # institution to two entries.
            prev_line = ""
        else:
            prev_line = line.strip()

    if current:
        entries.append(current)

    return entries
|
||||||
|
|
||||||
|
|
||||||
|
# ── Skills ────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _parse_skills(lines: list[str]) -> list[str]:
|
||||||
|
skills: list[str] = []
|
||||||
|
for line in lines:
|
||||||
|
# Split on common delimiters
|
||||||
|
for item in re.split(r"[,|•·/]+", line):
|
||||||
|
clean = item.strip(" -–—*◦▪▸►()")
|
||||||
|
if 1 < len(clean) <= 50:
|
||||||
|
skills.append(clean)
|
||||||
|
return skills
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main parser ───────────────────────────────────────────────────────────────


def parse_resume(raw_text: str) -> tuple[dict, str]:
    """Parse resume text into a structured dict using section detection + regex.

    Returns (result_dict, error_message). result_dict is empty on failure.
    """
    if not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."

    try:
        sections = _split_sections(raw_text)
        # Contact block first, then one key per parsed section.
        parsed = dict(_parse_header(sections.get("header", [])))
        parsed["career_summary"] = " ".join(sections.get("summary", []))
        parsed["experience"] = _parse_experience(sections.get("experience", []))
        parsed["education"] = _parse_education(sections.get("education", []))
        parsed["skills"] = _parse_skills(sections.get("skills", []))
        parsed["achievements"] = sections.get("achievements", [])
        return parsed, ""
    except Exception as e:
        import traceback

        log.error("[resume_parser] parse_resume error:\n%s", traceback.format_exc())
        return {}, str(e)
|
||||||
|
|
||||||
|
|
||||||
|
# ── LLM enhancement (career summary only, optional) ──────────────────────────
|
||||||
|
|
||||||
|
def _llm_career_summary(raw_text: str) -> str:
|
||||||
|
"""Use LLM to generate a career summary. Returns empty string on any failure."""
|
||||||
|
try:
|
||||||
|
from scripts.llm_router import LLMRouter
|
||||||
|
prompt = (
|
||||||
|
"Write a 2-3 sentence professional career summary for this candidate "
|
||||||
|
"based on their resume. Return only the summary text, no labels.\n\n"
|
||||||
|
f"Resume:\n{raw_text[:1500]}"
|
||||||
|
)
|
||||||
|
return LLMRouter().complete(prompt)
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
# ── Public entry point ────────────────────────────────────────────────────────


def structure_resume(raw_text: str) -> tuple[dict, str]:
    """Parse resume and optionally enhance career_summary via LLM.

    Returns (result_dict, error_message).
    """
    parsed, error = parse_resume(raw_text)
    if not parsed:
        return parsed, error

    # Enhance career summary via LLM if the section wasn't found in the document
    if not parsed.get("career_summary"):
        try:
            generated = _llm_career_summary(raw_text)
        except Exception:
            generated = ""
        if generated:
            parsed["career_summary"] = generated.strip()

    return parsed, ""
|
||||||
|
|
|
||||||
|
|
@ -41,51 +41,62 @@ def test_extract_docx_returns_string():
|
||||||
assert "Senior Developer" in result
|
assert "Senior Developer" in result
|
||||||
|
|
||||||
|
|
||||||
def test_structure_resume_returns_dict():
|
def test_structure_resume_returns_tuple_with_keys():
    """structure_resume returns (dict, str) tuple with expected keys from plain text."""
    sample = (
        "Jane Doe\njane@example.com\n\n"
        "Experience\nSoftware Engineer | Acme Corp\nJan 2020 - Dec 2023\n• Built things\n\n"
        "Skills\nPython, SQL"
    )
    from scripts.resume_parser import structure_resume

    parsed, error = structure_resume(sample)

    assert error == ""
    assert isinstance(parsed, dict)
    assert "experience" in parsed
    assert isinstance(parsed["experience"], list)
    assert parsed["name"] == "Jane Doe"
    assert parsed["email"] == "jane@example.com"
|
||||||
|
|
||||||
|
|
||||||
def test_structure_resume_strips_markdown_fences():
|
def test_structure_resume_empty_text_returns_error():
    """structure_resume returns empty dict + error message for empty input."""
    from scripts.resume_parser import structure_resume

    parsed, error = structure_resume("   ")

    assert parsed == {}
    assert error != ""
|
||||||
|
|
||||||
|
|
||||||
def test_structure_resume_llm_exception_returns_empty():
|
def test_parse_resume_contact_extraction():
    """parse_resume correctly extracts name, email, and phone from header block."""
    sample = (
        "Alice Smith\nalice.smith@email.com | (206) 555-9999\n\n"
        "Skills\nLeadership, Communication"
    )
    from scripts.resume_parser import parse_resume

    parsed, error = parse_resume(sample)

    assert error == ""
    assert parsed["name"] == "Alice Smith"
    assert parsed["email"] == "alice.smith@email.com"
    assert "555-9999" in parsed["phone"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_structure_resume_llm_failure_still_returns_result():
    """structure_resume returns usable result even when LLM career summary fails."""
    sample = (
        "Bob Jones\nbob@test.com\n\n"
        "Skills\nProject Management, Agile"
    )
    with patch("scripts.resume_parser._llm_career_summary", side_effect=Exception("LLM down")):
        from scripts.resume_parser import structure_resume

        parsed, error = structure_resume(sample)

    # Regex parse should still succeed even if LLM summary enhancement fails
    assert error == ""
    assert parsed["name"] == "Bob Jones"
    assert "Project Management" in parsed["skills"]
|
||||||
|
|
||||||
|
|
||||||
def test_extract_pdf_empty_page_returns_string():
|
def test_extract_pdf_empty_page_returns_string():
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue