peregrine/scripts/resume_parser.py
pyr0ball ab6d7f2c87 refactor: replace LLM-based resume parser with section regex parser
Primary parse path is now fully deterministic — no LLM, no token limits,
no JSON generation. Handles two-column experience headers, institution-before-
or-after-degree education layouts, and header bleed prevention via
looks_like_header detection.

LLM path retained as optional career_summary enhancement only (1500 chars,
falls back silently). structure_resume() now returns tuple[dict, str].
Tests updated to match the new API.
2026-02-26 07:34:25 -08:00

306 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Resume parser — extract text from PDF/DOCX and structure via section parsing.
Primary path: regex + section detection (no LLM, no token limits).
Optional enhancement: LLM-generated career_summary if a capable backend is configured.
Falls back to empty dict on unrecoverable errors — caller shows the form builder.
"""
from __future__ import annotations
import io
import json
import logging
import re
from pathlib import Path
import pdfplumber
from docx import Document
log = logging.getLogger(__name__)
# ── Section header detection ──────────────────────────────────────────────────
# Canonical section key -> pattern recognising that section's heading line.
# Each pattern is case-insensitive and anchored only at the START of the line,
# so a heading merely has to *begin* with one of the listed phrases
# (e.g. "Skills", "Skills & Tools", "Summary of Qualifications").
# _split_sections() walks this dict in insertion order and stops at the first
# pattern that matches, so the order of the entries is significant.
_SECTION_NAMES = {
"summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I),
"experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I),
"education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I),
"skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I),
"achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I),
}
# Degrees — used to detect education lines.
# Matches, case-insensitively and as whole words, the abbreviations BS, BA, MS,
# MBA and PhD (each letter optionally followed by a period) and the words
# bachelor / master / associate / doctorate / diploma.
# Because of the trailing \b, forms with extra letters such as "Masters" or
# "Bachelors" do NOT match, while the possessive "Master's" does (the
# apostrophe ends the word).
_DEGREE_RE = re.compile(
r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?|bachelor|master|associate|doctorate|diploma)\b",
re.I,
)
# Single-date pattern: "Jan 2020", "January 2020", "2020", "01/2020".
# It matches ONE date at a time; a range such as "2019 - 2022" yields two
# separate matches here (ranges are recognised by _DATE_RANGE_RE).
# The month name is optional, so any standalone four-digit number matches.
# Used by _parse_experience() to decide whether a line can be a title/company
# header, and by _parse_education() to remove dates from the field text.
_DATE_RE = re.compile(
r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|"
r"july|august|september|october|november|december)?\s*\d{4}\b"
r"|\b\d{1,2}/\d{4}\b",
re.I,
)
# Date range such as "Jan 2020 - Present", "01/2019 – 03/2021" or "2018 to 2020".
# Group 1 is the start date and group 2 the end date (a date, or one of
# present / current / now). Matching is case-insensitive.
_DATE_RANGE_RE = re.compile(
    r"("
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
    r"|\d{1,2}/\d{4}"
    r"|\d{4}"
    r")"
    # Separator: a run of hyphens / en dashes (U+2013) / em dashes (U+2014),
    # or the literal word "to". Written as an alternation rather than a single
    # character class so that stray "t"/"o" letters are not accepted, and kept
    # non-capturing so the start/end group numbers stay 1 and 2.
    r"\s*(?:[-\u2013\u2014]+|to)\s*"
    r"("
    r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}"
    r"|\d{1,2}/\d{4}"
    r"|\d{4}"
    r"|present|current|now"
    r")",
    re.I,
)
# Contact info
# _EMAIL_RE: local part of word chars / "." / "+" / "-", an "@", then a domain
# containing at least one dot.
# _PHONE_RE: optional "+1" prefix, then 3-3-4 digit groups; the area code may be
# wrapped in parentheses and groups may be separated by a space, dot or hyphen.
# _LINKEDIN_RE: "linkedin.com/in/<handle>" (case-insensitive).
# NOTE(review): _LINKEDIN_RE is not referenced anywhere else in the visible
# portion of this file — confirm whether a caller elsewhere relies on it.
_EMAIL_RE = re.compile(r"[\w.+\-]+@[\w\-]+\.[\w.\-]+")
_PHONE_RE = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}")
_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I)
# ── Text extraction ───────────────────────────────────────────────────────────
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file_bytes: The raw bytes of the PDF document.

    Returns:
        One text segment per page, separated by ``"\\n"``. A page for which
        pdfplumber yields no text contributes an empty segment.
    """
    buffer = io.BytesIO(file_bytes)
    page_texts: list[str] = []
    # Extraction must happen while the document is still open.
    with pdfplumber.open(buffer) as document:
        for pdf_page in document.pages:
            extracted = pdf_page.extract_text()
            page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Args:
        file_bytes: The raw bytes of the DOCX document.

    Returns:
        The text of each paragraph that is not blank, joined with ``"\\n"``.
        Only ``Document.paragraphs`` is read.
    """
    document = Document(io.BytesIO(file_bytes))
    kept: list[str] = []
    for paragraph in document.paragraphs:
        # Skip paragraphs that are empty or whitespace-only.
        if paragraph.text.strip():
            kept.append(paragraph.text)
    return "\n".join(kept)
# ── Section splitter ──────────────────────────────────────────────────────────
def _split_sections(text: str) -> dict[str, list[str]]:
    """Split resume text into named sections.

    Every non-blank line is stripped and filed under the most recently seen
    section heading. Lines that appear before any heading go into
    ``"header"`` (assumed to be the name/contact block).

    A line counts as a heading when it has at most five whitespace-separated
    words and begins with one of the ``_SECTION_NAMES`` patterns (first
    matching pattern wins). Text that follows a colon on a heading line,
    as in ``"Skills: Python, SQL"``, is kept as a content line of that
    section instead of being discarded together with the heading.

    Args:
        text: Raw resume text.

    Returns:
        Mapping of section key to its stripped, non-blank lines in document
        order. ``"header"`` is always present; any other key appears only
        once it has at least one content line.
    """
    sections: dict[str, list[str]] = {"header": []}
    current = "header"
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue

        heading = None
        # Longer lines are treated as content even when they start with a
        # heading word (e.g. a sentence beginning "Experience with ...").
        if len(stripped.split()) <= 5:
            heading = next(
                (
                    name
                    for name, pattern in _SECTION_NAMES.items()
                    if pattern.match(stripped)
                ),
                None,
            )

        if heading is None:
            sections.setdefault(current, []).append(stripped)
            continue

        current = heading
        # Preserve inline content written on the heading line itself.
        _, _, inline = stripped.partition(":")
        inline = inline.strip()
        if inline:
            sections.setdefault(current, []).append(inline)
    return sections
# ── Contact info ──────────────────────────────────────────────────────────────
def _is_name_token(token: str) -> bool:
    """Return True if *token* looks like one word of a personal name.

    Accepts purely alphabetic tokens as well as alphabetic pieces joined by a
    hyphen, apostrophe (straight or typographic) or period, with an optional
    trailing period: ``Jean-Luc``, ``O'Neil``, ``Q.``, ``J.R.R.``.
    Punctuation-only tokens such as ``-`` are rejected.
    """
    core = token.rstrip(".")
    pieces = re.split(r"[-'\u2019.]", core)
    return all(piece.isalpha() for piece in pieces)


def _parse_header(lines: list[str]) -> dict:
    """Extract name, email and phone from the top-of-resume block.

    Email and phone are the first matches of ``_EMAIL_RE`` / ``_PHONE_RE``
    anywhere in *lines*. The name is chosen heuristically from the first five
    lines: the first line that contains no ``@``, does not start with a digit,
    does not look like "ST 12345" (state + zip), and consists of two to five
    name-like words once ``|``, bullets and commas are replaced by spaces.
    Hyphenated names, apostrophes and initials are accepted.

    Args:
        lines: Stripped lines of the header section.

    Returns:
        Dict with ``"name"``, ``"email"`` and ``"phone"``; each value is an
        empty string when nothing suitable was found.
    """
    full_text = "\n".join(lines)
    email_m = _EMAIL_RE.search(full_text)
    phone_m = _PHONE_RE.search(full_text)

    name = ""
    for line in lines[:5]:
        if "@" in line or re.match(r"^\d", line.strip()):
            continue
        # Skip lines that look like city/state/zip
        if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line):
            continue
        candidate = re.sub(r"[|•·,]+", " ", line).strip()
        candidate = re.sub(r"\s{2,}", " ", candidate)
        tokens = candidate.split()
        # Validate word by word so "Mary-Jane O'Neil" or "John Q. Public"
        # are not rejected just for containing punctuation.
        if 2 <= len(tokens) <= 5 and all(_is_name_token(t) for t in tokens):
            name = candidate
            break

    return {
        "name": name,
        "email": email_m.group(0) if email_m else "",
        "phone": phone_m.group(0) if phone_m else "",
    }
# ── Experience ────────────────────────────────────────────────────────────────
def _parse_experience(lines: list[str]) -> list[dict]:
    """Turn the lines of the experience section into a list of job entries.

    A line containing a date range opens a new entry. Its title/company text
    is taken from the same line when anything is left after removing the
    dates ("Title | Company | Dates"), otherwise from the line held over from
    just before it ("Title | Company" on one line, dates on the next).

    Lines that follow an entry are stored as its bullets, except an
    un-bulleted, date-free line containing " | ": that is held back as the
    probable heading of the next entry rather than recorded as a bullet.

    Returns:
        One dict per entry with keys ``title``, ``company``, ``start_date``,
        ``end_date`` and ``bullets``.
    """
    bullet_prefix = re.compile(r"^[•\-–—*◦▪▸►]\s*")
    jobs: list[dict] = []
    active: dict | None = None
    pending = ""

    for raw in lines:
        span = _DATE_RANGE_RE.search(raw)

        if span is None:
            if active is not None:
                starts_with_bullet = bullet_prefix.match(raw) is not None
                # Anything that is not a "Title | Company"-style heading is
                # recorded as a bullet of the entry being built.
                if starts_with_bullet or " | " not in raw or _DATE_RE.search(raw):
                    text = bullet_prefix.sub("", raw).strip()
                    if text:
                        active["bullets"].append(text)
            # Every non-date line becomes the candidate heading for a date
            # line that might follow.
            pending = raw
            continue

        # A date range closes the entry in progress and starts a new one.
        if active:
            jobs.append(active)

        leftover = _DATE_RANGE_RE.sub("", raw).strip(" –—|-•")
        heading = leftover if leftover.strip() else pending
        pieces = re.split(r"\s{2,}|[|•·,–—]\s*", heading.strip(), maxsplit=1)
        title = pieces[0].strip() if pieces else ""
        company = pieces[1].strip() if len(pieces) > 1 else ""
        active = {
            "title": title,
            "company": company,
            "start_date": span.group(1),
            "end_date": span.group(2),
            "bullets": [],
        }
        pending = ""

    if active:
        jobs.append(active)
    return jobs
# ── Education ─────────────────────────────────────────────────────────────────
def _parse_education(lines: list[str]) -> list[dict]:
    """Parse education entries from the lines of the education section.

    Every line containing a degree keyword (``_DEGREE_RE``) starts a new
    entry. Two layouts are supported:

    * Layout A — the institution is on the line *before* the degree line.
    * Layout B — the institution is on the line *after* the degree line.

    A line consumed as the Layout-B institution of one entry is never reused
    as the Layout-A institution of the next entry; otherwise, in a
    degree/institution/degree/institution listing, every entry after the
    first would inherit the previous entry's institution.

    Args:
        lines: Stripped lines of the education section.

    Returns:
        One dict per entry with keys ``institution``, ``degree`` (upper-cased
        keyword as matched), ``field`` (the degree line with degree keywords
        and dates removed) and ``graduation_year`` (first 19xx/20xx year on
        the degree line, else ``""``).
    """
    entries: list[dict] = []
    current: dict | None = None
    prev_line = ""
    for line in lines:
        degree_m = _DEGREE_RE.search(line)
        if degree_m:
            if current:
                entries.append(current)
            year_m = re.search(r"\b(19|20)\d{2}\b", line)
            # Whatever remains after removing dates and degree keywords is
            # treated as the field of study.
            remainder = _DEGREE_RE.sub("", _DATE_RE.sub("", line))
            remainder = re.sub(r"\b(19|20)\d{2}\b", "", remainder)
            current = {
                "institution": "",
                "degree": degree_m.group(0).upper(),
                "field": remainder.strip(" ,–—|•."),
                "graduation_year": year_m.group(0) if year_m else "",
            }
            # Layout A: institution was on the line before the degree line
            if prev_line and not _DEGREE_RE.search(prev_line):
                current["institution"] = prev_line.strip(" ,–—|•")
        elif current is not None and not current["institution"]:
            # Layout B: institution follows the degree line
            clean = line.strip(" ,–—|•")
            if clean:
                current["institution"] = clean
                # This line now belongs to the current entry; clear prev_line
                # so the next degree line does not claim it as well.
                prev_line = ""
                continue
        prev_line = line.strip()
    if current:
        entries.append(current)
    return entries
# ── Skills ────────────────────────────────────────────────────────────────────
def _parse_skills(lines: list[str]) -> list[str]:
    """Flatten the lines of the skills section into a list of skill names.

    Each line is split on commas, pipes, bullets, middle dots and slashes;
    surrounding spaces, dashes, bullet glyphs and parentheses are trimmed
    from every piece.

    A piece is kept when it is at most 50 characters long and is either
    longer than one character or a single alphabetic character (so the
    languages "C" and "R" survive while stray symbols such as "&" are
    dropped). Repeated skills are reported once, compared case-insensitively,
    keeping the spelling and position of the first occurrence.

    Args:
        lines: Stripped lines of the skills section.

    Returns:
        Skill names in order of first appearance.
    """
    skills: list[str] = []
    seen: set[str] = set()
    for line in lines:
        # Split on common delimiters
        for item in re.split(r"[,|•·/]+", line):
            clean = item.strip(" -–—*◦▪▸►()")
            if not clean or len(clean) > 50:
                continue
            if len(clean) == 1 and not clean.isalpha():
                continue
            key = clean.casefold()
            if key in seen:
                continue
            seen.add(key)
            skills.append(clean)
    return skills
# ── Main parser ───────────────────────────────────────────────────────────────
def parse_resume(raw_text: str) -> tuple[dict, str]:
    """Parse resume text into a structured dict using section detection + regex.

    Args:
        raw_text: Text extracted from the resume file.

    Returns:
        ``(result_dict, error_message)``. On success ``error_message`` is
        ``""`` and ``result_dict`` holds the contact fields plus
        ``career_summary``, ``experience``, ``education``, ``skills`` and
        ``achievements``. On failure ``result_dict`` is empty and
        ``error_message`` is never empty: it is a fixed message for
        missing/blank input, otherwise the exception text (or the exception
        class name when the exception carries no text).
    """
    # Also covers None, so a failed extraction upstream cannot raise here.
    if not raw_text or not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."
    try:
        sections = _split_sections(raw_text)
        contact = _parse_header(sections.get("header", []))
        result = {
            **contact,
            "career_summary": " ".join(sections.get("summary", [])),
            "experience": _parse_experience(sections.get("experience", [])),
            "education": _parse_education(sections.get("education", [])),
            "skills": _parse_skills(sections.get("skills", [])),
            "achievements": sections.get("achievements", []),
        }
    except Exception as e:
        # log.exception records the active traceback at ERROR level.
        log.exception("[resume_parser] parse_resume error")
        # Some exceptions stringify to ""; fall back to the class name so the
        # caller always gets a non-empty reason alongside the empty result.
        return {}, str(e) or type(e).__name__
    return result, ""
# ── LLM enhancement (career summary only, optional) ──────────────────────────
def _llm_career_summary(raw_text: str) -> str:
    """Use the LLM router to generate a career summary.

    Only the first 1500 characters of *raw_text* are sent. This is a
    best-effort enhancement: any failure (router module not importable,
    backend error, ...) is logged at DEBUG level and results in ``""``.

    Args:
        raw_text: Text extracted from the resume file.

    Returns:
        The text returned by the router, or ``""`` on any failure or when the
        router returns something that is not a string.
    """
    try:
        # Imported lazily so the parser works without the router installed.
        from scripts.llm_router import LLMRouter

        prompt = (
            "Write a 2-3 sentence professional career summary for this candidate "
            "based on their resume. Return only the summary text, no labels.\n\n"
            f"Resume:\n{raw_text[:1500]}"
        )
        summary = LLMRouter().complete(prompt)
    except Exception:
        # Stay quiet by default, but leave a trace for anyone debugging why
        # no summary was generated.
        logging.getLogger(__name__).debug(
            "[resume_parser] LLM career summary unavailable", exc_info=True
        )
        return ""
    # Honour the declared return type: callers call .strip() on the result.
    return summary if isinstance(summary, str) else ""
# ── Public entry point ────────────────────────────────────────────────────────
def structure_resume(raw_text: str) -> tuple[dict, str]:
    """Public entry point: parse a resume, filling in a missing summary via LLM.

    The deterministic parser runs first. If it produces nothing, its result
    and error message are returned unchanged. Otherwise, when the parsed
    result has no ``career_summary``, an LLM-generated one is requested and,
    if non-empty, stored (stripped) in the result.

    Returns:
        ``(result_dict, error_message)``; the message is ``""`` whenever the
        parser produced a result.
    """
    parsed, error = parse_resume(raw_text)
    if not parsed:
        return parsed, error

    # A summary found in the document always wins over a generated one.
    if parsed.get("career_summary"):
        return parsed, ""

    try:
        generated = _llm_career_summary(raw_text)
    except Exception:
        generated = ""
    if generated:
        parsed["career_summary"] = generated.strip()
    return parsed, ""