From b9f5dd1fc3eda229f562f18af0bfe5818ac29910 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Thu, 26 Feb 2026 07:34:25 -0800 Subject: [PATCH] refactor: replace LLM-based resume parser with section regex parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Primary parse path is now fully deterministic — no LLM, no token limits, no JSON generation. Handles two-column experience headers, institution-before- or-after-degree education layouts, and header bleed prevention via looks_like_header detection. LLM path retained as optional career_summary enhancement only (1500 chars, falls back silently). structure_resume() now returns tuple[dict, str]. Tests updated to match the new API. --- scripts/resume_parser.py | 318 ++++++++++++++++++++++++++++++------ tests/test_resume_parser.py | 75 +++++---- 2 files changed, 312 insertions(+), 81 deletions(-) diff --git a/scripts/resume_parser.py b/scripts/resume_parser.py index 53cd0a6..6644779 100644 --- a/scripts/resume_parser.py +++ b/scripts/resume_parser.py @@ -1,86 +1,306 @@ """ -Resume parser — extract text from PDF/DOCX and structure via LLM. +Resume parser — extract text from PDF/DOCX and structure via section parsing. -Fast path: file bytes → raw text → LLM structures into resume dict. -Result dict keys mirror plain_text_resume.yaml sections. +Primary path: regex + section detection (no LLM, no token limits). +Optional enhancement: LLM-generated career_summary if a capable backend is configured. -Falls back to empty dict on any LLM/parsing error — caller should -then show the guided form builder. +Falls back to empty dict on unrecoverable errors — caller shows the form builder. """ from __future__ import annotations + import io import json import logging import re +from pathlib import Path import pdfplumber from docx import Document log = logging.getLogger(__name__) +# ── Section header detection ────────────────────────────────────────────────── + +_SECTION_NAMES = { + "summary": re.compile(r"^(summary|objective|profile|about me|professional summary)", re.I), + "experience": re.compile(r"^(experience|work experience|employment|work history|professional experience)", re.I), + "education": re.compile(r"^(education|academic|qualifications|degrees?)", re.I), + "skills": re.compile(r"^(skills?|technical skills?|core competencies|competencies|expertise)", re.I), + "achievements": re.compile(r"^(achievements?|accomplishments?|awards?|honors?|certifications?)", re.I), +} + +# Degrees — used to detect education lines +_DEGREE_RE = re.compile( + r"\b(b\.?s\.?|b\.?a\.?|m\.?s\.?|m\.?b\.?a\.?|ph\.?d\.?|bachelor|master|associate|doctorate|diploma)\b", + re.I, +) + +# Date patterns for experience entries: "Jan 2020", "2020", "01/2020", "2019 - 2022" +_DATE_RE = re.compile( + r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|june|" + r"july|august|september|october|november|december)?\s*\d{4}\b" + r"|\b\d{1,2}/\d{4}\b", + re.I, +) +_DATE_RANGE_RE = re.compile( + r"(" + r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}" + r"|\d{1,2}/\d{4}" + r"|\d{4}" + r")" + r"\s*[-–—to]+\s*" + r"(" + r"(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\w*\.?\s+\d{4}" + r"|\d{1,2}/\d{4}" + r"|\d{4}" + r"|present|current|now" + r")", + re.I, +) + +# Contact info +_EMAIL_RE = re.compile(r"[\w.+\-]+@[\w\-]+\.[\w.\-]+") +_PHONE_RE = re.compile(r"(?:\+1[\s.\-]?)?\(?\d{3}\)?[\s.\-]?\d{3}[\s.\-]?\d{4}") +_LINKEDIN_RE = re.compile(r"linkedin\.com/in/[\w\-]+", re.I) + + +# ── Text extraction ─────────────────────────────────────────────────────────── def extract_text_from_pdf(file_bytes: bytes) -> str: - """Extract raw text from PDF bytes using pdfplumber. - - Returns empty string if extraction fails for any page. - """ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf: pages = [page.extract_text() or "" for page in pdf.pages] return "\n".join(pages) def extract_text_from_docx(file_bytes: bytes) -> str: - """Extract raw text from DOCX bytes using python-docx.""" doc = Document(io.BytesIO(file_bytes)) return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) -def _llm_structure(raw_text: str) -> str: - """Call LLM to convert raw resume text to JSON. Returns raw LLM output string.""" - from scripts.llm_router import LLMRouter - prompt = ( - "You are a resume parser. Convert the following resume text into a JSON object.\n\n" - "Required JSON keys:\n" - "- name (string)\n" - "- email (string, may be empty)\n" - "- phone (string, may be empty)\n" - "- career_summary (string: 2-4 sentence professional summary)\n" - "- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n" - "- education (list of objects with: institution, degree, field, graduation_year)\n" - "- skills (list of strings)\n" - "- achievements (list of strings, may be empty)\n\n" - "Return ONLY valid JSON. No markdown, no explanation.\n\n" - f"Resume text:\n{raw_text[:4000]}" - ) - router = LLMRouter() - return router.complete(prompt, max_tokens=2048) +# ── Section splitter ────────────────────────────────────────────────────────── + +def _split_sections(text: str) -> dict[str, list[str]]: + """Split resume text into named sections. Lines that don't match a known + section header go into 'header' (assumed to be contact/name block).""" + sections: dict[str, list[str]] = {"header": []} + current = "header" + for line in text.splitlines(): + stripped = line.strip() + if not stripped: + continue + matched = False + for section, pattern in _SECTION_NAMES.items(): + # Match if the line IS a section header (short + matches pattern) + if pattern.match(stripped) and len(stripped.split()) <= 5: + current = section + matched = True + break + if not matched: + sections.setdefault(current, []).append(stripped) + return sections -def structure_resume(raw_text: str) -> tuple[dict, str]: - """Convert raw resume text to a structured dict via LLM. +# ── Contact info ────────────────────────────────────────────────────────────── + +def _parse_header(lines: list[str]) -> dict: + """Extract name, email, phone from the top-of-resume block.""" + full_text = "\n".join(lines) + email_m = _EMAIL_RE.search(full_text) + phone_m = _PHONE_RE.search(full_text) + + # Name heuristic: first non-empty line that has no @ and no digits-only tokens + name = "" + for line in lines[:5]: + if "@" in line or re.match(r"^\d", line.strip()): + continue + # Skip lines that look like city/state/zip + if re.search(r"\b[A-Z]{2}\b\s*\d{5}", line): + continue + candidate = re.sub(r"[|•·,]+", " ", line).strip() + candidate = re.sub(r"\s{2,}", " ", candidate) + if 2 <= len(candidate.split()) <= 5 and candidate.replace(" ", "").isalpha(): + name = candidate + break + + return { + "name": name, + "email": email_m.group(0) if email_m else "", + "phone": phone_m.group(0) if phone_m else "", + } + + +# ── Experience ──────────────────────────────────────────────────────────────── + +def _parse_experience(lines: list[str]) -> list[dict]: + """Parse work experience entries from section lines. + + Handles two common layouts: + (A) Title | Company (B) Title | Company | Dates + Dates • bullet + • bullet + """ + entries: list[dict] = [] + current: dict | None = None + prev_line = "" + + for line in lines: + date_match = _DATE_RANGE_RE.search(line) + if date_match: + if current: + entries.append(current) + # Title/company may be on this line (layout B) or the previous line (layout A) + same_line = _DATE_RANGE_RE.sub("", line).strip(" –—|-•") + header = same_line if same_line.strip() else prev_line + parts = re.split(r"\s{2,}|[|•·,–—]\s*", header.strip(), maxsplit=1) + current = { + "title": parts[0].strip() if parts else "", + "company": parts[1].strip() if len(parts) > 1 else "", + "start_date": date_match.group(1), + "end_date": date_match.group(2), + "bullets": [], + } + prev_line = "" + elif current is not None: + is_bullet = bool(re.match(r"^[•\-–—*◦▪▸►]\s*", line)) + looks_like_header = ( + not is_bullet + and " | " in line + and not _DATE_RE.search(line) + ) + if looks_like_header: + # Likely the title/company of the next entry — hold it as prev_line + prev_line = line + else: + clean = re.sub(r"^[•\-–—*◦▪▸►]\s*", "", line).strip() + if clean: + current["bullets"].append(clean) + prev_line = line + else: + prev_line = line + + if current: + entries.append(current) + + return entries + + +# ── Education ───────────────────────────────────────────────────────────────── + +def _parse_education(lines: list[str]) -> list[dict]: + entries: list[dict] = [] + current: dict | None = None + prev_line = "" + + for line in lines: + if _DEGREE_RE.search(line): + if current: + entries.append(current) + current = { + "institution": "", + "degree": "", + "field": "", + "graduation_year": "", + } + year_m = re.search(r"\b(19|20)\d{2}\b", line) + if year_m: + current["graduation_year"] = year_m.group(0) + degree_m = _DEGREE_RE.search(line) + if degree_m: + current["degree"] = degree_m.group(0).upper() + remainder = _DEGREE_RE.sub("", _DATE_RE.sub("", line)) + remainder = re.sub(r"\b(19|20)\d{2}\b", "", remainder) + current["field"] = remainder.strip(" ,–—|•.") + # Layout A: institution was on the line before the degree line + if prev_line and not _DEGREE_RE.search(prev_line): + current["institution"] = prev_line.strip(" ,–—|•") + elif current is not None and not current["institution"]: + # Layout B: institution follows the degree line + clean = line.strip(" ,–—|•") + if clean: + current["institution"] = clean + prev_line = line.strip() + + if current: + entries.append(current) + + return entries + + +# ── Skills ──────────────────────────────────────────────────────────────────── + +def _parse_skills(lines: list[str]) -> list[str]: + skills: list[str] = [] + for line in lines: + # Split on common delimiters + for item in re.split(r"[,|•·/]+", line): + clean = item.strip(" -–—*◦▪▸►()") + if 1 < len(clean) <= 50: + skills.append(clean) + return skills + + +# ── Main parser ─────────────────────────────────────────────────────────────── + +def parse_resume(raw_text: str) -> tuple[dict, str]: + """Parse resume text into a structured dict using section detection + regex. Returns (result_dict, error_message). result_dict is empty on failure. """ - import traceback if not raw_text.strip(): return {}, "Text extraction returned empty — the file may be image-based or unreadable." - raw = "" + try: - raw = _llm_structure(raw_text) - cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip()) - cleaned = re.sub(r"\s*```$", "", cleaned) - try: - return json.loads(cleaned), "" - except json.JSONDecodeError: - # Try json-repair before giving up — handles truncation and minor malformations - from json_repair import repair_json - repaired = repair_json(cleaned) - result = json.loads(repaired) - log.warning("[resume_parser] Used json-repair to recover malformed output") - return result, "" - except json.JSONDecodeError as e: - log.error("[resume_parser] JSON parse error (even after repair): %s\nRaw output:\n%s", e, raw[:500]) - return {}, f"LLM returned invalid JSON: {e}" + sections = _split_sections(raw_text) + contact = _parse_header(sections.get("header", [])) + result = { + **contact, + "career_summary": " ".join(sections.get("summary", [])), + "experience": _parse_experience(sections.get("experience", [])), + "education": _parse_education(sections.get("education", [])), + "skills": _parse_skills(sections.get("skills", [])), + "achievements": sections.get("achievements", []), + } + return result, "" except Exception as e: - log.error("[resume_parser] Error:\n%s", traceback.format_exc()) + import traceback + log.error("[resume_parser] parse_resume error:\n%s", traceback.format_exc()) return {}, str(e) + + +# ── LLM enhancement (career summary only, optional) ────────────────────────── + +def _llm_career_summary(raw_text: str) -> str: + """Use LLM to generate a career summary. Returns empty string on any failure.""" + try: + from scripts.llm_router import LLMRouter + prompt = ( + "Write a 2-3 sentence professional career summary for this candidate " + "based on their resume. Return only the summary text, no labels.\n\n" + f"Resume:\n{raw_text[:1500]}" + ) + return LLMRouter().complete(prompt) + except Exception: + return "" + + +# ── Public entry point ──────────────────────────────────────────────────────── + +def structure_resume(raw_text: str) -> tuple[dict, str]: + """Parse resume and optionally enhance career_summary via LLM. + + Returns (result_dict, error_message). + """ + result, err = parse_resume(raw_text) + if not result: + return result, err + + # Enhance career summary via LLM if the section wasn't found in the document + if not result.get("career_summary"): + try: + summary = _llm_career_summary(raw_text) + except Exception: + summary = "" + if summary: + result["career_summary"] = summary.strip() + + return result, "" diff --git a/tests/test_resume_parser.py b/tests/test_resume_parser.py index a0e363c..43e4ec5 100644 --- a/tests/test_resume_parser.py +++ b/tests/test_resume_parser.py @@ -41,51 +41,62 @@ def test_extract_docx_returns_string(): assert "Senior Developer" in result -def test_structure_resume_returns_dict(): - """structure_resume returns a dict with expected keys when LLM returns valid JSON.""" - raw_text = "Jane Doe\nSoftware Engineer at Acme 2020-2023" - llm_response = '{"name": "Jane Doe", "experience": [{"company": "Acme", "title": "Engineer", "bullets": []}], "skills": [], "education": []}' - - with patch("scripts.resume_parser._llm_structure", return_value=llm_response): - from scripts.resume_parser import structure_resume - result = structure_resume(raw_text) +def test_structure_resume_returns_tuple_with_keys(): + """structure_resume returns (dict, str) tuple with expected keys from plain text.""" + raw_text = ( + "Jane Doe\njane@example.com\n\n" + "Experience\nSoftware Engineer | Acme Corp\nJan 2020 - Dec 2023\n• Built things\n\n" + "Skills\nPython, SQL" + ) + from scripts.resume_parser import structure_resume + result, err = structure_resume(raw_text) + assert err == "" assert isinstance(result, dict) assert "experience" in result assert isinstance(result["experience"], list) assert result["name"] == "Jane Doe" + assert result["email"] == "jane@example.com" -def test_structure_resume_strips_markdown_fences(): - """structure_resume handles LLM output wrapped in ```json ... ``` fences.""" - raw_text = "Some resume" - llm_response = '```json\n{"name": "Bob", "experience": []}\n```' +def test_structure_resume_empty_text_returns_error(): + """structure_resume returns empty dict + error message for empty input.""" + from scripts.resume_parser import structure_resume + result, err = structure_resume(" ") - with patch("scripts.resume_parser._llm_structure", return_value=llm_response): - from scripts.resume_parser import structure_resume - result = structure_resume(raw_text) - - assert result.get("name") == "Bob" - - -def test_structure_resume_invalid_json_returns_empty(): - """structure_resume returns {} on invalid JSON instead of crashing.""" - with patch("scripts.resume_parser._llm_structure", return_value="not json at all"): - from scripts.resume_parser import structure_resume - result = structure_resume("some text") - - assert isinstance(result, dict) assert result == {} + assert err != "" -def test_structure_resume_llm_exception_returns_empty(): - """structure_resume returns {} when LLM raises an exception.""" - with patch("scripts.resume_parser._llm_structure", side_effect=Exception("LLM down")): +def test_parse_resume_contact_extraction(): + """parse_resume correctly extracts name, email, and phone from header block.""" + raw_text = ( + "Alice Smith\nalice.smith@email.com | (206) 555-9999\n\n" + "Skills\nLeadership, Communication" + ) + from scripts.resume_parser import parse_resume + result, err = parse_resume(raw_text) + + assert err == "" + assert result["name"] == "Alice Smith" + assert result["email"] == "alice.smith@email.com" + assert "555-9999" in result["phone"] + + +def test_structure_resume_llm_failure_still_returns_result(): + """structure_resume returns usable result even when LLM career summary fails.""" + raw_text = ( + "Bob Jones\nbob@test.com\n\n" + "Skills\nProject Management, Agile" + ) + with patch("scripts.resume_parser._llm_career_summary", side_effect=Exception("LLM down")): from scripts.resume_parser import structure_resume - result = structure_resume("some text") + result, err = structure_resume(raw_text) - assert isinstance(result, dict) - assert result == {} + # Regex parse should still succeed even if LLM summary enhancement fails + assert err == "" + assert result["name"] == "Bob Jones" + assert "Project Management" in result["skills"] def test_extract_pdf_empty_page_returns_string():