feat: resume parser — PDF/DOCX extraction + LLM structuring

2026-02-25 08:04:48 -08:00 · 2026-02-25 08:04:48 -08:00 · 9258a91fd6
commit 9258a91fd6
parent 69057f6d10
2 changed files with 174 additions and 0 deletions
--- a/scripts/resume_parser.py
+++ b/scripts/resume_parser.py
@ -0,0 +1,68 @@
+"""
+Resume parser — extract text from PDF/DOCX and structure via LLM.
+
+Fast path: file bytes → raw text → LLM structures into resume dict.
+Result dict keys mirror plain_text_resume.yaml sections.
+
+Falls back to empty dict on any LLM/parsing error — caller should
+then show the guided form builder.
+"""
+from __future__ import annotations
+import io
+import json
+import re
+
+import pdfplumber
+from docx import Document
+
+
+def extract_text_from_pdf(file_bytes: bytes) -> str:
+    """Return the text of every page of a PDF, joined with newlines.
+
+    The bytes are wrapped in an in-memory buffer and opened with pdfplumber.
+    A page whose ``extract_text()`` yields a falsy value (``None`` or ``""``)
+    contributes an empty string, so the result is always a ``str``.
+
+    Errors raised while opening or reading the PDF are not caught here and
+    propagate to the caller.
+    """
+    with pdfplumber.open(io.BytesIO(file_bytes)) as document:
+        # ``or ""`` keeps the join safe when a page yields no text at all.
+        return "\n".join(page.extract_text() or "" for page in document.pages)
+
+
+def extract_text_from_docx(file_bytes: bytes) -> str:
+    """Return the non-blank paragraphs of a DOCX file, joined with newlines.
+
+    The bytes are wrapped in an in-memory buffer and loaded with python-docx.
+    Only ``Document.paragraphs`` is read; paragraphs whose text is empty or
+    whitespace-only are skipped.
+
+    NOTE(review): text that lives outside ``paragraphs`` (e.g. table cells)
+    is presumably not included — confirm against the python-docx docs.
+    """
+    document = Document(io.BytesIO(file_bytes))
+    kept = [para.text for para in document.paragraphs if para.text.strip()]
+    return "\n".join(kept)
+
+
+def _llm_structure(raw_text: str, *, max_chars: int = 6000) -> str:
+    """Ask the LLM to convert raw resume text into a JSON document.
+
+    Args:
+        raw_text: Plain text extracted from the resume.
+        max_chars: Number of leading characters of ``raw_text`` embedded in
+            the prompt; anything beyond that is silently dropped. Defaults
+            to 6000, the limit that was previously hard-coded.
+
+    Returns:
+        The string returned by ``LLMRouter.complete`` for the prompt. It is
+        not parsed or validated here.
+    """
+    # Imported inside the function, so the router module is only loaded
+    # when an LLM call is actually made.
+    from scripts.llm_router import LLMRouter
+    prompt = (
+        "You are a resume parser. Convert the following resume text into a JSON object.\n\n"
+        "Required JSON keys:\n"
+        "- name (string)\n"
+        "- email (string, may be empty)\n"
+        "- phone (string, may be empty)\n"
+        "- career_summary (string: 2-4 sentence professional summary)\n"
+        "- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n"
+        "- education (list of objects with: institution, degree, field, graduation_year)\n"
+        "- skills (list of strings)\n"
+        "- achievements (list of strings, may be empty)\n\n"
+        "Return ONLY valid JSON. No markdown, no explanation.\n\n"
+        f"Resume text:\n{raw_text[:max_chars]}"
+    )
+    return LLMRouter().complete(prompt)
+
+
+def structure_resume(raw_text: str) -> dict:
+    """Convert raw resume text to a structured dict via LLM.
+
+    The LLM reply has a leading/trailing Markdown code fence removed, is
+    parsed as JSON, and is returned only when its top-level value is a JSON
+    object.
+
+    Returns an empty dict on any failure (LLM error, invalid JSON, or JSON
+    whose top level is not an object) — caller should fall back to form
+    builder.
+    """
+    import logging
+
+    log = logging.getLogger(__name__)
+    try:
+        reply = _llm_structure(raw_text)
+        # Strip markdown code fences if present
+        reply = re.sub(r"^```(?:json)?\s*", "", reply.strip())
+        reply = re.sub(r"\s*```$", "", reply)
+        parsed = json.loads(reply)
+    except Exception:
+        # Deliberate best-effort boundary: any LLM or parsing problem falls
+        # back to {}, but is logged so the failure is not invisible.
+        log.warning("Resume structuring failed; returning empty result", exc_info=True)
+        return {}
+    # json.loads accepts any JSON value (list, string, number, null); only
+    # an object satisfies the ``-> dict`` contract callers rely on.
+    if not isinstance(parsed, dict):
+        log.warning(
+            "Resume structuring returned %s instead of a JSON object",
+            type(parsed).__name__,
+        )
+        return {}
+    return parsed
--- a/tests/test_resume_parser.py
+++ b/tests/test_resume_parser.py
@ -0,0 +1,106 @@
+import sys
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+
+def test_extract_pdf_returns_string():
+    """A mocked single-page PDF yields a str that contains the page's text."""
+    page = MagicMock()
+    page.extract_text.return_value = "Jane Doe\nSoftware Engineer"
+    opened_pdf = MagicMock(pages=[page])
+    # Stand-in for the context manager that pdfplumber.open() returns.
+    pdf_cm = MagicMock()
+    pdf_cm.__enter__.return_value = opened_pdf
+    pdf_cm.__exit__.return_value = False
+
+    with patch("scripts.resume_parser.pdfplumber") as fake_pdfplumber:
+        fake_pdfplumber.open.return_value = pdf_cm
+        from scripts.resume_parser import extract_text_from_pdf
+        text = extract_text_from_pdf(b"%PDF-fake")
+
+    assert isinstance(text, str)
+    assert "Jane Doe" in text
+
+
+def test_extract_docx_returns_string():
+    """A mocked two-paragraph DOCX yields a str containing both paragraphs."""
+    paragraphs = [
+        MagicMock(text="Alice Smith"),
+        MagicMock(text="Senior Developer"),
+    ]
+    fake_doc = MagicMock(paragraphs=paragraphs)
+
+    with patch("scripts.resume_parser.Document", return_value=fake_doc):
+        from scripts.resume_parser import extract_text_from_docx
+        text = extract_text_from_docx(b"PK fake docx bytes")
+
+    assert isinstance(text, str)
+    for expected in ("Alice Smith", "Senior Developer"):
+        assert expected in text
+
+
+def test_structure_resume_returns_dict():
+    """Valid JSON from the (patched) LLM call is returned as a parsed dict."""
+    resume_text = "Jane Doe\nSoftware Engineer at Acme 2020-2023"
+    # Adjacent literals concatenate into the single JSON reply string.
+    llm_reply = (
+        '{"name": "Jane Doe", '
+        '"experience": [{"company": "Acme", "title": "Engineer", "bullets": []}], '
+        '"skills": [], "education": []}'
+    )
+
+    with patch("scripts.resume_parser._llm_structure", return_value=llm_reply):
+        from scripts.resume_parser import structure_resume
+        parsed = structure_resume(resume_text)
+
+    assert isinstance(parsed, dict)
+    assert "experience" in parsed
+    assert isinstance(parsed["experience"], list)
+    assert parsed["name"] == "Jane Doe"
+
+
+def test_structure_resume_strips_markdown_fences():
+    """A reply wrapped in ```json ... ``` fences is still parsed correctly."""
+    fenced_reply = '```json\n{"name": "Bob", "experience": []}\n```'
+
+    with patch("scripts.resume_parser._llm_structure", return_value=fenced_reply):
+        from scripts.resume_parser import structure_resume
+        parsed = structure_resume("Some resume")
+
+    assert parsed.get("name") == "Bob"
+
+
+def test_structure_resume_invalid_json_returns_empty():
+    """A reply that is not JSON produces {} rather than an exception."""
+    llm_patch = patch(
+        "scripts.resume_parser._llm_structure",
+        return_value="not json at all",
+    )
+    with llm_patch:
+        from scripts.resume_parser import structure_resume
+        parsed = structure_resume("some text")
+
+    assert isinstance(parsed, dict)
+    assert parsed == {}
+
+
+def test_structure_resume_llm_exception_returns_empty():
+    """An exception raised by the (patched) LLM call is turned into {}."""
+    llm_patch = patch(
+        "scripts.resume_parser._llm_structure",
+        side_effect=Exception("LLM down"),
+    )
+    with llm_patch:
+        from scripts.resume_parser import structure_resume
+        parsed = structure_resume("some text")
+
+    assert isinstance(parsed, dict)
+    assert parsed == {}
+
+
+def test_extract_pdf_empty_page_returns_string():
+    """A page whose extract_text() gives None still results in a str."""
+    blank_page = MagicMock()
+    # pdfplumber can return None for empty pages
+    blank_page.extract_text.return_value = None
+    opened_pdf = MagicMock(pages=[blank_page])
+    # Stand-in for the context manager that pdfplumber.open() returns.
+    pdf_cm = MagicMock()
+    pdf_cm.__enter__.return_value = opened_pdf
+    pdf_cm.__exit__.return_value = False
+
+    with patch("scripts.resume_parser.pdfplumber") as fake_pdfplumber:
+        fake_pdfplumber.open.return_value = pdf_cm
+        from scripts.resume_parser import extract_text_from_pdf
+        text = extract_text_from_pdf(b"%PDF-empty")
+
+    assert isinstance(text, str)