From 9258a91fd67574186f45a874cf53a74d53d7e1f6 Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Wed, 25 Feb 2026 08:04:48 -0800
Subject: [PATCH] =?UTF-8?q?feat:=20resume=20parser=20=E2=80=94=20PDF/DOCX?=
 =?UTF-8?q?=20extraction=20+=20LLM=20structuring?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 scripts/resume_parser.py    | 100 +++++++++++++++++++++++++++++
 tests/test_resume_parser.py | 125 ++++++++++++++++++++++++++++++++++++
 2 files changed, 225 insertions(+)
 create mode 100644 scripts/resume_parser.py
 create mode 100644 tests/test_resume_parser.py

diff --git a/scripts/resume_parser.py b/scripts/resume_parser.py
new file mode 100644
--- /dev/null
+++ b/scripts/resume_parser.py
@@ -0,0 +1,100 @@
+"""
+Resume parser — extract text from PDF/DOCX and structure via LLM.
+
+Fast path: file bytes → raw text → LLM structures into resume dict.
+Result dict keys mirror plain_text_resume.yaml sections.
+
+Falls back to empty dict on any LLM/parsing error — caller should
+then show the guided form builder.
+"""
+from __future__ import annotations
+
+import io
+import json
+import logging
+import re
+
+import pdfplumber
+from docx import Document
+
+logger = logging.getLogger(__name__)
+
+# Upper bound on resume characters embedded in the LLM prompt.
+_MAX_PROMPT_CHARS = 6000
+
+
+def extract_text_from_pdf(file_bytes: bytes) -> str:
+    """Extract raw text from PDF bytes using pdfplumber.
+
+    Pages for which pdfplumber yields no text contribute an empty string,
+    so the result is always a ``str``. Errors raised while opening or
+    reading the PDF propagate to the caller.
+    """
+    with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
+        # extract_text() can return None for a page without text.
+        pages = [page.extract_text() or "" for page in pdf.pages]
+    return "\n".join(pages)
+
+
+def extract_text_from_docx(file_bytes: bytes) -> str:
+    """Extract raw text from DOCX bytes using python-docx.
+
+    Blank paragraphs are skipped; the rest are joined with newlines.
+    """
+    doc = Document(io.BytesIO(file_bytes))
+    return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
+
+
+def _llm_structure(raw_text: str) -> str:
+    """Call LLM to convert raw resume text to JSON. Returns raw LLM output string.
+
+    Only the first ``_MAX_PROMPT_CHARS`` characters of *raw_text* are sent.
+    """
+    # Local import: the router is only needed when an LLM call is made.
+    from scripts.llm_router import LLMRouter
+    prompt = (
+        "You are a resume parser. Convert the following resume text into a JSON object.\n\n"
+        "Required JSON keys:\n"
+        "- name (string)\n"
+        "- email (string, may be empty)\n"
+        "- phone (string, may be empty)\n"
+        "- career_summary (string: 2-4 sentence professional summary)\n"
+        "- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n"
+        "- education (list of objects with: institution, degree, field, graduation_year)\n"
+        "- skills (list of strings)\n"
+        "- achievements (list of strings, may be empty)\n\n"
+        "Return ONLY valid JSON. No markdown, no explanation.\n\n"
+        f"Resume text:\n{raw_text[:_MAX_PROMPT_CHARS]}"
+    )
+    router = LLMRouter()
+    return router.complete(prompt)
+
+
+def structure_resume(raw_text: str) -> dict:
+    """Convert raw resume text to a structured dict via LLM.
+
+    Returns an empty dict on any failure — blank input, an LLM error,
+    unparseable output, or JSON that is not an object — so the caller
+    can fall back to the form builder.
+    """
+    if not raw_text or not raw_text.strip():
+        # Nothing to structure, so skip the LLM call entirely.
+        return {}
+    try:
+        raw = _llm_structure(raw_text)
+        # Strip markdown code fences if present
+        raw = re.sub(r"^```(?:json)?\s*", "", raw.strip())
+        raw = re.sub(r"\s*```$", "", raw)
+        parsed = json.loads(raw)
+    except Exception:
+        # Deliberate best-effort boundary: record the cause, never raise.
+        logger.warning("Resume structuring failed", exc_info=True)
+        return {}
+    if not isinstance(parsed, dict):
+        # Valid JSON, but e.g. a list or string — unusable as a resume dict.
+        logger.warning(
+            "Resume structuring returned %s, expected a JSON object",
+            type(parsed).__name__,
+        )
+        return {}
+    return parsed
diff --git a/tests/test_resume_parser.py b/tests/test_resume_parser.py
new file mode 100644
--- /dev/null
+++ b/tests/test_resume_parser.py
@@ -0,0 +1,125 @@
+import sys
+from pathlib import Path
+from unittest.mock import patch, MagicMock
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+
+def test_extract_pdf_returns_string():
+    """PDF extraction returns a string containing the expected text."""
+    mock_page = MagicMock()
+    mock_page.extract_text.return_value = "Jane Doe\nSoftware Engineer"
+    mock_pdf_context = MagicMock()
+    mock_pdf_context.pages = [mock_page]
+    mock_pdf_cm = MagicMock()
+    mock_pdf_cm.__enter__ = MagicMock(return_value=mock_pdf_context)
+    mock_pdf_cm.__exit__ = MagicMock(return_value=False)
+
+    with patch("scripts.resume_parser.pdfplumber") as mock_pdfplumber:
+        mock_pdfplumber.open.return_value = mock_pdf_cm
+        from scripts.resume_parser import extract_text_from_pdf
+        result = extract_text_from_pdf(b"%PDF-fake")
+
+    assert isinstance(result, str)
+    assert "Jane Doe" in result
+
+
+def test_extract_docx_returns_string():
+    """DOCX extraction returns a string containing the expected text."""
+    mock_para1 = MagicMock()
+    mock_para1.text = "Alice Smith"
+    mock_para2 = MagicMock()
+    mock_para2.text = "Senior Developer"
+    mock_doc = MagicMock()
+    mock_doc.paragraphs = [mock_para1, mock_para2]
+
+    with patch("scripts.resume_parser.Document", return_value=mock_doc):
+        from scripts.resume_parser import extract_text_from_docx
+        result = extract_text_from_docx(b"PK fake docx bytes")
+
+    assert isinstance(result, str)
+    assert "Alice Smith" in result
+    assert "Senior Developer" in result
+
+
+def test_structure_resume_returns_dict():
+    """structure_resume returns a dict with expected keys when LLM returns valid JSON."""
+    raw_text = "Jane Doe\nSoftware Engineer at Acme 2020-2023"
+    llm_response = '{"name": "Jane Doe", "experience": [{"company": "Acme", "title": "Engineer", "bullets": []}], "skills": [], "education": []}'
+
+    with patch("scripts.resume_parser._llm_structure", return_value=llm_response):
+        from scripts.resume_parser import structure_resume
+        result = structure_resume(raw_text)
+
+    assert isinstance(result, dict)
+    assert "experience" in result
+    assert isinstance(result["experience"], list)
+    assert result["name"] == "Jane Doe"
+
+
+def test_structure_resume_strips_markdown_fences():
+    """structure_resume handles LLM output wrapped in ```json ... ``` fences."""
+    raw_text = "Some resume"
+    llm_response = '```json\n{"name": "Bob", "experience": []}\n```'
+
+    with patch("scripts.resume_parser._llm_structure", return_value=llm_response):
+        from scripts.resume_parser import structure_resume
+        result = structure_resume(raw_text)
+
+    assert result.get("name") == "Bob"
+
+
+def test_structure_resume_invalid_json_returns_empty():
+    """structure_resume returns {} on invalid JSON instead of crashing."""
+    with patch("scripts.resume_parser._llm_structure", return_value="not json at all"):
+        from scripts.resume_parser import structure_resume
+        result = structure_resume("some text")
+
+    assert isinstance(result, dict)
+    assert result == {}
+
+
+def test_structure_resume_llm_exception_returns_empty():
+    """structure_resume returns {} when LLM raises an exception."""
+    with patch("scripts.resume_parser._llm_structure", side_effect=Exception("LLM down")):
+        from scripts.resume_parser import structure_resume
+        result = structure_resume("some text")
+
+    assert isinstance(result, dict)
+    assert result == {}
+
+
+def test_extract_pdf_empty_page_returns_string():
+    """PDF with empty pages still returns a string (not None or crash)."""
+    mock_page = MagicMock()
+    mock_page.extract_text.return_value = None  # pdfplumber can return None for empty pages
+    mock_pdf_context = MagicMock()
+    mock_pdf_context.pages = [mock_page]
+    mock_pdf_cm = MagicMock()
+    mock_pdf_cm.__enter__ = MagicMock(return_value=mock_pdf_context)
+    mock_pdf_cm.__exit__ = MagicMock(return_value=False)
+
+    with patch("scripts.resume_parser.pdfplumber") as mock_pdfplumber:
+        mock_pdfplumber.open.return_value = mock_pdf_cm
+        from scripts.resume_parser import extract_text_from_pdf
+        result = extract_text_from_pdf(b"%PDF-empty")
+
+    assert isinstance(result, str)
+
+
+def test_structure_resume_non_object_json_returns_empty():
+    """structure_resume returns {} when the LLM emits JSON that is not an object."""
+    with patch("scripts.resume_parser._llm_structure", return_value='["a", "b"]'):
+        from scripts.resume_parser import structure_resume
+        result = structure_resume("some text")
+
+    assert result == {}
+
+
+def test_structure_resume_blank_text_skips_llm():
+    """structure_resume returns {} for blank input without calling the LLM."""
+    with patch("scripts.resume_parser._llm_structure") as mock_llm:
+        from scripts.resume_parser import structure_resume
+        result = structure_resume("   \n")
+
+    assert result == {}
+    mock_llm.assert_not_called()