feat: resume parser — PDF/DOCX extraction + LLM structuring
This commit is contained in:
parent
2a7f88a771
commit
c7e4749fc6
2 changed files with 174 additions and 0 deletions
68
scripts/resume_parser.py
Normal file
68
scripts/resume_parser.py
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
"""
|
||||
Resume parser — extract text from PDF/DOCX and structure via LLM.
|
||||
|
||||
Fast path: file bytes → raw text → LLM structures into resume dict.
|
||||
Result dict keys mirror plain_text_resume.yaml sections.
|
||||
|
||||
Falls back to empty dict on any LLM/parsing error — caller should
|
||||
then show the guided form builder.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
|
||||
import pdfplumber
|
||||
from docx import Document
|
||||
|
||||
|
||||
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The per-page text in document order, separated by ``"\\n"``. A page
        for which pdfplumber yields no text (``None`` or an empty string)
        contributes an empty string instead.

    Exceptions raised by pdfplumber while opening or reading the document
    are not caught here and propagate to the caller.
    """
    buffer = io.BytesIO(file_bytes)
    page_texts = []
    with pdfplumber.open(buffer) as document:
        for page in document.pages:
            text = page.extract_text()
            # extract_text() may give None for a page without a text layer.
            page_texts.append(text if text else "")
    return "\n".join(page_texts)
|
||||
|
||||
|
||||
def extract_text_from_docx(file_bytes: bytes) -> str:
    """Return the non-blank paragraphs of a DOCX file, joined with newlines.

    Args:
        file_bytes: Raw contents of the DOCX file.

    Returns:
        The text of each paragraph in ``Document.paragraphs`` that contains
        at least one non-whitespace character, in document order and
        separated by ``"\\n"``. Paragraph text is kept as-is (not stripped).

    NOTE(review): only ``Document.paragraphs`` is read; text held in tables
    or headers is presumably not included -- confirm against python-docx.
    """
    document = Document(io.BytesIO(file_bytes))
    kept = []
    for paragraph in document.paragraphs:
        text = paragraph.text
        if text.strip():
            kept.append(text)
    return "\n".join(kept)
|
||||
|
||||
|
||||
def _llm_structure(raw_text: str) -> str:
    """Ask the LLM to turn raw resume text into JSON and return its raw reply.

    Only the first 6000 characters of ``raw_text`` are embedded in the
    prompt. The reply is returned untouched; stripping code fences and
    parsing the JSON is left to the caller.
    """
    # Imported lazily so the router is only loaded when an LLM call is made.
    from scripts.llm_router import LLMRouter

    excerpt = raw_text[:6000]
    instructions = (
        "You are a resume parser. Convert the following resume text into a JSON object.\n\n"
        "Required JSON keys:\n"
        "- name (string)\n"
        "- email (string, may be empty)\n"
        "- phone (string, may be empty)\n"
        "- career_summary (string: 2-4 sentence professional summary)\n"
        "- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n"
        "- education (list of objects with: institution, degree, field, graduation_year)\n"
        "- skills (list of strings)\n"
        "- achievements (list of strings, may be empty)\n\n"
        "Return ONLY valid JSON. No markdown, no explanation.\n\n"
    )
    prompt = instructions + f"Resume text:\n{excerpt}"
    return LLMRouter().complete(prompt)
|
||||
|
||||
|
||||
def structure_resume(raw_text: str) -> dict:
    """Convert raw resume text to a structured dict via LLM.

    The LLM reply is cleaned of a surrounding markdown code fence and parsed
    as JSON. If the whole reply is not valid JSON, the span from the first
    ``{`` to the last ``}`` is tried instead, which recovers replies where
    the model added prose around the object.

    Args:
        raw_text: Plain text extracted from the resume.

    Returns:
        The parsed JSON object as a dict. An empty dict is returned on any
        failure: the LLM call raising, the reply not being parseable, or the
        parsed value not being a JSON object (e.g. a list or a bare string).
        The caller should then fall back to the form builder.
    """
    try:
        reply = _llm_structure(raw_text).strip()
        # Strip markdown code fences if present
        reply = re.sub(r"^```(?:json)?\s*", "", reply)
        reply = re.sub(r"\s*```$", "", reply)
        try:
            parsed = json.loads(reply)
        except json.JSONDecodeError:
            # Models sometimes wrap the object in explanatory text despite
            # the prompt; retry on the outermost brace-delimited span.
            start = reply.find("{")
            end = reply.rfind("}")
            if start == -1 or end <= start:
                return {}
            parsed = json.loads(reply[start:end + 1])
    except Exception:
        # Deliberate best-effort: any failure means "no structured resume".
        return {}
    # json.loads accepts any JSON value; only an object honours the contract.
    return parsed if isinstance(parsed, dict) else {}
|
||||
106
tests/test_resume_parser.py
Normal file
106
tests/test_resume_parser.py
Normal file
|
|
@ -0,0 +1,106 @@
|
|||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch, MagicMock
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
|
||||
def test_extract_pdf_returns_string():
    """A one-page PDF yields that page's text as a str."""
    page = MagicMock()
    page.extract_text.return_value = "Jane Doe\nSoftware Engineer"
    opened_pdf = MagicMock(pages=[page])

    # Stand-in for the context manager returned by pdfplumber.open().
    pdf_cm = MagicMock()
    pdf_cm.__enter__.return_value = opened_pdf
    pdf_cm.__exit__.return_value = False

    with patch("scripts.resume_parser.pdfplumber") as fake_pdfplumber:
        fake_pdfplumber.open.return_value = pdf_cm
        from scripts.resume_parser import extract_text_from_pdf

        text = extract_text_from_pdf(b"%PDF-fake")

    assert isinstance(text, str)
    assert "Jane Doe" in text
|
||||
|
||||
|
||||
def test_extract_docx_returns_string():
    """Text from every DOCX paragraph appears in the extracted string."""
    fake_paragraphs = [
        MagicMock(text="Alice Smith"),
        MagicMock(text="Senior Developer"),
    ]
    fake_doc = MagicMock(paragraphs=fake_paragraphs)

    with patch("scripts.resume_parser.Document", return_value=fake_doc):
        from scripts.resume_parser import extract_text_from_docx

        extracted = extract_text_from_docx(b"PK fake docx bytes")

    assert isinstance(extracted, str)
    for expected in ("Alice Smith", "Senior Developer"):
        assert expected in extracted
|
||||
|
||||
|
||||
def test_structure_resume_returns_dict():
    """Valid JSON from the LLM comes back as a dict with the parsed fields."""
    resume_text = "Jane Doe\nSoftware Engineer at Acme 2020-2023"
    canned_reply = '{"name": "Jane Doe", "experience": [{"company": "Acme", "title": "Engineer", "bullets": []}], "skills": [], "education": []}'

    # Replace the LLM call so the test never touches a real backend.
    with patch("scripts.resume_parser._llm_structure", return_value=canned_reply):
        from scripts.resume_parser import structure_resume

        parsed = structure_resume(resume_text)

    assert isinstance(parsed, dict)
    assert "experience" in parsed
    assert isinstance(parsed["experience"], list)
    assert parsed["name"] == "Jane Doe"
|
||||
|
||||
|
||||
def test_structure_resume_strips_markdown_fences():
    """A reply wrapped in a ```json ... ``` fence is still parsed."""
    resume_text = "Some resume"
    fenced_reply = '```json\n{"name": "Bob", "experience": []}\n```'

    with patch("scripts.resume_parser._llm_structure", return_value=fenced_reply):
        from scripts.resume_parser import structure_resume

        parsed = structure_resume(resume_text)

    assert parsed.get("name") == "Bob"
|
||||
|
||||
|
||||
def test_structure_resume_invalid_json_returns_empty():
    """An unparseable LLM reply produces {} rather than an exception."""
    garbage_reply = "not json at all"

    with patch("scripts.resume_parser._llm_structure", return_value=garbage_reply):
        from scripts.resume_parser import structure_resume

        parsed = structure_resume("some text")

    assert isinstance(parsed, dict)
    assert parsed == {}
|
||||
|
||||
|
||||
def test_structure_resume_llm_exception_returns_empty():
    """An exception raised by the LLM call is swallowed and {} is returned."""
    llm_failure = Exception("LLM down")

    with patch("scripts.resume_parser._llm_structure", side_effect=llm_failure):
        from scripts.resume_parser import structure_resume

        parsed = structure_resume("some text")

    assert isinstance(parsed, dict)
    assert parsed == {}
|
||||
|
||||
|
||||
def test_extract_pdf_empty_page_returns_string():
    """A page whose extract_text() is None still results in a str."""
    blank_page = MagicMock()
    # pdfplumber can return None for empty pages
    blank_page.extract_text.return_value = None
    opened_pdf = MagicMock(pages=[blank_page])

    # Stand-in for the context manager returned by pdfplumber.open().
    pdf_cm = MagicMock()
    pdf_cm.__enter__.return_value = opened_pdf
    pdf_cm.__exit__.return_value = False

    with patch("scripts.resume_parser.pdfplumber") as fake_pdfplumber:
        fake_pdfplumber.open.return_value = pdf_cm
        from scripts.resume_parser import extract_text_from_pdf

        text = extract_text_from_pdf(b"%PDF-empty")

    assert isinstance(text, str)
|
||||
Loading…
Reference in a new issue