peregrine/scripts/resume_parser.py

86 lines
3.2 KiB
Python

"""
Resume parser — extract text from PDF/DOCX and structure via LLM.
Fast path: file bytes → raw text → LLM structures into resume dict.
Result dict keys mirror plain_text_resume.yaml sections.
Falls back to empty dict on any LLM/parsing error — caller should
then show the guided form builder.
"""
from __future__ import annotations
import io
import json
import logging
import re
import pdfplumber
from docx import Document
log = logging.getLogger(__name__)
def extract_text_from_pdf(file_bytes: bytes) -> str:
"""Extract raw text from PDF bytes using pdfplumber.
Returns empty string if extraction fails for any page.
"""
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
pages = [page.extract_text() or "" for page in pdf.pages]
return "\n".join(pages)
def extract_text_from_docx(file_bytes: bytes) -> str:
"""Extract raw text from DOCX bytes using python-docx."""
doc = Document(io.BytesIO(file_bytes))
return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
def _llm_structure(raw_text: str) -> str:
"""Call LLM to convert raw resume text to JSON. Returns raw LLM output string."""
from scripts.llm_router import LLMRouter
prompt = (
"You are a resume parser. Convert the following resume text into a JSON object.\n\n"
"Required JSON keys:\n"
"- name (string)\n"
"- email (string, may be empty)\n"
"- phone (string, may be empty)\n"
"- career_summary (string: 2-4 sentence professional summary)\n"
"- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n"
"- education (list of objects with: institution, degree, field, graduation_year)\n"
"- skills (list of strings)\n"
"- achievements (list of strings, may be empty)\n\n"
"Return ONLY valid JSON. No markdown, no explanation.\n\n"
f"Resume text:\n{raw_text[:4000]}"
)
router = LLMRouter()
return router.complete(prompt, max_tokens=2048)
def structure_resume(raw_text: str) -> tuple[dict, str]:
"""Convert raw resume text to a structured dict via LLM.
Returns (result_dict, error_message). result_dict is empty on failure.
"""
import traceback
if not raw_text.strip():
return {}, "Text extraction returned empty — the file may be image-based or unreadable."
raw = ""
try:
raw = _llm_structure(raw_text)
cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
cleaned = re.sub(r"\s*```$", "", cleaned)
try:
return json.loads(cleaned), ""
except json.JSONDecodeError:
# Try json-repair before giving up — handles truncation and minor malformations
from json_repair import repair_json
repaired = repair_json(cleaned)
result = json.loads(repaired)
log.warning("[resume_parser] Used json-repair to recover malformed output")
return result, ""
except json.JSONDecodeError as e:
log.error("[resume_parser] JSON parse error (even after repair): %s\nRaw output:\n%s", e, raw[:500])
return {}, f"LLM returned invalid JSON: {e}"
except Exception as e:
log.error("[resume_parser] Error:\n%s", traceback.format_exc())
return {}, str(e)