peregrine/scripts/resume_parser.py

68 lines
2.4 KiB
Python

"""
Resume parser — extract text from PDF/DOCX and structure via LLM.
Fast path: file bytes → raw text → LLM structures into resume dict.
Result dict keys mirror plain_text_resume.yaml sections.
Falls back to empty dict on any LLM/parsing error — caller should
then show the guided form builder.
"""
from __future__ import annotations
import io
import json
import re
import pdfplumber
from docx import Document
def extract_text_from_pdf(file_bytes: bytes) -> str:
"""Extract raw text from PDF bytes using pdfplumber.
Returns empty string if extraction fails for any page.
"""
with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
pages = [page.extract_text() or "" for page in pdf.pages]
return "\n".join(pages)
def extract_text_from_docx(file_bytes: bytes) -> str:
"""Extract raw text from DOCX bytes using python-docx."""
doc = Document(io.BytesIO(file_bytes))
return "\n".join(p.text for p in doc.paragraphs if p.text.strip())
def _llm_structure(raw_text: str) -> str:
"""Call LLM to convert raw resume text to JSON. Returns raw LLM output string."""
from scripts.llm_router import LLMRouter
prompt = (
"You are a resume parser. Convert the following resume text into a JSON object.\n\n"
"Required JSON keys:\n"
"- name (string)\n"
"- email (string, may be empty)\n"
"- phone (string, may be empty)\n"
"- career_summary (string: 2-4 sentence professional summary)\n"
"- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n"
"- education (list of objects with: institution, degree, field, graduation_year)\n"
"- skills (list of strings)\n"
"- achievements (list of strings, may be empty)\n\n"
"Return ONLY valid JSON. No markdown, no explanation.\n\n"
f"Resume text:\n{raw_text[:6000]}"
)
router = LLMRouter()
return router.complete(prompt)
def structure_resume(raw_text: str) -> dict:
"""Convert raw resume text to a structured dict via LLM.
Returns an empty dict on any failure — caller should fall back to form builder.
"""
try:
raw = _llm_structure(raw_text)
# Strip markdown code fences if present
raw = re.sub(r"^```(?:json)?\s*", "", raw.strip())
raw = re.sub(r"\s*```$", "", raw)
return json.loads(raw)
except Exception:
return {}