""" Resume parser — extract text from PDF/DOCX and structure via LLM. Fast path: file bytes → raw text → LLM structures into resume dict. Result dict keys mirror plain_text_resume.yaml sections. Falls back to empty dict on any LLM/parsing error — caller should then show the guided form builder. """ from __future__ import annotations import io import json import logging import re import pdfplumber from docx import Document log = logging.getLogger(__name__) def extract_text_from_pdf(file_bytes: bytes) -> str: """Extract raw text from PDF bytes using pdfplumber. Returns empty string if extraction fails for any page. """ with pdfplumber.open(io.BytesIO(file_bytes)) as pdf: pages = [page.extract_text() or "" for page in pdf.pages] return "\n".join(pages) def extract_text_from_docx(file_bytes: bytes) -> str: """Extract raw text from DOCX bytes using python-docx.""" doc = Document(io.BytesIO(file_bytes)) return "\n".join(p.text for p in doc.paragraphs if p.text.strip()) def _llm_structure(raw_text: str) -> str: """Call LLM to convert raw resume text to JSON. Returns raw LLM output string.""" from scripts.llm_router import LLMRouter prompt = ( "You are a resume parser. Convert the following resume text into a JSON object.\n\n" "Required JSON keys:\n" "- name (string)\n" "- email (string, may be empty)\n" "- phone (string, may be empty)\n" "- career_summary (string: 2-4 sentence professional summary)\n" "- experience (list of objects with: company, title, start_date, end_date, bullets list of strings)\n" "- education (list of objects with: institution, degree, field, graduation_year)\n" "- skills (list of strings)\n" "- achievements (list of strings, may be empty)\n\n" "Return ONLY valid JSON. No markdown, no explanation.\n\n" f"Resume text:\n{raw_text[:4000]}" ) router = LLMRouter() return router.complete(prompt, max_tokens=2048) def structure_resume(raw_text: str) -> tuple[dict, str]: """Convert raw resume text to a structured dict via LLM. Returns (result_dict, error_message). result_dict is empty on failure. """ import traceback if not raw_text.strip(): return {}, "Text extraction returned empty — the file may be image-based or unreadable." raw = "" try: raw = _llm_structure(raw_text) cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip()) cleaned = re.sub(r"\s*```$", "", cleaned) try: return json.loads(cleaned), "" except json.JSONDecodeError: # Try json-repair before giving up — handles truncation and minor malformations from json_repair import repair_json repaired = repair_json(cleaned) result = json.loads(repaired) log.warning("[resume_parser] Used json-repair to recover malformed output") return result, "" except json.JSONDecodeError as e: log.error("[resume_parser] JSON parse error (even after repair): %s\nRaw output:\n%s", e, raw[:500]) return {}, f"LLM returned invalid JSON: {e}" except Exception as e: log.error("[resume_parser] Error:\n%s", traceback.format_exc()) return {}, str(e)