fix: resume parser — max_tokens, json-repair fallback, logging, PYTHONUNBUFFERED

This commit is contained in:
pyr0ball 2026-02-26 00:00:23 -08:00
parent 07d33b6e34
commit 7393ad2a14
5 changed files with 40 additions and 14 deletions

View file

@@ -7,11 +7,14 @@ a "System" section so it doesn't crowd the navigation.
Run: streamlit run app/app.py Run: streamlit run app/app.py
bash scripts/manage-ui.sh start bash scripts/manage-ui.sh start
""" """
import logging
import sys import sys
from pathlib import Path from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
logging.basicConfig(level=logging.WARNING, format="%(name)s %(levelname)s: %(message)s")
import streamlit as st import streamlit as st
from scripts.db import DEFAULT_DB, init_db, get_active_tasks from scripts.db import DEFAULT_DB, init_db, get_active_tasks
import sqlite3 import sqlite3

View file

@@ -19,6 +19,8 @@ services:
- PEREGRINE_GPU_COUNT=${PEREGRINE_GPU_COUNT:-0} - PEREGRINE_GPU_COUNT=${PEREGRINE_GPU_COUNT:-0}
- PEREGRINE_GPU_NAMES=${PEREGRINE_GPU_NAMES:-} - PEREGRINE_GPU_NAMES=${PEREGRINE_GPU_NAMES:-}
- RECOMMENDED_PROFILE=${RECOMMENDED_PROFILE:-remote} - RECOMMENDED_PROFILE=${RECOMMENDED_PROFILE:-remote}
- PYTHONUNBUFFERED=1
- PYTHONLOGGING=WARNING
depends_on: depends_on:
searxng: searxng:
condition: service_healthy condition: service_healthy

View file

@@ -47,6 +47,7 @@ pypdf
pdfminer-six pdfminer-six
pdfplumber pdfplumber
python-docx python-docx
json-repair
pyyaml>=6.0 pyyaml>=6.0
python-dotenv python-dotenv

View file

@@ -35,7 +35,8 @@ class LLMRouter:
def complete(self, prompt: str, system: str | None = None, def complete(self, prompt: str, system: str | None = None,
model_override: str | None = None, model_override: str | None = None,
fallback_order: list[str] | None = None, fallback_order: list[str] | None = None,
images: list[str] | None = None) -> str: images: list[str] | None = None,
max_tokens: int | None = None) -> str:
""" """
Generate a completion. Tries each backend in fallback_order. Generate a completion. Tries each backend in fallback_order.
@@ -114,9 +115,10 @@
else: else:
messages.append({"role": "user", "content": prompt}) messages.append({"role": "user", "content": prompt})
resp = client.chat.completions.create( create_kwargs: dict = {"model": model, "messages": messages}
model=model, messages=messages if max_tokens is not None:
) create_kwargs["max_tokens"] = max_tokens
resp = client.chat.completions.create(**create_kwargs)
print(f"[LLMRouter] Used backend: {name} ({model})") print(f"[LLMRouter] Used backend: {name} ({model})")
return resp.choices[0].message.content return resp.choices[0].message.content

View file

@@ -10,11 +10,14 @@ then show the guided form builder.
from __future__ import annotations from __future__ import annotations
import io import io
import json import json
import logging
import re import re
import pdfplumber import pdfplumber
from docx import Document from docx import Document
log = logging.getLogger(__name__)
def extract_text_from_pdf(file_bytes: bytes) -> str: def extract_text_from_pdf(file_bytes: bytes) -> str:
"""Extract raw text from PDF bytes using pdfplumber. """Extract raw text from PDF bytes using pdfplumber.
@@ -47,22 +50,37 @@ def _llm_structure(raw_text: str) -> str:
"- skills (list of strings)\n" "- skills (list of strings)\n"
"- achievements (list of strings, may be empty)\n\n" "- achievements (list of strings, may be empty)\n\n"
"Return ONLY valid JSON. No markdown, no explanation.\n\n" "Return ONLY valid JSON. No markdown, no explanation.\n\n"
f"Resume text:\n{raw_text[:6000]}" f"Resume text:\n{raw_text[:4000]}"
) )
router = LLMRouter() router = LLMRouter()
return router.complete(prompt) return router.complete(prompt, max_tokens=2048)
def structure_resume(raw_text: str) -> dict: def structure_resume(raw_text: str) -> tuple[dict, str]:
"""Convert raw resume text to a structured dict via LLM. """Convert raw resume text to a structured dict via LLM.
Returns an empty dict on any failure — caller should fall back to form builder. Returns (result_dict, error_message). result_dict is empty on failure.
""" """
import traceback
if not raw_text.strip():
return {}, "Text extraction returned empty — the file may be image-based or unreadable."
raw = ""
try: try:
raw = _llm_structure(raw_text) raw = _llm_structure(raw_text)
# Strip markdown code fences if present cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
raw = re.sub(r"^```(?:json)?\s*", "", raw.strip()) cleaned = re.sub(r"\s*```$", "", cleaned)
raw = re.sub(r"\s*```$", "", raw) try:
return json.loads(raw) return json.loads(cleaned), ""
except Exception: except json.JSONDecodeError:
return {} # Try json-repair before giving up — handles truncation and minor malformations
from json_repair import repair_json
repaired = repair_json(cleaned)
result = json.loads(repaired)
log.warning("[resume_parser] Used json-repair to recover malformed output")
return result, ""
except json.JSONDecodeError as e:
log.error("[resume_parser] JSON parse error (even after repair): %s\nRaw output:\n%s", e, raw[:500])
return {}, f"LLM returned invalid JSON: {e}"
except Exception as e:
log.error("[resume_parser] Error:\n%s", traceback.format_exc())
return {}, str(e)