From 7393ad2a1495842dd178fd0f31f49734cd081801 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Thu, 26 Feb 2026 00:00:23 -0800 Subject: [PATCH] =?UTF-8?q?fix:=20resume=20parser=20=E2=80=94=20max=5Ftoke?= =?UTF-8?q?ns,=20json-repair=20fallback,=20logging,=20PYTHONUNBUFFERED?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/app.py | 3 +++ compose.yml | 2 ++ requirements.txt | 1 + scripts/llm_router.py | 10 ++++++---- scripts/resume_parser.py | 38 ++++++++++++++++++++++++++++---------- 5 files changed, 40 insertions(+), 14 deletions(-) diff --git a/app/app.py b/app/app.py index 1d4ceb0..b30c6a1 100644 --- a/app/app.py +++ b/app/app.py @@ -7,11 +7,14 @@ a "System" section so it doesn't crowd the navigation. Run: streamlit run app/app.py bash scripts/manage-ui.sh start """ +import logging import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).parent.parent)) +logging.basicConfig(level=logging.WARNING, format="%(name)s %(levelname)s: %(message)s") + import streamlit as st from scripts.db import DEFAULT_DB, init_db, get_active_tasks import sqlite3 diff --git a/compose.yml b/compose.yml index b262cdb..c95a304 100644 --- a/compose.yml +++ b/compose.yml @@ -19,6 +19,8 @@ services: - PEREGRINE_GPU_COUNT=${PEREGRINE_GPU_COUNT:-0} - PEREGRINE_GPU_NAMES=${PEREGRINE_GPU_NAMES:-} - RECOMMENDED_PROFILE=${RECOMMENDED_PROFILE:-remote} + - PYTHONUNBUFFERED=1 + - PYTHONLOGGING=WARNING depends_on: searxng: condition: service_healthy diff --git a/requirements.txt b/requirements.txt index 2e24bff..e31b83e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,6 +47,7 @@ pypdf pdfminer-six pdfplumber python-docx +json-repair pyyaml>=6.0 python-dotenv diff --git a/scripts/llm_router.py b/scripts/llm_router.py index d4eb237..bbf6a8b 100644 --- a/scripts/llm_router.py +++ b/scripts/llm_router.py @@ -35,7 +35,8 @@ class LLMRouter: def complete(self, prompt: str, system: str | None = None, model_override: str | None = None, fallback_order: list[str] | None = None, - images: list[str] | None = None) -> str: + images: list[str] | None = None, + max_tokens: int | None = None) -> str: """ Generate a completion. Tries each backend in fallback_order. @@ -114,9 +115,10 @@ class LLMRouter: else: messages.append({"role": "user", "content": prompt}) - resp = client.chat.completions.create( - model=model, messages=messages - ) + create_kwargs: dict = {"model": model, "messages": messages} + if max_tokens is not None: + create_kwargs["max_tokens"] = max_tokens + resp = client.chat.completions.create(**create_kwargs) print(f"[LLMRouter] Used backend: {name} ({model})") return resp.choices[0].message.content diff --git a/scripts/resume_parser.py b/scripts/resume_parser.py index fceccfe..53cd0a6 100644 --- a/scripts/resume_parser.py +++ b/scripts/resume_parser.py @@ -10,11 +10,14 @@ then show the guided form builder. from __future__ import annotations import io import json +import logging import re import pdfplumber from docx import Document +log = logging.getLogger(__name__) + def extract_text_from_pdf(file_bytes: bytes) -> str: """Extract raw text from PDF bytes using pdfplumber. @@ -47,22 +50,37 @@ def _llm_structure(raw_text: str) -> str: "- skills (list of strings)\n" "- achievements (list of strings, may be empty)\n\n" "Return ONLY valid JSON. No markdown, no explanation.\n\n" - f"Resume text:\n{raw_text[:6000]}" + f"Resume text:\n{raw_text[:4000]}" ) router = LLMRouter() - return router.complete(prompt) + return router.complete(prompt, max_tokens=2048) -def structure_resume(raw_text: str) -> dict: +def structure_resume(raw_text: str) -> tuple[dict, str]: """Convert raw resume text to a structured dict via LLM. - Returns an empty dict on any failure — caller should fall back to form builder. + Returns (result_dict, error_message). result_dict is empty on failure. """ + import traceback + if not raw_text.strip(): + return {}, "Text extraction returned empty — the file may be image-based or unreadable." + raw = "" try: raw = _llm_structure(raw_text) - # Strip markdown code fences if present - raw = re.sub(r"^```(?:json)?\s*", "", raw.strip()) - raw = re.sub(r"\s*```$", "", raw) - return json.loads(raw) - except Exception: - return {} + cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip()) + cleaned = re.sub(r"\s*```$", "", cleaned) + try: + return json.loads(cleaned), "" + except json.JSONDecodeError: + # Try json-repair before giving up — handles truncation and minor malformations + from json_repair import repair_json + repaired = repair_json(cleaned) + result = json.loads(repaired) + log.warning("[resume_parser] Used json-repair to recover malformed output") + return result, "" + except json.JSONDecodeError as e: + log.error("[resume_parser] JSON parse error (even after repair): %s\nRaw output:\n%s", e, raw[:500]) + return {}, f"LLM returned invalid JSON: {e}" + except Exception as e: + log.error("[resume_parser] Error:\n%s", traceback.format_exc()) + return {}, str(e)