From 7393ad2a1495842dd178fd0f31f49734cd081801 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Thu, 26 Feb 2026 00:00:23 -0800
Subject: [PATCH] =?UTF-8?q?fix:=20resume=20parser=20=E2=80=94=20max=5Ftoke?=
 =?UTF-8?q?ns,=20json-repair=20fallback,=20logging,=20PYTHONUNBUFFERED?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/app.py               |  3 +++
 compose.yml              |  2 ++
 requirements.txt         |  1 +
 scripts/llm_router.py    | 10 ++++++----
 scripts/resume_parser.py | 38 ++++++++++++++++++++++++++++----------
 5 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/app/app.py b/app/app.py
index 1d4ceb0..b30c6a1 100644
--- a/app/app.py
+++ b/app/app.py
@@ -7,11 +7,14 @@ a "System" section so it doesn't crowd the navigation.
 Run: streamlit run app/app.py
      bash scripts/manage-ui.sh start
 """
+import logging
 import sys
 from pathlib import Path
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
+logging.basicConfig(level=logging.WARNING, format="%(name)s %(levelname)s: %(message)s")
+
 import streamlit as st
 from scripts.db import DEFAULT_DB, init_db, get_active_tasks
 import sqlite3
diff --git a/compose.yml b/compose.yml
index b262cdb..c95a304 100644
--- a/compose.yml
+++ b/compose.yml
@@ -19,6 +19,8 @@ services:
       - PEREGRINE_GPU_COUNT=${PEREGRINE_GPU_COUNT:-0}
       - PEREGRINE_GPU_NAMES=${PEREGRINE_GPU_NAMES:-}
       - RECOMMENDED_PROFILE=${RECOMMENDED_PROFILE:-remote}
+      - PYTHONUNBUFFERED=1
+      - PYTHONLOGGING=WARNING
     depends_on:
       searxng:
         condition: service_healthy
diff --git a/requirements.txt b/requirements.txt
index 2e24bff..e31b83e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -47,6 +47,7 @@ pypdf
 pdfminer-six
 pdfplumber
 python-docx
+json-repair
 pyyaml>=6.0
 python-dotenv
 
diff --git a/scripts/llm_router.py b/scripts/llm_router.py
index d4eb237..bbf6a8b 100644
--- a/scripts/llm_router.py
+++ b/scripts/llm_router.py
@@ -35,7 +35,8 @@ class LLMRouter:
     def complete(self, prompt: str, system: str | None = None,
                  model_override: str | None = None,
                  fallback_order: list[str] | None = None,
-                 images: list[str] | None = None) -> str:
+                 images: list[str] | None = None,
+                 max_tokens: int | None = None) -> str:
         """
         Generate a completion. Tries each backend in fallback_order.
 
@@ -114,9 +115,10 @@ class LLMRouter:
                     else:
                         messages.append({"role": "user", "content": prompt})
 
-                    resp = client.chat.completions.create(
-                        model=model, messages=messages
-                    )
+                    create_kwargs: dict = {"model": model, "messages": messages}
+                    if max_tokens is not None:
+                        create_kwargs["max_tokens"] = max_tokens
+                    resp = client.chat.completions.create(**create_kwargs)
                     print(f"[LLMRouter] Used backend: {name} ({model})")
                     return resp.choices[0].message.content
 
diff --git a/scripts/resume_parser.py b/scripts/resume_parser.py
index fceccfe..53cd0a6 100644
--- a/scripts/resume_parser.py
+++ b/scripts/resume_parser.py
@@ -10,11 +10,14 @@ then show the guided form builder.
 from __future__ import annotations
 import io
 import json
+import logging
 import re
 
 import pdfplumber
 from docx import Document
 
+log = logging.getLogger(__name__)
+
 
 def extract_text_from_pdf(file_bytes: bytes) -> str:
     """Extract raw text from PDF bytes using pdfplumber.
@@ -47,22 +50,37 @@ def _llm_structure(raw_text: str) -> str:
         "- skills (list of strings)\n"
         "- achievements (list of strings, may be empty)\n\n"
         "Return ONLY valid JSON. No markdown, no explanation.\n\n"
-        f"Resume text:\n{raw_text[:6000]}"
+        f"Resume text:\n{raw_text[:4000]}"
     )
     router = LLMRouter()
-    return router.complete(prompt)
+    return router.complete(prompt, max_tokens=2048)
 
 
-def structure_resume(raw_text: str) -> dict:
+def structure_resume(raw_text: str) -> tuple[dict, str]:
     """Convert raw resume text to a structured dict via LLM.
 
-    Returns an empty dict on any failure — caller should fall back to form builder.
+    Returns (result_dict, error_message): result_dict is {} on failure; error_message is empty on success.
     """
+    import traceback
+    if not raw_text.strip():
+        return {}, "Text extraction returned empty — the file may be image-based or unreadable."
+    raw = ""
     try:
         raw = _llm_structure(raw_text)
-        # Strip markdown code fences if present
-        raw = re.sub(r"^```(?:json)?\s*", "", raw.strip())
-        raw = re.sub(r"\s*```$", "", raw)
-        return json.loads(raw)
-    except Exception:
-        return {}
+        cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
+        cleaned = re.sub(r"\s*```$", "", cleaned)
+        try:
+            return json.loads(cleaned), ""
+        except json.JSONDecodeError:
+            # Try json-repair before giving up — handles truncation and minor malformations
+            from json_repair import repair_json
+            repaired = repair_json(cleaned)
+            result = json.loads(repaired)
+            log.warning("[resume_parser] Used json-repair to recover malformed output")
+            return result, ""
+    except json.JSONDecodeError as e:
+        log.error("[resume_parser] JSON parse error (even after repair): %s\nRaw output:\n%s", e, raw[:500])
+        return {}, f"LLM returned invalid JSON: {e}"
+    except Exception as e:
+        log.error("[resume_parser] Error:\n%s", traceback.format_exc())
+        return {}, str(e)