fix: resume parser — max_tokens, json-repair fallback, logging, PYTHONUNBUFFERED
This commit is contained in:
parent
4cee76211e
commit
9297477ba0
5 changed files with 40 additions and 14 deletions
|
|
@ -7,11 +7,14 @@ a "System" section so it doesn't crowd the navigation.
|
||||||
Run: streamlit run app/app.py
|
Run: streamlit run app/app.py
|
||||||
bash scripts/manage-ui.sh start
|
bash scripts/manage-ui.sh start
|
||||||
"""
|
"""
|
||||||
|
import logging
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.WARNING, format="%(name)s %(levelname)s: %(message)s")
|
||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
from scripts.db import DEFAULT_DB, init_db, get_active_tasks
|
from scripts.db import DEFAULT_DB, init_db, get_active_tasks
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,8 @@ services:
|
||||||
- PEREGRINE_GPU_COUNT=${PEREGRINE_GPU_COUNT:-0}
|
- PEREGRINE_GPU_COUNT=${PEREGRINE_GPU_COUNT:-0}
|
||||||
- PEREGRINE_GPU_NAMES=${PEREGRINE_GPU_NAMES:-}
|
- PEREGRINE_GPU_NAMES=${PEREGRINE_GPU_NAMES:-}
|
||||||
- RECOMMENDED_PROFILE=${RECOMMENDED_PROFILE:-remote}
|
- RECOMMENDED_PROFILE=${RECOMMENDED_PROFILE:-remote}
|
||||||
|
- PYTHONUNBUFFERED=1
|
||||||
|
- PYTHONLOGGING=WARNING
|
||||||
depends_on:
|
depends_on:
|
||||||
searxng:
|
searxng:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,7 @@ pypdf
|
||||||
pdfminer-six
|
pdfminer-six
|
||||||
pdfplumber
|
pdfplumber
|
||||||
python-docx
|
python-docx
|
||||||
|
json-repair
|
||||||
pyyaml>=6.0
|
pyyaml>=6.0
|
||||||
python-dotenv
|
python-dotenv
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,8 @@ class LLMRouter:
|
||||||
def complete(self, prompt: str, system: str | None = None,
|
def complete(self, prompt: str, system: str | None = None,
|
||||||
model_override: str | None = None,
|
model_override: str | None = None,
|
||||||
fallback_order: list[str] | None = None,
|
fallback_order: list[str] | None = None,
|
||||||
images: list[str] | None = None) -> str:
|
images: list[str] | None = None,
|
||||||
|
max_tokens: int | None = None) -> str:
|
||||||
"""
|
"""
|
||||||
Generate a completion. Tries each backend in fallback_order.
|
Generate a completion. Tries each backend in fallback_order.
|
||||||
|
|
||||||
|
|
@ -114,9 +115,10 @@ class LLMRouter:
|
||||||
else:
|
else:
|
||||||
messages.append({"role": "user", "content": prompt})
|
messages.append({"role": "user", "content": prompt})
|
||||||
|
|
||||||
resp = client.chat.completions.create(
|
create_kwargs: dict = {"model": model, "messages": messages}
|
||||||
model=model, messages=messages
|
if max_tokens is not None:
|
||||||
)
|
create_kwargs["max_tokens"] = max_tokens
|
||||||
|
resp = client.chat.completions.create(**create_kwargs)
|
||||||
print(f"[LLMRouter] Used backend: {name} ({model})")
|
print(f"[LLMRouter] Used backend: {name} ({model})")
|
||||||
return resp.choices[0].message.content
|
return resp.choices[0].message.content
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,11 +10,14 @@ then show the guided form builder.
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
from docx import Document
|
from docx import Document
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def extract_text_from_pdf(file_bytes: bytes) -> str:
|
def extract_text_from_pdf(file_bytes: bytes) -> str:
|
||||||
"""Extract raw text from PDF bytes using pdfplumber.
|
"""Extract raw text from PDF bytes using pdfplumber.
|
||||||
|
|
@ -47,22 +50,37 @@ def _llm_structure(raw_text: str) -> str:
|
||||||
"- skills (list of strings)\n"
|
"- skills (list of strings)\n"
|
||||||
"- achievements (list of strings, may be empty)\n\n"
|
"- achievements (list of strings, may be empty)\n\n"
|
||||||
"Return ONLY valid JSON. No markdown, no explanation.\n\n"
|
"Return ONLY valid JSON. No markdown, no explanation.\n\n"
|
||||||
f"Resume text:\n{raw_text[:6000]}"
|
f"Resume text:\n{raw_text[:4000]}"
|
||||||
)
|
)
|
||||||
router = LLMRouter()
|
router = LLMRouter()
|
||||||
return router.complete(prompt)
|
return router.complete(prompt, max_tokens=2048)
|
||||||
|
|
||||||
|
|
||||||
def structure_resume(raw_text: str) -> tuple[dict, str]:
    """Convert raw resume text to a structured dict via LLM.

    Returns (result_dict, error_message). On success error_message is ""
    and result_dict is the parsed JSON object. On any failure result_dict
    is {} and error_message describes what went wrong.
    """
    if not raw_text.strip():
        return {}, "Text extraction returned empty — the file may be image-based or unreadable."

    # Kept outside the try so the JSON error handler can always log a snippet.
    raw = ""
    try:
        raw = _llm_structure(raw_text)
        # A backend may hand back None or whitespace instead of text; report
        # that plainly rather than as an AttributeError from .strip().
        if not raw or not raw.strip():
            raw = ""
            return {}, "LLM returned an empty response."

        # Strip markdown code fences if present.
        cleaned = re.sub(r"^```(?:json)?\s*", "", raw.strip())
        cleaned = re.sub(r"\s*```$", "", cleaned)
        try:
            result = json.loads(cleaned)
        except json.JSONDecodeError:
            # Try json-repair before giving up — handles truncation and
            # minor malformations. Imported lazily so a missing package is
            # reported through the generic handler below.
            from json_repair import repair_json

            result = json.loads(repair_json(cleaned))
            log.warning("[resume_parser] Used json-repair to recover malformed output")
    except json.JSONDecodeError as e:
        log.error(
            "[resume_parser] JSON parse error (even after repair): %s\nRaw output:\n%s",
            e,
            raw[:500],
        )
        return {}, f"LLM returned invalid JSON: {e}"
    except Exception as e:
        # log.exception appends the active traceback to the record.
        log.exception("[resume_parser] Error")
        return {}, str(e)

    # Valid JSON is not necessarily an object: the model (or a best-effort
    # repair) can yield a list, string or number. Callers are promised a dict.
    if not isinstance(result, dict):
        log.error(
            "[resume_parser] Expected a JSON object, got %s\nRaw output:\n%s",
            type(result).__name__,
            raw[:500],
        )
        return {}, f"LLM returned JSON of type {type(result).__name__}, expected an object."
    return result, ""
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue