fix: sft router — yaml error handling, none filter, shared jsonl utils, fixture restore

This commit is contained in:
pyr0ball 2026-04-08 14:07:09 -07:00
parent 597ffc7324
commit b330e84111
3 changed files with 54 additions and 34 deletions

View file

@ -9,13 +9,17 @@ set_sft_config_dir() in test fixtures.
""" """
from __future__ import annotations from __future__ import annotations
import json import logging
from pathlib import Path from pathlib import Path
import yaml import yaml
from fastapi import APIRouter, HTTPException from fastapi import APIRouter, HTTPException
from pydantic import BaseModel from pydantic import BaseModel
from app.utils import append_jsonl, read_jsonl, write_jsonl
logger = logging.getLogger(__name__)
_ROOT = Path(__file__).parent.parent _ROOT = Path(__file__).parent.parent
_SFT_DATA_DIR: Path = _ROOT / "data" _SFT_DATA_DIR: Path = _ROOT / "data"
_SFT_CONFIG_DIR: Path | None = None _SFT_CONFIG_DIR: Path | None = None
@ -47,7 +51,11 @@ def _get_bench_results_dir() -> Path:
f = _config_file() f = _config_file()
if not f.exists(): if not f.exists():
return Path("/nonexistent-bench-results") return Path("/nonexistent-bench-results")
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {} try:
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
except yaml.YAMLError as exc:
logger.warning("Failed to parse SFT config %s: %s", f, exc)
return Path("/nonexistent-bench-results")
d = raw.get("sft", {}).get("bench_results_dir", "") d = raw.get("sft", {}).get("bench_results_dir", "")
return Path(d) if d else Path("/nonexistent-bench-results") return Path(d) if d else Path("/nonexistent-bench-results")
@ -60,39 +68,12 @@ def _approved_file() -> Path:
return _SFT_DATA_DIR / "sft_approved.jsonl" return _SFT_DATA_DIR / "sft_approved.jsonl"
def _read_jsonl(path: Path) -> list[dict]:
if not path.exists():
return []
records: list[dict] = []
for line in path.read_text(encoding="utf-8").splitlines():
line = line.strip()
if not line:
continue
try:
records.append(json.loads(line))
except json.JSONDecodeError:
pass
return records
def _write_jsonl(path: Path, records: list[dict]) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
content = "\n".join(json.dumps(r) for r in records)
path.write_text(content + ("\n" if records else ""), encoding="utf-8")
def _append_jsonl(path: Path, record: dict) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "a", encoding="utf-8") as fh:
fh.write(json.dumps(record) + "\n")
def _read_candidates() -> list[dict]: def _read_candidates() -> list[dict]:
return _read_jsonl(_candidates_file()) return read_jsonl(_candidates_file())
def _write_candidates(records: list[dict]) -> None: def _write_candidates(records: list[dict]) -> None:
_write_jsonl(_candidates_file(), records) write_jsonl(_candidates_file(), records)
# ── GET /runs ────────────────────────────────────────────────────────────── # ── GET /runs ──────────────────────────────────────────────────────────────
@ -103,7 +84,12 @@ def get_runs():
from scripts.sft_import import discover_runs from scripts.sft_import import discover_runs
bench_dir = _get_bench_results_dir() bench_dir = _get_bench_results_dir()
existing = _read_candidates() existing = _read_candidates()
imported_run_ids = {r.get("benchmark_run_id") for r in existing} # benchmark_run_id in each record equals the run's directory name by cf-orch convention
imported_run_ids = {
r["benchmark_run_id"]
for r in existing
if r.get("benchmark_run_id") is not None
}
runs = discover_runs(bench_dir) runs = discover_runs(bench_dir)
return [ return [
{ {

View file

@ -5,8 +5,10 @@ These are reused by the FastAPI backend and the test suite.
""" """
from __future__ import annotations from __future__ import annotations
import json
import re import re
from html.parser import HTMLParser from html.parser import HTMLParser
from pathlib import Path
from typing import Any from typing import Any
@ -83,3 +85,33 @@ def extract_body(msg: Any) -> str:
except Exception: except Exception:
pass pass
return "" return ""
def read_jsonl(path: Path) -> list[dict]:
    """Read a JSONL file and return its valid object records.

    Blank lines, lines that are not valid JSON, and lines whose JSON value is
    not an object (for example a bare list, number, or string) are skipped,
    so every returned element is a ``dict`` as the return annotation states.

    Args:
        path: Location of the JSONL file.

    Returns:
        The decoded records in file order, or an empty list when ``path``
        does not exist.
    """
    if not path.exists():
        return []
    records: list[dict] = []
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line:
            continue
        try:
            value = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort read: one corrupt line must not discard the rest.
            continue
        # Valid JSON is not necessarily an object; keep only dict records so
        # the result matches the declared ``list[dict]`` return type.
        if isinstance(value, dict):
            records.append(value)
    return records
def write_jsonl(path: Path, records: list[dict]) -> None:
    """Serialize ``records`` to ``path`` as JSONL, replacing prior content.

    Missing parent directories are created. Each record becomes one
    newline-terminated line; an empty ``records`` list yields an empty file.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    # One "<json>\n" chunk per record gives the same text as joining the
    # lines with "\n" and appending a final newline when there is any record.
    payload = "".join(f"{json.dumps(entry)}\n" for entry in records)
    path.write_text(payload, encoding="utf-8")
def append_jsonl(path: Path, record: dict) -> None:
    """Add ``record`` as one JSON line at the end of ``path``.

    Missing parent directories, and the file itself, are created on demand.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("a", encoding="utf-8") as sink:
        serialized = json.dumps(record)
        sink.write(f"{serialized}\n")

View file

@ -8,11 +8,13 @@ from pathlib import Path
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def reset_sft_globals(tmp_path): def reset_sft_globals(tmp_path):
from app import sft as sft_module from app import sft as sft_module
_prev_data = sft_module._SFT_DATA_DIR
_prev_cfg = sft_module._SFT_CONFIG_DIR
sft_module.set_sft_data_dir(tmp_path) sft_module.set_sft_data_dir(tmp_path)
sft_module.set_sft_config_dir(tmp_path) sft_module.set_sft_config_dir(tmp_path)
yield yield
sft_module.set_sft_data_dir(Path(__file__).parent.parent / "data") sft_module.set_sft_data_dir(_prev_data)
sft_module.set_sft_config_dir(None) sft_module.set_sft_config_dir(_prev_cfg)
@pytest.fixture @pytest.fixture