From b330e841110964e6c38e5f64747430da7255271e Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Wed, 8 Apr 2026 14:07:09 -0700 Subject: [PATCH] =?UTF-8?q?fix:=20sft=20router=20=E2=80=94=20yaml=20error?= =?UTF-8?q?=20handling,=20none=20filter,=20shared=20jsonl=20utils,=20fixtu?= =?UTF-8?q?re=20restore?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/sft.py | 50 +++++++++++++++++------------------------------ app/utils.py | 32 ++++++++++++++++++++++++++++++ tests/test_sft.py | 6 ++++-- 3 files changed, 54 insertions(+), 34 deletions(-) diff --git a/app/sft.py b/app/sft.py index 80ed061..1609e80 100644 --- a/app/sft.py +++ b/app/sft.py @@ -9,13 +9,17 @@ set_sft_config_dir() in test fixtures. """ from __future__ import annotations -import json +import logging from pathlib import Path import yaml from fastapi import APIRouter, HTTPException from pydantic import BaseModel +from app.utils import append_jsonl, read_jsonl, write_jsonl + +logger = logging.getLogger(__name__) + _ROOT = Path(__file__).parent.parent _SFT_DATA_DIR: Path = _ROOT / "data" _SFT_CONFIG_DIR: Path | None = None @@ -47,7 +51,11 @@ def _get_bench_results_dir() -> Path: f = _config_file() if not f.exists(): return Path("/nonexistent-bench-results") - raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {} + try: + raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {} + except yaml.YAMLError as exc: + logger.warning("Failed to parse SFT config %s: %s", f, exc) + return Path("/nonexistent-bench-results") d = raw.get("sft", {}).get("bench_results_dir", "") return Path(d) if d else Path("/nonexistent-bench-results") @@ -60,39 +68,12 @@ def _approved_file() -> Path: return _SFT_DATA_DIR / "sft_approved.jsonl" -def _read_jsonl(path: Path) -> list[dict]: - if not path.exists(): - return [] - records: list[dict] = [] - for line in path.read_text(encoding="utf-8").splitlines(): - line = line.strip() - if not line: - continue - try: - records.append(json.loads(line)) - except json.JSONDecodeError: - pass - return records - - -def _write_jsonl(path: Path, records: list[dict]) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - content = "\n".join(json.dumps(r) for r in records) - path.write_text(content + ("\n" if records else ""), encoding="utf-8") - - -def _append_jsonl(path: Path, record: dict) -> None: - path.parent.mkdir(parents=True, exist_ok=True) - with open(path, "a", encoding="utf-8") as fh: - fh.write(json.dumps(record) + "\n") - - def _read_candidates() -> list[dict]: - return _read_jsonl(_candidates_file()) + return read_jsonl(_candidates_file()) def _write_candidates(records: list[dict]) -> None: - _write_jsonl(_candidates_file(), records) + write_jsonl(_candidates_file(), records) # ── GET /runs ────────────────────────────────────────────────────────────── @@ -103,7 +84,12 @@ def get_runs(): from scripts.sft_import import discover_runs bench_dir = _get_bench_results_dir() existing = _read_candidates() - imported_run_ids = {r.get("benchmark_run_id") for r in existing} + # benchmark_run_id in each record equals the run's directory name by cf-orch convention + imported_run_ids = { + r["benchmark_run_id"] + for r in existing + if r.get("benchmark_run_id") is not None + } runs = discover_runs(bench_dir) return [ { diff --git a/app/utils.py b/app/utils.py index a98088e..4b40ddd 100644 --- a/app/utils.py +++ b/app/utils.py @@ -5,8 +5,10 @@ These are reused by the FastAPI backend and the test suite. """ from __future__ import annotations +import json import re from html.parser import HTMLParser +from pathlib import Path from typing import Any @@ -83,3 +85,33 @@ def extract_body(msg: Any) -> str: except Exception: pass return "" + + +def read_jsonl(path: Path) -> list[dict]: + """Read a JSONL file, returning valid records. Skips blank lines and malformed JSON.""" + if not path.exists(): + return [] + records: list[dict] = [] + for line in path.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line: + continue + try: + records.append(json.loads(line)) + except json.JSONDecodeError: + pass + return records + + +def write_jsonl(path: Path, records: list[dict]) -> None: + """Write records to a JSONL file, overwriting any existing content.""" + path.parent.mkdir(parents=True, exist_ok=True) + content = "\n".join(json.dumps(r) for r in records) + path.write_text(content + ("\n" if records else ""), encoding="utf-8") + + +def append_jsonl(path: Path, record: dict) -> None: + """Append a single record to a JSONL file.""" + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "a", encoding="utf-8") as fh: + fh.write(json.dumps(record) + "\n") diff --git a/tests/test_sft.py b/tests/test_sft.py index a66da27..7e8de2f 100644 --- a/tests/test_sft.py +++ b/tests/test_sft.py @@ -8,11 +8,13 @@ from pathlib import Path @pytest.fixture(autouse=True) def reset_sft_globals(tmp_path): from app import sft as sft_module + _prev_data = sft_module._SFT_DATA_DIR + _prev_cfg = sft_module._SFT_CONFIG_DIR sft_module.set_sft_data_dir(tmp_path) sft_module.set_sft_config_dir(tmp_path) yield - sft_module.set_sft_data_dir(Path(__file__).parent.parent / "data") - sft_module.set_sft_config_dir(None) + sft_module.set_sft_data_dir(_prev_data) + sft_module.set_sft_config_dir(_prev_cfg) @pytest.fixture