From b330e841110964e6c38e5f64747430da7255271e Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Wed, 8 Apr 2026 14:07:09 -0700
Subject: [PATCH] =?UTF-8?q?fix:=20sft=20router=20=E2=80=94=20yaml=20error?=
 =?UTF-8?q?=20handling,=20none=20filter,=20shared=20jsonl=20utils,=20fixtu?=
 =?UTF-8?q?re=20restore?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/sft.py        | 50 +++++++++++++++++------------------------------
 app/utils.py      | 32 ++++++++++++++++++++++++++++++
 tests/test_sft.py |  6 ++++--
 3 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/app/sft.py b/app/sft.py
index 80ed061..1609e80 100644
--- a/app/sft.py
+++ b/app/sft.py
@@ -9,13 +9,17 @@ set_sft_config_dir() in test fixtures.
 """
 from __future__ import annotations
 
-import json
+import logging
 from pathlib import Path
 
 import yaml
 from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
 
+from app.utils import append_jsonl, read_jsonl, write_jsonl
+
+logger = logging.getLogger(__name__)
+
 _ROOT = Path(__file__).parent.parent
 _SFT_DATA_DIR: Path = _ROOT / "data"
 _SFT_CONFIG_DIR: Path | None = None
@@ -47,7 +51,11 @@ def _get_bench_results_dir() -> Path:
     f = _config_file()
     if not f.exists():
         return Path("/nonexistent-bench-results")
-    raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
+    try:
+        raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
+    except yaml.YAMLError as exc:
+        logger.warning("Failed to parse SFT config %s: %s", f, exc)
+        return Path("/nonexistent-bench-results")
     d = raw.get("sft", {}).get("bench_results_dir", "")
     return Path(d) if d else Path("/nonexistent-bench-results")
 
@@ -60,39 +68,12 @@ def _approved_file() -> Path:
     return _SFT_DATA_DIR / "sft_approved.jsonl"
 
 
-def _read_jsonl(path: Path) -> list[dict]:
-    if not path.exists():
-        return []
-    records: list[dict] = []
-    for line in path.read_text(encoding="utf-8").splitlines():
-        line = line.strip()
-        if not line:
-            continue
-        try:
-            records.append(json.loads(line))
-        except json.JSONDecodeError:
-            pass
-    return records
-
-
-def _write_jsonl(path: Path, records: list[dict]) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    content = "\n".join(json.dumps(r) for r in records)
-    path.write_text(content + ("\n" if records else ""), encoding="utf-8")
-
-
-def _append_jsonl(path: Path, record: dict) -> None:
-    path.parent.mkdir(parents=True, exist_ok=True)
-    with open(path, "a", encoding="utf-8") as fh:
-        fh.write(json.dumps(record) + "\n")
-
-
 def _read_candidates() -> list[dict]:
-    return _read_jsonl(_candidates_file())
+    return read_jsonl(_candidates_file())
 
 
 def _write_candidates(records: list[dict]) -> None:
-    _write_jsonl(_candidates_file(), records)
+    write_jsonl(_candidates_file(), records)
 
 
 # ── GET /runs ──────────────────────────────────────────────────────────────
@@ -103,7 +84,12 @@ def get_runs():
     from scripts.sft_import import discover_runs
     bench_dir = _get_bench_results_dir()
     existing = _read_candidates()
-    imported_run_ids = {r.get("benchmark_run_id") for r in existing}
+    # benchmark_run_id in each record equals the run's directory name by cf-orch convention
+    imported_run_ids = {
+        r["benchmark_run_id"]
+        for r in existing
+        if r.get("benchmark_run_id") is not None
+    }
     runs = discover_runs(bench_dir)
     return [
         {
diff --git a/app/utils.py b/app/utils.py
index a98088e..4b40ddd 100644
--- a/app/utils.py
+++ b/app/utils.py
@@ -5,8 +5,10 @@ These are reused by the FastAPI backend and the test suite.
 """
 from __future__ import annotations
 
+import json
 import re
 from html.parser import HTMLParser
+from pathlib import Path
 from typing import Any
 
 
@@ -83,3 +85,33 @@ def extract_body(msg: Any) -> str:
         except Exception:
             pass
     return ""
+
+
+def read_jsonl(path: Path) -> list[dict]:
+    """Return the JSON values parsed from a JSONL file ([] if the file is missing); blank and malformed lines are skipped."""
+    if not path.exists():
+        return []
+    records: list[dict] = []
+    for line in path.read_text(encoding="utf-8").splitlines():  # whole file is read into memory before splitting
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            records.append(json.loads(line))  # NOTE(review): any JSON value is kept, not only objects — confirm callers expect dicts
+        except json.JSONDecodeError:
+            pass  # best-effort read: a corrupt line is dropped silently instead of failing the whole file
+    return records
+
+
+def write_jsonl(path: Path, records: list[dict]) -> None:
+    """Write records to a JSONL file (one JSON document per line), replacing existing content and creating parent dirs."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    content = "\n".join(json.dumps(r) for r in records)
+    path.write_text(content + ("\n" if records else ""), encoding="utf-8")  # trailing newline only when non-empty, so [] yields an empty file
+
+
+def append_jsonl(path: Path, record: dict) -> None:
+    """Append a single record as one JSON line, creating the file and its parent directories if needed."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with open(path, "a", encoding="utf-8") as fh:
+        fh.write(json.dumps(record) + "\n")  # always newline-terminated so successive appends stay one record per line
diff --git a/tests/test_sft.py b/tests/test_sft.py
index a66da27..7e8de2f 100644
--- a/tests/test_sft.py
+++ b/tests/test_sft.py
@@ -8,11 +8,13 @@ from pathlib import Path
 @pytest.fixture(autouse=True)
 def reset_sft_globals(tmp_path):
     from app import sft as sft_module
+    _prev_data = sft_module._SFT_DATA_DIR
+    _prev_cfg = sft_module._SFT_CONFIG_DIR
     sft_module.set_sft_data_dir(tmp_path)
     sft_module.set_sft_config_dir(tmp_path)
     yield
-    sft_module.set_sft_data_dir(Path(__file__).parent.parent / "data")
-    sft_module.set_sft_config_dir(None)
+    sft_module.set_sft_data_dir(_prev_data)
+    sft_module.set_sft_config_dir(_prev_cfg)
 
 
 @pytest.fixture