fix: sft router — yaml error handling, none filter, shared jsonl utils, fixture restore
This commit is contained in:
parent
597ffc7324
commit
b330e84111
3 changed files with 54 additions and 34 deletions
48
app/sft.py
48
app/sft.py
|
|
@ -9,13 +9,17 @@ set_sft_config_dir() in test fixtures.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
from fastapi import APIRouter, HTTPException
|
from fastapi import APIRouter, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from app.utils import append_jsonl, read_jsonl, write_jsonl
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_ROOT = Path(__file__).parent.parent
|
_ROOT = Path(__file__).parent.parent
|
||||||
_SFT_DATA_DIR: Path = _ROOT / "data"
|
_SFT_DATA_DIR: Path = _ROOT / "data"
|
||||||
_SFT_CONFIG_DIR: Path | None = None
|
_SFT_CONFIG_DIR: Path | None = None
|
||||||
|
|
@ -47,7 +51,11 @@ def _get_bench_results_dir() -> Path:
|
||||||
f = _config_file()
|
f = _config_file()
|
||||||
if not f.exists():
|
if not f.exists():
|
||||||
return Path("/nonexistent-bench-results")
|
return Path("/nonexistent-bench-results")
|
||||||
|
try:
|
||||||
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
|
raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
|
||||||
|
except yaml.YAMLError as exc:
|
||||||
|
logger.warning("Failed to parse SFT config %s: %s", f, exc)
|
||||||
|
return Path("/nonexistent-bench-results")
|
||||||
d = raw.get("sft", {}).get("bench_results_dir", "")
|
d = raw.get("sft", {}).get("bench_results_dir", "")
|
||||||
return Path(d) if d else Path("/nonexistent-bench-results")
|
return Path(d) if d else Path("/nonexistent-bench-results")
|
||||||
|
|
||||||
|
|
@ -60,39 +68,12 @@ def _approved_file() -> Path:
|
||||||
return _SFT_DATA_DIR / "sft_approved.jsonl"
|
return _SFT_DATA_DIR / "sft_approved.jsonl"
|
||||||
|
|
||||||
|
|
||||||
def _read_jsonl(path: Path) -> list[dict]:
|
|
||||||
if not path.exists():
|
|
||||||
return []
|
|
||||||
records: list[dict] = []
|
|
||||||
for line in path.read_text(encoding="utf-8").splitlines():
|
|
||||||
line = line.strip()
|
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
records.append(json.loads(line))
|
|
||||||
except json.JSONDecodeError:
|
|
||||||
pass
|
|
||||||
return records
|
|
||||||
|
|
||||||
|
|
||||||
def _write_jsonl(path: Path, records: list[dict]) -> None:
|
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
content = "\n".join(json.dumps(r) for r in records)
|
|
||||||
path.write_text(content + ("\n" if records else ""), encoding="utf-8")
|
|
||||||
|
|
||||||
|
|
||||||
def _append_jsonl(path: Path, record: dict) -> None:
|
|
||||||
path.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
with open(path, "a", encoding="utf-8") as fh:
|
|
||||||
fh.write(json.dumps(record) + "\n")
|
|
||||||
|
|
||||||
|
|
||||||
def _read_candidates() -> list[dict]:
    """Load all SFT candidate records from the candidates JSONL file.

    Delegates to the shared ``read_jsonl`` helper from ``app.utils`` (imported
    at module top) instead of a module-private duplicate, so JSONL parsing
    behavior stays consistent across the backend.
    """
    return read_jsonl(_candidates_file())
|
||||||
|
|
||||||
|
|
||||||
def _write_candidates(records: list[dict]) -> None:
    """Overwrite the candidates JSONL file with *records*.

    Delegates to the shared ``write_jsonl`` helper from ``app.utils`` (imported
    at module top) instead of a module-private duplicate, so JSONL writing
    behavior stays consistent across the backend.
    """
    write_jsonl(_candidates_file(), records)
|
||||||
|
|
||||||
|
|
||||||
# ── GET /runs ──────────────────────────────────────────────────────────────
|
# ── GET /runs ──────────────────────────────────────────────────────────────
|
||||||
|
|
@ -103,7 +84,12 @@ def get_runs():
|
||||||
from scripts.sft_import import discover_runs
|
from scripts.sft_import import discover_runs
|
||||||
bench_dir = _get_bench_results_dir()
|
bench_dir = _get_bench_results_dir()
|
||||||
existing = _read_candidates()
|
existing = _read_candidates()
|
||||||
imported_run_ids = {r.get("benchmark_run_id") for r in existing}
|
# benchmark_run_id in each record equals the run's directory name by cf-orch convention
|
||||||
|
imported_run_ids = {
|
||||||
|
r["benchmark_run_id"]
|
||||||
|
for r in existing
|
||||||
|
if r.get("benchmark_run_id") is not None
|
||||||
|
}
|
||||||
runs = discover_runs(bench_dir)
|
runs = discover_runs(bench_dir)
|
||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
|
|
|
||||||
32
app/utils.py
32
app/utils.py
|
|
@ -5,8 +5,10 @@ These are reused by the FastAPI backend and the test suite.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
import re
|
import re
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -83,3 +85,33 @@ def extract_body(msg: Any) -> str:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def read_jsonl(path: Path) -> list[dict]:
    """Read a JSONL file, returning valid records.

    Skips blank lines and malformed JSON (best-effort); a missing file
    yields an empty list.
    """
    if not path.exists():
        return []
    out: list[dict] = []
    with path.open(encoding="utf-8") as fh:
        for raw in fh:
            candidate = raw.strip()
            if not candidate:
                continue
            try:
                out.append(json.loads(candidate))
            except json.JSONDecodeError:
                continue
    return out
|
||||||
|
|
||||||
|
|
||||||
|
def write_jsonl(path: Path, records: list[dict]) -> None:
    """Write records to a JSONL file, overwriting any existing content.

    Creates parent directories as needed; an empty list yields an empty file
    with no trailing newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json.dumps(rec) for rec in records]
    payload = "\n".join(lines)
    if lines:
        payload += "\n"
    path.write_text(payload, encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def append_jsonl(path: Path, record: dict) -> None:
    """Append a single record to a JSONL file, creating parent dirs as needed."""
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(record)
    with path.open("a", encoding="utf-8") as fh:
        fh.write(serialized + "\n")
|
||||||
|
|
|
||||||
|
|
@ -8,11 +8,13 @@ from pathlib import Path
|
||||||
@pytest.fixture(autouse=True)
def reset_sft_globals(tmp_path):
    """Redirect the SFT module's data/config dirs to *tmp_path* per test.

    Captures the previous module-level values and restores them on teardown.
    The earlier version re-derived the data dir from ``__file__`` and reset
    the config dir to ``None``, which could leave the module pointing at the
    wrong locations if another fixture had already changed them; restoring
    the captured values is order-safe.
    """
    from app import sft as sft_module

    _prev_data = sft_module._SFT_DATA_DIR
    _prev_cfg = sft_module._SFT_CONFIG_DIR
    sft_module.set_sft_data_dir(tmp_path)
    sft_module.set_sft_config_dir(tmp_path)
    yield
    sft_module.set_sft_data_dir(_prev_data)
    sft_module.set_sft_config_dir(_prev_cfg)
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue