fix: log warning when sft record is missing id field

This commit is contained in:
pyr0ball 2026-04-08 07:30:46 -07:00
parent 03dac57fd9
commit bbfae1a622
2 changed files with 14 additions and 4 deletions

View file

@@ -6,8 +6,11 @@ Used by app/sft.py endpoints and can be run standalone.
from __future__ import annotations
import json
import logging
from pathlib import Path
logger = logging.getLogger(__name__)
_CANDIDATES_FILENAME = "sft_candidates.jsonl"
@@ -56,6 +59,7 @@ def import_run(sft_path: Path, data_dir: Path) -> dict[str, int]:
skipped = 0
for record in _read_jsonl(sft_path):
if "id" not in record:
logger.warning("Skipping record missing 'id' field in %s", sft_path)
continue # malformed — skip without crashing
if record["id"] in existing_ids:
skipped += 1

View file

@@ -80,10 +80,16 @@ def test_import_run_deduplicates_on_id(tmp_path):
assert len(lines) == 2 # no duplicates
def test_import_run_skips_records_missing_id(tmp_path):
def test_import_run_skips_records_missing_id(tmp_path, caplog):
import logging
from scripts.sft_import import import_run
sft_path = tmp_path / "run1" / "sft_candidates.jsonl"
bad = {"source": "cf-orch-benchmark", "status": "needs_review"} # no id
_write_candidates(sft_path, [bad, _make_record("a")])
sft_path.parent.mkdir()
sft_path.write_text(
json.dumps({"model_response": "bad", "status": "needs_review"}) + "\n"
+ json.dumps({"id": "abc123", "model_response": "good", "status": "needs_review"}) + "\n"
)
with caplog.at_level(logging.WARNING, logger="scripts.sft_import"):
result = import_run(sft_path, tmp_path)
assert result == {"imported": 1, "skipped": 0}
assert "missing 'id'" in caplog.text