diff --git a/scripts/sft_import.py b/scripts/sft_import.py index 4e2ee3d..771ccdb 100644 --- a/scripts/sft_import.py +++ b/scripts/sft_import.py @@ -6,8 +6,11 @@ Used by app/sft.py endpoints and can be run standalone. from __future__ import annotations import json +import logging from pathlib import Path +logger = logging.getLogger(__name__) + _CANDIDATES_FILENAME = "sft_candidates.jsonl" @@ -56,6 +59,7 @@ def import_run(sft_path: Path, data_dir: Path) -> dict[str, int]: skipped = 0 for record in _read_jsonl(sft_path): if "id" not in record: + logger.warning("Skipping record missing 'id' field in %s", sft_path) continue # malformed — skip without crashing if record["id"] in existing_ids: skipped += 1 diff --git a/tests/test_sft_import.py b/tests/test_sft_import.py index cc1bb33..c63fb3e 100644 --- a/tests/test_sft_import.py +++ b/tests/test_sft_import.py @@ -80,10 +80,16 @@ def test_import_run_deduplicates_on_id(tmp_path): assert len(lines) == 2 # no duplicates -def test_import_run_skips_records_missing_id(tmp_path): +def test_import_run_skips_records_missing_id(tmp_path, caplog): + import logging from scripts.sft_import import import_run sft_path = tmp_path / "run1" / "sft_candidates.jsonl" - bad = {"source": "cf-orch-benchmark", "status": "needs_review"} # no id - _write_candidates(sft_path, [bad, _make_record("a")]) - result = import_run(sft_path, tmp_path) + sft_path.parent.mkdir() + sft_path.write_text( + json.dumps({"model_response": "bad", "status": "needs_review"}) + "\n" + + json.dumps({"id": "abc123", "model_response": "good", "status": "needs_review"}) + "\n" + ) + with caplog.at_level(logging.WARNING, logger="scripts.sft_import"): + result = import_run(sft_path, tmp_path) assert result == {"imported": 1, "skipped": 0} + assert "missing 'id'" in caplog.text