fix: log warning when sft record is missing id field
This commit is contained in:
parent
03dac57fd9
commit
bbfae1a622
2 changed files with 14 additions and 4 deletions
|
|
@@ -6,8 +6,11 @@ Used by app/sft.py endpoints and can be run standalone.
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_CANDIDATES_FILENAME = "sft_candidates.jsonl"
|
||||
|
||||
|
||||
|
|
@@ -56,6 +59,7 @@ def import_run(sft_path: Path, data_dir: Path) -> dict[str, int]:
|
|||
skipped = 0
|
||||
for record in _read_jsonl(sft_path):
|
||||
if "id" not in record:
|
||||
logger.warning("Skipping record missing 'id' field in %s", sft_path)
|
||||
continue # malformed — skip without crashing
|
||||
if record["id"] in existing_ids:
|
||||
skipped += 1
|
||||
|
|
|
|||
|
|
@@ -80,10 +80,16 @@ def test_import_run_deduplicates_on_id(tmp_path):
|
|||
assert len(lines) == 2 # no duplicates
|
||||
|
||||
|
||||
def test_import_run_skips_records_missing_id(tmp_path):
|
||||
def test_import_run_skips_records_missing_id(tmp_path, caplog):
|
||||
import logging
|
||||
from scripts.sft_import import import_run
|
||||
sft_path = tmp_path / "run1" / "sft_candidates.jsonl"
|
||||
bad = {"source": "cf-orch-benchmark", "status": "needs_review"} # no id
|
||||
_write_candidates(sft_path, [bad, _make_record("a")])
|
||||
sft_path.parent.mkdir()
|
||||
sft_path.write_text(
|
||||
json.dumps({"model_response": "bad", "status": "needs_review"}) + "\n"
|
||||
+ json.dumps({"id": "abc123", "model_response": "good", "status": "needs_review"}) + "\n"
|
||||
)
|
||||
with caplog.at_level(logging.WARNING, logger="scripts.sft_import"):
|
||||
result = import_run(sft_path, tmp_path)
|
||||
assert result == {"imported": 1, "skipped": 0}
|
||||
assert "missing 'id'" in caplog.text
|
||||
|
|
|
|||
Loading…
Reference in a new issue