fix: log warning when sft record is missing id field
This commit is contained in:
parent
03dac57fd9
commit
bbfae1a622
2 changed files with 14 additions and 4 deletions
|
|
@@ -6,8 +6,11 @@ Used by app/sft.py endpoints and can be run standalone.
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_CANDIDATES_FILENAME = "sft_candidates.jsonl"
|
_CANDIDATES_FILENAME = "sft_candidates.jsonl"
|
||||||
|
|
||||||
|
|
||||||
|
|
@@ -56,6 +59,7 @@ def import_run(sft_path: Path, data_dir: Path) -> dict[str, int]:
|
||||||
skipped = 0
|
skipped = 0
|
||||||
for record in _read_jsonl(sft_path):
|
for record in _read_jsonl(sft_path):
|
||||||
if "id" not in record:
|
if "id" not in record:
|
||||||
|
logger.warning("Skipping record missing 'id' field in %s", sft_path)
|
||||||
continue # malformed — skip without crashing
|
continue # malformed — skip without crashing
|
||||||
if record["id"] in existing_ids:
|
if record["id"] in existing_ids:
|
||||||
skipped += 1
|
skipped += 1
|
||||||
|
|
|
||||||
|
|
@@ -80,10 +80,16 @@ def test_import_run_deduplicates_on_id(tmp_path):
|
||||||
assert len(lines) == 2 # no duplicates
|
assert len(lines) == 2 # no duplicates
|
||||||
|
|
||||||
|
|
||||||
def test_import_run_skips_records_missing_id(tmp_path):
|
def test_import_run_skips_records_missing_id(tmp_path, caplog):
|
||||||
|
import logging
|
||||||
from scripts.sft_import import import_run
|
from scripts.sft_import import import_run
|
||||||
sft_path = tmp_path / "run1" / "sft_candidates.jsonl"
|
sft_path = tmp_path / "run1" / "sft_candidates.jsonl"
|
||||||
bad = {"source": "cf-orch-benchmark", "status": "needs_review"} # no id
|
sft_path.parent.mkdir()
|
||||||
_write_candidates(sft_path, [bad, _make_record("a")])
|
sft_path.write_text(
|
||||||
|
json.dumps({"model_response": "bad", "status": "needs_review"}) + "\n"
|
||||||
|
+ json.dumps({"id": "abc123", "model_response": "good", "status": "needs_review"}) + "\n"
|
||||||
|
)
|
||||||
|
with caplog.at_level(logging.WARNING, logger="scripts.sft_import"):
|
||||||
result = import_run(sft_path, tmp_path)
|
result = import_run(sft_path, tmp_path)
|
||||||
assert result == {"imported": 1, "skipped": 0}
|
assert result == {"imported": 1, "skipped": 0}
|
||||||
|
assert "missing 'id'" in caplog.text
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue