fix: log on malformed json in _read_jsonl, use streaming id dedup

This commit is contained in:
pyr0ball 2026-04-08 07:37:22 -07:00
parent bbfae1a622
commit cfde474454

View file

@ -52,8 +52,7 @@ def import_run(sft_path: Path, data_dir: Path) -> dict[str, int]:
Returns {imported: N, skipped: M}. Returns {imported: N, skipped: M}.
""" """
dest = data_dir / _CANDIDATES_FILENAME dest = data_dir / _CANDIDATES_FILENAME
existing = _read_jsonl(dest) existing_ids = _read_existing_ids(dest)
existing_ids = {r["id"] for r in existing if "id" in r}
new_records: list[dict] = [] new_records: list[dict] = []
skipped = 0 skipped = 0
@ -87,6 +86,25 @@ def _read_jsonl(path: Path) -> list[dict]:
continue continue
try: try:
records.append(json.loads(line)) records.append(json.loads(line))
except json.JSONDecodeError: except json.JSONDecodeError as exc:
pass logger.warning("Skipping malformed JSON line in %s: %s", path, exc)
return records return records
def _read_existing_ids(path: Path) -> set[str]:
    """Collect the ``id`` value of every record in a JSONL file.

    The file is read line by line and only the ``id`` of each parsed record is
    retained, so the full record list is never held in memory.

    Args:
        path: JSONL file to scan.

    Returns:
        The set of ``id`` values found. Empty when the file does not exist.
        Blank lines, lines that are not valid JSON, lines whose JSON value is
        not an object, and objects without an ``id`` key contribute nothing.
    """
    if not path.exists():
        return set()
    ids: set[str] = set()
    with path.open() as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # Keep the try body to the one call that can raise JSONDecodeError.
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue  # corrupt line, skip silently (ids file is our own output)
            # A line can be valid JSON without being an object (42, null,
            # "id", [1, 2]); membership/indexing on those would raise
            # TypeError, so treat them like any other unusable line.
            if isinstance(record, dict) and "id" in record:
                ids.add(record["id"])
    return ids