"""Avocet — SFT candidate run discovery and JSONL import.
|
|
|
|
No FastAPI dependency — pure Python file operations.
|
|
Used by app/sft.py endpoints and can be run standalone.
|
|
"""

from __future__ import annotations

import json
import logging
from pathlib import Path

logger = logging.getLogger(__name__)

_CANDIDATES_FILENAME = "sft_candidates.jsonl"


def discover_runs(bench_results_dir: Path) -> list[dict]:
|
|
"""Return one entry per run subdirectory that contains sft_candidates.jsonl.
|
|
|
|
Sorted newest-first by directory name (directories are named YYYY-MM-DD-HHMMSS
|
|
by the cf-orch benchmark harness, so lexicographic order is chronological).
|
|
|
|
Each entry: {run_id, timestamp, candidate_count, sft_path}
|
|
"""
|
|
if not bench_results_dir.exists() or not bench_results_dir.is_dir():
|
|
return []
|
|
runs = []
|
|
for subdir in bench_results_dir.iterdir():
|
|
if not subdir.is_dir():
|
|
continue
|
|
sft_path = subdir / _CANDIDATES_FILENAME
|
|
if not sft_path.exists():
|
|
continue
|
|
records = _read_jsonl(sft_path)
|
|
runs.append({
|
|
"run_id": subdir.name,
|
|
"timestamp": subdir.name,
|
|
"candidate_count": len(records),
|
|
"sft_path": sft_path,
|
|
})
|
|
runs.sort(key=lambda r: r["run_id"], reverse=True)
|
|
return runs


def import_run(sft_path: Path, data_dir: Path) -> dict[str, int]:
    """Append records from sft_path into data_dir/sft_candidates.jsonl.

    Deduplicates on the `id` field — records whose id already exists in the
    destination file are skipped silently. Records that are not JSON objects
    or are missing an `id` field are also skipped with a warning (malformed
    input from a partial benchmark write).

    Returns {imported: N, skipped: M}, where M counts only duplicate ids.
    """
    dest = data_dir / _CANDIDATES_FILENAME
    existing_ids = _read_existing_ids(dest)

    new_records: list[dict] = []
    skipped = 0
    for record in _read_jsonl(sft_path):
        # A valid JSON line may decode to a non-dict (e.g. a bare string).
        # On a string, `"id" not in record` is a substring test, and
        # record["id"] below would raise an uncaught TypeError — so require
        # a dict before touching keys.
        if not isinstance(record, dict) or "id" not in record:
            logger.warning("Skipping record missing 'id' field in %s", sft_path)
            continue  # malformed — skip without crashing
        if record["id"] in existing_ids:
            skipped += 1
            continue
        new_records.append(record)
        # Track the id immediately so duplicates *within* sft_path are
        # also collapsed, not just duplicates against the destination file.
        existing_ids.add(record["id"])

    if new_records:
        dest.parent.mkdir(parents=True, exist_ok=True)
        with open(dest, "a", encoding="utf-8") as fh:
            for r in new_records:
                fh.write(json.dumps(r) + "\n")

    return {"imported": len(new_records), "skipped": skipped}


def _read_jsonl(path: Path) -> list[dict]:
|
|
"""Read a JSONL file, returning valid records. Skips blank lines and malformed JSON."""
|
|
if not path.exists():
|
|
return []
|
|
records: list[dict] = []
|
|
for line in path.read_text(encoding="utf-8").splitlines():
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
try:
|
|
records.append(json.loads(line))
|
|
except json.JSONDecodeError as exc:
|
|
logger.warning("Skipping malformed JSON line in %s: %s", path, exc)
|
|
return records


def _read_existing_ids(path: Path) -> set[str]:
|
|
"""Read only the id field from each line of a JSONL file."""
|
|
if not path.exists():
|
|
return set()
|
|
ids: set[str] = set()
|
|
with path.open() as f:
|
|
for line in f:
|
|
line = line.strip()
|
|
if not line:
|
|
continue
|
|
try:
|
|
record = json.loads(line)
|
|
if "id" in record:
|
|
ids.add(record["id"])
|
|
except json.JSONDecodeError:
|
|
pass # corrupt line, skip silently (ids file is our own output)
|
|
return ids