fix(benchmark): guard empty exemplars, warn on malformed JSON in build_exemplars_from_jsonl

This commit is contained in:
pyr0ball 2026-05-05 12:41:46 -07:00
parent 1d4c07e4a0
commit 41584de5df
2 changed files with 25 additions and 2 deletions

View file

@ -202,15 +202,20 @@ def build_exemplars_from_jsonl(path: str, k_per_label: int = 10) -> dict[str, li
continue
try:
row = json.loads(line)
except json.JSONDecodeError:
except json.JSONDecodeError as exc:
print(f"[build_exemplars] WARN: skipping malformed line: {exc}", flush=True)
continue
label = row.get("label")
if not label:
continue
subject = row.get("subject", "")
body = row.get("body", "")
if not subject and not body:
continue
texts = result.setdefault(label, [])
if len(texts) < k_per_label:
texts.append(
f"Subject: {row.get('subject', '')}\n\n{row.get('body', '')[:600]}"
f"Subject: {subject}\n\n{body[:600]}"
)
return result

View file

@ -225,3 +225,21 @@ def test_build_exemplars_truncates_body_at_600(tmp_path):
result = build_exemplars_from_jsonl(str(f))
body_part = result["neutral"][0].split("\n\n", 1)[1]
assert len(body_part) == 600
def test_build_exemplars_skips_rows_with_no_content(tmp_path):
    """Check that rows with neither subject nor body produce no exemplar.

    Three labelled rows are written to a JSONL file: one missing both text
    fields, one fully populated, and one whose text fields are empty strings.
    Only the populated row is expected to survive, and the label whose only
    row was empty is expected to be absent from the result altogether.
    """
    from scripts.benchmark_classifier import build_exemplars_from_jsonl
    import json

    # Fields absent entirely -> expected to be skipped.
    missing_text = {"label": "neutral"}
    # Both fields populated -> expected to be kept.
    populated = {"subject": "S", "body": "B", "label": "neutral"}
    # Fields present but empty -> expected to be skipped.
    blank_text = {"label": "rejected", "subject": "", "body": ""}

    jsonl_path = tmp_path / "score.jsonl"
    serialized = "\n".join(
        json.dumps(entry) for entry in (missing_text, populated, blank_text)
    )
    jsonl_path.write_text(serialized)

    exemplars = build_exemplars_from_jsonl(str(jsonl_path))

    # "rejected" must not appear: its only row carried no content.
    assert list(exemplars.keys()) == ["neutral"]
    assert len(exemplars["neutral"]) == 1