fix(benchmark): guard empty exemplars, warn on malformed JSON in build_exemplars_from_jsonl
This commit is contained in:
parent
1d4c07e4a0
commit
41584de5df
2 changed files with 25 additions and 2 deletions
|
|
@@ -202,15 +202,20 @@ def build_exemplars_from_jsonl(path: str, k_per_label: int = 10) -> dict[str, li
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
row = json.loads(line)
|
row = json.loads(line)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError as exc:
|
||||||
|
print(f"[build_exemplars] WARN: skipping malformed line: {exc}", flush=True)
|
||||||
continue
|
continue
|
||||||
label = row.get("label")
|
label = row.get("label")
|
||||||
if not label:
|
if not label:
|
||||||
continue
|
continue
|
||||||
|
subject = row.get("subject", "")
|
||||||
|
body = row.get("body", "")
|
||||||
|
if not subject and not body:
|
||||||
|
continue
|
||||||
texts = result.setdefault(label, [])
|
texts = result.setdefault(label, [])
|
||||||
if len(texts) < k_per_label:
|
if len(texts) < k_per_label:
|
||||||
texts.append(
|
texts.append(
|
||||||
f"Subject: {row.get('subject', '')}\n\n{row.get('body', '')[:600]}"
|
f"Subject: {subject}\n\n{body[:600]}"
|
||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@@ -225,3 +225,21 @@ def test_build_exemplars_truncates_body_at_600(tmp_path):
|
||||||
result = build_exemplars_from_jsonl(str(f))
|
result = build_exemplars_from_jsonl(str(f))
|
||||||
body_part = result["neutral"][0].split("\n\n", 1)[1]
|
body_part = result["neutral"][0].split("\n\n", 1)[1]
|
||||||
assert len(body_part) == 600
|
assert len(body_part) == 600
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_exemplars_skips_rows_with_no_content(tmp_path):
|
||||||
|
from scripts.benchmark_classifier import build_exemplars_from_jsonl
|
||||||
|
import json
|
||||||
|
|
||||||
|
rows = [
|
||||||
|
{"label": "neutral"}, # no subject, no body -> skip
|
||||||
|
{"subject": "S", "body": "B", "label": "neutral"}, # valid -> keep
|
||||||
|
{"label": "rejected", "subject": "", "body": ""}, # empty strings -> skip
|
||||||
|
]
|
||||||
|
f = tmp_path / "score.jsonl"
|
||||||
|
lines = [json.dumps(r) for r in rows]
|
||||||
|
f.write_text("\n".join(lines))
|
||||||
|
|
||||||
|
result = build_exemplars_from_jsonl(str(f))
|
||||||
|
assert list(result.keys()) == ["neutral"]
|
||||||
|
assert len(result["neutral"]) == 1
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue