fix(benchmark): guard empty exemplars, warn on malformed JSON in build_exemplars_from_jsonl
This commit is contained in:
parent
1d4c07e4a0
commit
41584de5df
2 changed files with 25 additions and 2 deletions
|
|
@@ -202,15 +202,20 @@ def build_exemplars_from_jsonl(path: str, k_per_label: int = 10) -> dict[str, li
|
|||
continue
|
||||
try:
|
||||
row = json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
except json.JSONDecodeError as exc:
|
||||
print(f"[build_exemplars] WARN: skipping malformed line: {exc}", flush=True)
|
||||
continue
|
||||
label = row.get("label")
|
||||
if not label:
|
||||
continue
|
||||
subject = row.get("subject", "")
|
||||
body = row.get("body", "")
|
||||
if not subject and not body:
|
||||
continue
|
||||
texts = result.setdefault(label, [])
|
||||
if len(texts) < k_per_label:
|
||||
texts.append(
|
||||
f"Subject: {row.get('subject', '')}\n\n{row.get('body', '')[:600]}"
|
||||
f"Subject: {subject}\n\n{body[:600]}"
|
||||
)
|
||||
return result
|
||||
|
||||
|
|
|
|||
|
|
@@ -225,3 +225,21 @@ def test_build_exemplars_truncates_body_at_600(tmp_path):
|
|||
result = build_exemplars_from_jsonl(str(f))
|
||||
body_part = result["neutral"][0].split("\n\n", 1)[1]
|
||||
assert len(body_part) == 600
|
||||
|
||||
|
||||
def test_build_exemplars_skips_rows_with_no_content(tmp_path):
    """Rows lacking both subject and body must not yield exemplars.

    Writes three labelled rows to a JSONL file: one with neither field, one
    fully populated, and one whose subject and body are empty strings. Only
    the populated row is expected to appear in the result, so the label that
    occurs solely on an empty row must be absent altogether.
    """
    import json
    from scripts.benchmark_classifier import build_exemplars_from_jsonl

    # Row with neither a subject nor a body key -> expected to be skipped.
    missing_fields = {"label": "neutral"}
    # Fully populated row -> expected to be kept.
    populated = {"subject": "S", "body": "B", "label": "neutral"}
    # Both fields present but empty -> expected to be skipped.
    blank_fields = {"label": "rejected", "subject": "", "body": ""}

    jsonl_path = tmp_path / "score.jsonl"
    payload = "\n".join(
        json.dumps(record)
        for record in (missing_fields, populated, blank_fields)
    )
    jsonl_path.write_text(payload)

    exemplars = build_exemplars_from_jsonl(str(jsonl_path))

    # "rejected" only ever appeared on an empty row, so it must not be a key.
    assert list(exemplars) == ["neutral"]
    assert len(exemplars["neutral"]) == 1
|
||||
|
|
|
|||
Loading…
Reference in a new issue