From 36117b35c4bdaedecf7c6ca47243917930a75cac Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Sun, 15 Mar 2026 11:59:13 -0700
Subject: [PATCH] feat(avocet): auto-discover fine-tuned models in benchmark
 harness

---
 scripts/benchmark_classifier.py    | 58 +++++++++++++++++++-----
 tests/test_benchmark_classifier.py | 71 ++++++++++++++++++++++++++++++
 2 files changed, 119 insertions(+), 10 deletions(-)

diff --git a/scripts/benchmark_classifier.py b/scripts/benchmark_classifier.py
index 947f909..c127ed5 100644
--- a/scripts/benchmark_classifier.py
+++ b/scripts/benchmark_classifier.py
@@ -32,10 +32,14 @@ from typing import Any
 
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
+_ROOT = Path(__file__).parent.parent
+_MODELS_DIR = _ROOT / "models"
+
 from scripts.classifier_adapters import (
     LABELS,
     LABEL_DESCRIPTIONS,
     ClassifierAdapter,
+    FineTunedAdapter,
     GLiClassAdapter,
     RerankerAdapter,
     ZeroShotAdapter,
@@ -150,8 +154,48 @@ def load_scoring_jsonl(path: str) -> list[dict[str, str]]:
     return rows
 
 
-def _active_models(include_slow: bool) -> dict[str, dict[str, Any]]:
-    return {k: v for k, v in MODEL_REGISTRY.items() if v["default"] or include_slow}
+def discover_finetuned_models(models_dir: Path | None = None) -> list[dict]:
+    """Scan models/ for subdirs containing training_info.json.
+
+    Returns a list of training_info dicts, each with an added 'model_dir' key.
+    Returns [] silently if models_dir does not exist.
+    """
+    if models_dir is None:
+        models_dir = _MODELS_DIR
+    if not models_dir.exists():
+        return []
+    found = []
+    for sub in models_dir.iterdir():
+        if not sub.is_dir():
+            continue
+        info_path = sub / "training_info.json"
+        if not info_path.exists():
+            continue
+        info = json.loads(info_path.read_text(encoding="utf-8"))
+        info["model_dir"] = str(sub)
+        found.append(info)
+    return found
+
+
+def _active_models(include_slow: bool = False) -> dict[str, dict[str, Any]]:
+    """Return the active model registry, merged with any discovered fine-tuned models."""
+    active: dict[str, dict[str, Any]] = {
+        key: {**entry, "adapter_instance": entry["adapter"](
+            key,
+            entry["model_id"],
+            **entry.get("kwargs", {}),
+        )}
+        for key, entry in MODEL_REGISTRY.items()
+        if include_slow or entry.get("default", False)
+    }
+    for info in discover_finetuned_models():
+        name = info["name"]
+        active[name] = {
+            "adapter_instance": FineTunedAdapter(name, info["model_dir"]),
+            "params": "fine-tuned",
+            "default": True,
+        }
+    return active
 
 
 def run_scoring(
@@ -347,10 +391,7 @@ def cmd_score(args: argparse.Namespace) -> None:
     if args.models:
         active = {k: v for k, v in active.items() if k in args.models}
 
-    adapters = [
-        entry["adapter"](name, entry["model_id"], **entry.get("kwargs", {}))
-        for name, entry in active.items()
-    ]
+    adapters = [entry["adapter_instance"] for entry in active.values()]
 
     print(f"\nScoring {len(adapters)} model(s) against {args.score_file} …\n")
     results = run_scoring(adapters, args.score_file)
@@ -412,10 +453,7 @@ def cmd_compare(args: argparse.Namespace) -> None:
     emails = _fetch_imap_sample(args.limit, args.days)
     print(f"Fetched {len(emails)} emails. Loading {len(active)} model(s) …\n")
 
-    adapters = [
-        entry["adapter"](name, entry["model_id"], **entry.get("kwargs", {}))
-        for name, entry in active.items()
-    ]
+    adapters = [entry["adapter_instance"] for entry in active.values()]
 
     model_names = [a.name for a in adapters]
     col = 22
diff --git a/tests/test_benchmark_classifier.py b/tests/test_benchmark_classifier.py
index 299e69c..0fb04a1 100644
--- a/tests/test_benchmark_classifier.py
+++ b/tests/test_benchmark_classifier.py
@@ -92,3 +92,74 @@ def test_run_scoring_handles_classify_error(tmp_path):
     results = run_scoring([broken], str(score_file))
 
     assert "broken" in results
+
+
+# ---- Auto-discovery tests ----
+
+def test_discover_finetuned_models_finds_training_info_files(tmp_path):
+    """discover_finetuned_models() must return one entry per training_info.json found."""
+    import json
+    from scripts.benchmark_classifier import discover_finetuned_models
+
+    # Create two fake model directories
+    for name in ("avocet-deberta-small", "avocet-bge-m3"):
+        model_dir = tmp_path / name
+        model_dir.mkdir()
+        info = {
+            "name": name,
+            "base_model_id": "cross-encoder/nli-deberta-v3-small",
+            "timestamp": "2026-03-15T12:00:00Z",
+            "val_macro_f1": 0.72,
+            "val_accuracy": 0.80,
+            "sample_count": 401,
+        }
+        (model_dir / "training_info.json").write_text(json.dumps(info))
+
+    results = discover_finetuned_models(tmp_path)
+    assert len(results) == 2
+    names = {r["name"] for r in results}
+    assert "avocet-deberta-small" in names
+    assert "avocet-bge-m3" in names
+
+
+def test_discover_finetuned_models_returns_empty_when_no_models_dir():
+    """discover_finetuned_models() must return [] silently if models/ doesn't exist."""
+    from pathlib import Path
+    from scripts.benchmark_classifier import discover_finetuned_models
+
+    results = discover_finetuned_models(Path("/nonexistent/path/models"))
+    assert results == []
+
+
+def test_discover_finetuned_models_skips_dirs_without_training_info(tmp_path):
+    """Subdirs without training_info.json are silently skipped."""
+    from scripts.benchmark_classifier import discover_finetuned_models
+
+    # A dir WITHOUT training_info.json
+    (tmp_path / "some-other-dir").mkdir()
+
+    results = discover_finetuned_models(tmp_path)
+    assert results == []
+
+
+def test_active_models_includes_discovered_finetuned(tmp_path):
+    """The active models dict must include FineTunedAdapter entries for discovered models."""
+    import json
+    from unittest.mock import patch
+    from scripts.benchmark_classifier import _active_models
+    from scripts.classifier_adapters import FineTunedAdapter
+
+    model_dir = tmp_path / "avocet-deberta-small"
+    model_dir.mkdir()
+    (model_dir / "training_info.json").write_text(json.dumps({
+        "name": "avocet-deberta-small",
+        "base_model_id": "cross-encoder/nli-deberta-v3-small",
+        "val_macro_f1": 0.72,
+        "sample_count": 401,
+    }))
+
+    with patch("scripts.benchmark_classifier._MODELS_DIR", tmp_path):
+        models = _active_models(include_slow=False)
+
+    assert "avocet-deberta-small" in models
+    assert isinstance(models["avocet-deberta-small"]["adapter_instance"], FineTunedAdapter)