feat(avocet): auto-discover fine-tuned models in benchmark harness
This commit is contained in:
parent
da8478082e
commit
36117b35c4
2 changed files with 119 additions and 10 deletions
|
|
@ -32,10 +32,14 @@ from typing import Any
|
||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
_ROOT = Path(__file__).parent.parent
|
||||||
|
_MODELS_DIR = _ROOT / "models"
|
||||||
|
|
||||||
from scripts.classifier_adapters import (
|
from scripts.classifier_adapters import (
|
||||||
LABELS,
|
LABELS,
|
||||||
LABEL_DESCRIPTIONS,
|
LABEL_DESCRIPTIONS,
|
||||||
ClassifierAdapter,
|
ClassifierAdapter,
|
||||||
|
FineTunedAdapter,
|
||||||
GLiClassAdapter,
|
GLiClassAdapter,
|
||||||
RerankerAdapter,
|
RerankerAdapter,
|
||||||
ZeroShotAdapter,
|
ZeroShotAdapter,
|
||||||
|
|
@ -150,8 +154,48 @@ def load_scoring_jsonl(path: str) -> list[dict[str, str]]:
|
||||||
return rows
|
return rows
|
||||||
|
|
||||||
|
|
||||||
def _active_models(include_slow: bool) -> dict[str, dict[str, Any]]:
|
def discover_finetuned_models(models_dir: Path | None = None) -> list[dict]:
|
||||||
return {k: v for k, v in MODEL_REGISTRY.items() if v["default"] or include_slow}
|
"""Scan models/ for subdirs containing training_info.json.
|
||||||
|
|
||||||
|
Returns a list of training_info dicts, each with an added 'model_dir' key.
|
||||||
|
Returns [] silently if models_dir does not exist.
|
||||||
|
"""
|
||||||
|
if models_dir is None:
|
||||||
|
models_dir = _MODELS_DIR
|
||||||
|
if not models_dir.exists():
|
||||||
|
return []
|
||||||
|
found = []
|
||||||
|
for sub in models_dir.iterdir():
|
||||||
|
if not sub.is_dir():
|
||||||
|
continue
|
||||||
|
info_path = sub / "training_info.json"
|
||||||
|
if not info_path.exists():
|
||||||
|
continue
|
||||||
|
info = json.loads(info_path.read_text(encoding="utf-8"))
|
||||||
|
info["model_dir"] = str(sub)
|
||||||
|
found.append(info)
|
||||||
|
return found
|
||||||
|
|
||||||
|
|
||||||
|
def _active_models(include_slow: bool = False) -> dict[str, dict[str, Any]]:
    """Return the active model registry, merged with any discovered fine-tuned models.

    Registry entries are filtered to defaults unless *include_slow* is set;
    each surviving entry gains an instantiated ``adapter_instance``.
    Fine-tuned models discovered on disk are merged in afterwards, so a
    discovered model shadows a registry entry with the same name.
    """
    active: dict[str, dict[str, Any]] = {}
    for model_key, spec in MODEL_REGISTRY.items():
        if not (include_slow or spec.get("default", False)):
            continue
        adapter = spec["adapter"](
            model_key,
            spec["model_id"],
            **spec.get("kwargs", {}),
        )
        active[model_key] = {**spec, "adapter_instance": adapter}
    # Fine-tuned checkpoints found under models/ always participate by default.
    for info in discover_finetuned_models():
        model_name = info["name"]
        active[model_name] = {
            "adapter_instance": FineTunedAdapter(model_name, info["model_dir"]),
            "params": "fine-tuned",
            "default": True,
        }
    return active
|
||||||
|
|
||||||
|
|
||||||
def run_scoring(
|
def run_scoring(
|
||||||
|
|
@ -347,10 +391,7 @@ def cmd_score(args: argparse.Namespace) -> None:
|
||||||
if args.models:
|
if args.models:
|
||||||
active = {k: v for k, v in active.items() if k in args.models}
|
active = {k: v for k, v in active.items() if k in args.models}
|
||||||
|
|
||||||
adapters = [
|
adapters = [entry["adapter_instance"] for entry in active.values()]
|
||||||
entry["adapter"](name, entry["model_id"], **entry.get("kwargs", {}))
|
|
||||||
for name, entry in active.items()
|
|
||||||
]
|
|
||||||
|
|
||||||
print(f"\nScoring {len(adapters)} model(s) against {args.score_file} …\n")
|
print(f"\nScoring {len(adapters)} model(s) against {args.score_file} …\n")
|
||||||
results = run_scoring(adapters, args.score_file)
|
results = run_scoring(adapters, args.score_file)
|
||||||
|
|
@ -412,10 +453,7 @@ def cmd_compare(args: argparse.Namespace) -> None:
|
||||||
emails = _fetch_imap_sample(args.limit, args.days)
|
emails = _fetch_imap_sample(args.limit, args.days)
|
||||||
print(f"Fetched {len(emails)} emails. Loading {len(active)} model(s) …\n")
|
print(f"Fetched {len(emails)} emails. Loading {len(active)} model(s) …\n")
|
||||||
|
|
||||||
adapters = [
|
adapters = [entry["adapter_instance"] for entry in active.values()]
|
||||||
entry["adapter"](name, entry["model_id"], **entry.get("kwargs", {}))
|
|
||||||
for name, entry in active.items()
|
|
||||||
]
|
|
||||||
model_names = [a.name for a in adapters]
|
model_names = [a.name for a in adapters]
|
||||||
|
|
||||||
col = 22
|
col = 22
|
||||||
|
|
|
||||||
|
|
@ -92,3 +92,74 @@ def test_run_scoring_handles_classify_error(tmp_path):
|
||||||
|
|
||||||
results = run_scoring([broken], str(score_file))
|
results = run_scoring([broken], str(score_file))
|
||||||
assert "broken" in results
|
assert "broken" in results
|
||||||
|
|
||||||
|
|
||||||
|
# ---- Auto-discovery tests ----
|
||||||
|
|
||||||
|
def test_discover_finetuned_models_finds_training_info_files(tmp_path):
    """discover_finetuned_models() must return one entry per training_info.json found."""
    import json
    from scripts.benchmark_classifier import discover_finetuned_models

    # Fabricate one model directory per name, each carrying a training_info.json.
    for model_name in ("avocet-deberta-small", "avocet-bge-m3"):
        sub = tmp_path / model_name
        sub.mkdir()
        payload = json.dumps({
            "name": model_name,
            "base_model_id": "cross-encoder/nli-deberta-v3-small",
            "timestamp": "2026-03-15T12:00:00Z",
            "val_macro_f1": 0.72,
            "val_accuracy": 0.80,
            "sample_count": 401,
        })
        (sub / "training_info.json").write_text(payload)

    results = discover_finetuned_models(tmp_path)
    assert len(results) == 2
    discovered = {entry["name"] for entry in results}
    assert "avocet-deberta-small" in discovered
    assert "avocet-bge-m3" in discovered
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_finetuned_models_returns_empty_when_no_models_dir():
    """discover_finetuned_models() must return [] silently if models/ doesn't exist."""
    from pathlib import Path
    from scripts.benchmark_classifier import discover_finetuned_models

    # A path that cannot exist: discovery must neither raise nor log.
    missing_dir = Path("/nonexistent/path/models")
    assert discover_finetuned_models(missing_dir) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_finetuned_models_skips_dirs_without_training_info(tmp_path):
    """Subdirs without training_info.json are silently skipped."""
    from scripts.benchmark_classifier import discover_finetuned_models

    # A directory with no marker file must not be reported as a model.
    bare_dir = tmp_path / "some-other-dir"
    bare_dir.mkdir()

    assert discover_finetuned_models(tmp_path) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_active_models_includes_discovered_finetuned(tmp_path):
    """The active models dict must include FineTunedAdapter entries for discovered models."""
    import json
    from unittest.mock import patch
    from scripts.benchmark_classifier import _active_models
    from scripts.classifier_adapters import FineTunedAdapter

    checkpoint_dir = tmp_path / "avocet-deberta-small"
    checkpoint_dir.mkdir()
    training_info = {
        "name": "avocet-deberta-small",
        "base_model_id": "cross-encoder/nli-deberta-v3-small",
        "val_macro_f1": 0.72,
        "sample_count": 401,
    }
    (checkpoint_dir / "training_info.json").write_text(json.dumps(training_info))

    # Point discovery at the temp dir so only our fake checkpoint is found.
    with patch("scripts.benchmark_classifier._MODELS_DIR", tmp_path):
        models = _active_models(include_slow=False)

    assert "avocet-deberta-small" in models
    assert isinstance(models["avocet-deberta-small"]["adapter_instance"], FineTunedAdapter)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue