diff --git a/app/api.py b/app/api.py
index 5355628..29f3af2 100644
--- a/app/api.py
+++ b/app/api.py
@@ -287,4 +287,57 @@ def test_account(req: AccountTestRequest):
from fastapi.responses import StreamingResponse
+# ---------------------------------------------------------------------------
+# Benchmark endpoints
+# ---------------------------------------------------------------------------
+
+@app.get("/api/benchmark/results")
+def get_benchmark_results():
+ """Return the most recently saved benchmark results, or an empty envelope."""
+ path = _DATA_DIR / "benchmark_results.json"
+ if not path.exists():
+ return {"models": {}, "sample_count": 0, "timestamp": None}
+ return json.loads(path.read_text())
+
+
+@app.get("/api/benchmark/run")
+def run_benchmark(include_slow: bool = False):
+ """Spawn the benchmark script and stream stdout as SSE progress events."""
+ import subprocess
+
+ python_bin = "/devl/miniconda3/envs/job-seeker-classifiers/bin/python"
+ script = str(_ROOT / "scripts" / "benchmark_classifier.py")
+ cmd = [python_bin, script, "--score", "--save"]
+ if include_slow:
+ cmd.append("--include-slow")
+
+ def generate():
+ try:
+ proc = subprocess.Popen(
+ cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ bufsize=1,
+ cwd=str(_ROOT),
+ )
+ for line in proc.stdout:
+ line = line.rstrip()
+ if line:
+ yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
+ proc.wait()
+ if proc.returncode == 0:
+ yield f"data: {json.dumps({'type': 'complete'})}\n\n"
+ else:
+ yield f"data: {json.dumps({'type': 'error', 'message': f'Process exited with code {proc.returncode}'})}\n\n"
+ except Exception as exc:
+ yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
+
+ return StreamingResponse(
+ generate(),
+ media_type="text/event-stream",
+ headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+ )
+
+
@app.get("/api/fetch/stream")
def fetch_stream(
accounts: str = Query(default=""),
diff --git a/docs/superpowers/plans/2026-03-15-finetune-classifier.md b/docs/superpowers/plans/2026-03-15-finetune-classifier.md
new file mode 100644
index 0000000..8b8b4e7
--- /dev/null
+++ b/docs/superpowers/plans/2026-03-15-finetune-classifier.md
@@ -0,0 +1,1861 @@
+# Fine-tune Email Classifier Implementation Plan
+
+> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Fine-tune `deberta-small` and `bge-m3` on the labeled dataset, surface trained models in the benchmark harness, and expose a UI-triggerable training workflow with SSE streaming logs.
+
+**Architecture:** A new CLI script (`scripts/finetune_classifier.py`) handles data prep, weighted training, and checkpoint saving. A new `FineTunedAdapter` in `classifier_adapters.py` loads saved checkpoints for inference. `benchmark_classifier.py` auto-discovers these adapters at startup via `training_info.json` files. Two GET endpoints in `api.py` expose status and streaming run. `BenchmarkView.vue` adds a badge row and collapsible fine-tune section.
+
+**Tech Stack:** transformers 4.57.3, torch 2.10.0, accelerate 1.12.0, scikit-learn (new), FastAPI SSE, Vue 3 + EventSource
+
+---
+
+## File Structure
+
+| File | Action | Responsibility |
+|------|--------|---------------|
+| `environment.yml` | Modify | Add `scikit-learn` dependency |
+| `scripts/classifier_adapters.py` | Modify | Add `FineTunedAdapter` class |
+| `scripts/benchmark_classifier.py` | Modify | Add `_MODELS_DIR`, `discover_finetuned_models()`, merge into model registry at startup |
+| `scripts/finetune_classifier.py` | Create | Full training pipeline: data prep, class weights, `WeightedTrainer`, CLI |
+| `app/api.py` | Modify | Add `GET /api/finetune/status` and `GET /api/finetune/run` |
+| `web/src/views/BenchmarkView.vue` | Modify | Add trained models badge row + collapsible fine-tune section |
+| `tests/test_classifier_adapters.py` | Modify | Add `FineTunedAdapter` unit tests |
+| `tests/test_benchmark_classifier.py` | Modify | Add auto-discovery unit tests |
+| `tests/test_finetune.py` | Create | Unit tests for data pipeline, `WeightedTrainer`, `compute_metrics_for_trainer` |
+| `tests/test_api.py` | Modify | Add tests for `/api/finetune/status` and `/api/finetune/run` |
+
+---
+
+## Chunk 1: Foundation — FineTunedAdapter + Auto-discovery
+
+### Task 1: Add scikit-learn to environment.yml
+
+**Files:**
+- Modify: `environment.yml`
+
+- [ ] **Step 1: Add scikit-learn**
+
+Edit `environment.yml` — add `scikit-learn>=1.4` in the pip section after `accelerate`:
+
+```yaml
+ - scikit-learn>=1.4
+```
+
+- [ ] **Step 2: Verify environment.yml is valid YAML**
+
+```bash
+python -c "import yaml; yaml.safe_load(open('environment.yml'))" && echo OK
+```
+
+Expected: `OK`
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add environment.yml
+git commit -m "chore(avocet): add scikit-learn to classifier env"
+```
+
+---
+
+### Task 2: FineTunedAdapter — write failing tests
+
+**Files:**
+- Modify: `tests/test_classifier_adapters.py`
+
+- [ ] **Step 1: Write the failing tests**
+
+Append to `tests/test_classifier_adapters.py`:
+
+```python
+
+# ---- FineTunedAdapter tests ----
+
+def test_finetuned_adapter_classify_calls_pipeline_with_sep_format(tmp_path):
+ """classify() must format input as 'subject [SEP] body[:400]' — not the zero-shot format."""
+ from unittest.mock import MagicMock, patch
+ from scripts.classifier_adapters import FineTunedAdapter
+
+ mock_result = [{"label": "digest", "score": 0.95}]
+ mock_pipe_instance = MagicMock(return_value=mock_result)
+ mock_pipe_factory = MagicMock(return_value=mock_pipe_instance)
+
+ adapter = FineTunedAdapter("avocet-deberta-small", str(tmp_path))
+ with patch("scripts.classifier_adapters.pipeline", mock_pipe_factory):
+ result = adapter.classify("Test subject", "Test body")
+
+ assert result == "digest"
+ call_args = mock_pipe_instance.call_args[0][0]
+ assert "[SEP]" in call_args
+ assert "Test subject" in call_args
+ assert "Test body" in call_args
+
+
+def test_finetuned_adapter_truncates_body_to_400():
+ """Body must be truncated to 400 chars in the [SEP] format."""
+ from unittest.mock import MagicMock, patch
+ from scripts.classifier_adapters import FineTunedAdapter, LABELS
+
+ long_body = "x" * 800
+ mock_result = [{"label": "neutral", "score": 0.9}]
+ mock_pipe_instance = MagicMock(return_value=mock_result)
+ mock_pipe_factory = MagicMock(return_value=mock_pipe_instance)
+
+ adapter = FineTunedAdapter("avocet-deberta-small", "/fake/path")
+ with patch("scripts.classifier_adapters.pipeline", mock_pipe_factory):
+ adapter.classify("Subject", long_body)
+
+ call_text = mock_pipe_instance.call_args[0][0]
+ # "Subject [SEP] " prefix + 400 body chars = 414 chars max
+ assert len(call_text) <= 420
+
+
+def test_finetuned_adapter_returns_label_string():
+ """classify() must return a plain string, not a dict."""
+ from unittest.mock import MagicMock, patch
+ from scripts.classifier_adapters import FineTunedAdapter
+
+ mock_result = [{"label": "interview_scheduled", "score": 0.87}]
+ mock_pipe_instance = MagicMock(return_value=mock_result)
+ mock_pipe_factory = MagicMock(return_value=mock_pipe_instance)
+
+ adapter = FineTunedAdapter("avocet-deberta-small", "/fake/path")
+ with patch("scripts.classifier_adapters.pipeline", mock_pipe_factory):
+ result = adapter.classify("S", "B")
+
+ assert isinstance(result, str)
+ assert result == "interview_scheduled"
+
+
+def test_finetuned_adapter_lazy_loads_pipeline():
+ """Pipeline factory must not be called until classify() is first called."""
+ from unittest.mock import MagicMock, patch
+ from scripts.classifier_adapters import FineTunedAdapter
+
+ mock_pipe_factory = MagicMock(return_value=MagicMock(return_value=[{"label": "neutral", "score": 0.9}]))
+
+ with patch("scripts.classifier_adapters.pipeline", mock_pipe_factory):
+ adapter = FineTunedAdapter("avocet-deberta-small", "/fake/path")
+ assert not mock_pipe_factory.called
+ adapter.classify("s", "b")
+ assert mock_pipe_factory.called
+
+
+def test_finetuned_adapter_unload_clears_pipeline():
+ """unload() must set _pipeline to None so memory is released."""
+ from unittest.mock import MagicMock, patch
+ from scripts.classifier_adapters import FineTunedAdapter
+
+ mock_pipe_factory = MagicMock(return_value=MagicMock(return_value=[{"label": "neutral", "score": 0.9}]))
+
+ with patch("scripts.classifier_adapters.pipeline", mock_pipe_factory):
+ adapter = FineTunedAdapter("avocet-deberta-small", "/fake/path")
+ adapter.classify("s", "b")
+ assert adapter._pipeline is not None
+ adapter.unload()
+ assert adapter._pipeline is None
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_classifier_adapters.py -k "finetuned" -v
+```
+
+Expected: `ImportError` or `AttributeError` — `FineTunedAdapter` not yet defined.
+
+---
+
+### Task 3: FineTunedAdapter — implement
+
+**Files:**
+- Modify: `scripts/classifier_adapters.py`
+
+- [ ] **Step 1: Add FineTunedAdapter to `__all__`**
+
+In `scripts/classifier_adapters.py`, add `"FineTunedAdapter"` to `__all__`.
+
+- [ ] **Step 2: Implement FineTunedAdapter**
+
+Append after `RerankerAdapter`:
+
+```python
+class FineTunedAdapter(ClassifierAdapter):
+ """Loads a fine-tuned checkpoint from a local models/ directory.
+
+ Uses pipeline("text-classification") for a single forward pass.
+ Input format: 'subject [SEP] body[:400]' — must match training format exactly.
+ Expected inference speed: ~10–20ms/email vs 111–338ms for zero-shot.
+ """
+
+ def __init__(self, name: str, model_dir: str) -> None:
+ self._name = name
+ self._model_dir = model_dir
+ self._pipeline: Any = None
+
+ @property
+ def name(self) -> str:
+ return self._name
+
+ @property
+ def model_id(self) -> str:
+ return self._model_dir
+
+ def load(self) -> None:
+ import scripts.classifier_adapters as _mod # noqa: PLC0415
+ _pipe_fn = _mod.pipeline
+ if _pipe_fn is None:
+ raise ImportError("transformers not installed")
+ self._pipeline = _pipe_fn("text-classification", model=self._model_dir)
+
+ def unload(self) -> None:
+ self._pipeline = None
+
+ def classify(self, subject: str, body: str) -> str:
+ if self._pipeline is None:
+ self.load()
+ text = f"{subject} [SEP] {body[:400]}"
+ result = self._pipeline(text)
+ return result[0]["label"]
+```
+
+- [ ] **Step 3: Run tests to verify they pass**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_classifier_adapters.py -k "finetuned" -v
+```
+
+Expected: 5 tests PASS.
+
+- [ ] **Step 4: Run full adapter test suite to verify no regressions**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_classifier_adapters.py -v
+```
+
+Expected: All tests PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add scripts/classifier_adapters.py tests/test_classifier_adapters.py
+git commit -m "feat(avocet): add FineTunedAdapter for local checkpoint inference"
+```
+
+---
+
+### Task 4: Auto-discovery in benchmark_classifier.py — write failing tests
+
+**Files:**
+- Modify: `tests/test_benchmark_classifier.py`
+
+- [ ] **Step 1: Write the failing tests**
+
+Append to `tests/test_benchmark_classifier.py`:
+
+```python
+
+# ---- Auto-discovery tests ----
+
+def test_discover_finetuned_models_finds_training_info_files(tmp_path):
+ """discover_finetuned_models() must return one entry per training_info.json found."""
+ import json
+ from scripts.benchmark_classifier import discover_finetuned_models
+
+ # Create two fake model directories
+ for name in ("avocet-deberta-small", "avocet-bge-m3"):
+ model_dir = tmp_path / name
+ model_dir.mkdir()
+ info = {
+ "name": name,
+ "base_model_id": "cross-encoder/nli-deberta-v3-small",
+ "timestamp": "2026-03-15T12:00:00Z",
+ "val_macro_f1": 0.72,
+ "val_accuracy": 0.80,
+ "sample_count": 401,
+ }
+ (model_dir / "training_info.json").write_text(json.dumps(info))
+
+ results = discover_finetuned_models(tmp_path)
+ assert len(results) == 2
+ names = {r["name"] for r in results}
+ assert "avocet-deberta-small" in names
+ assert "avocet-bge-m3" in names
+
+
+def test_discover_finetuned_models_returns_empty_when_no_models_dir():
+ """discover_finetuned_models() must return [] silently if models/ doesn't exist."""
+ from pathlib import Path
+ from scripts.benchmark_classifier import discover_finetuned_models
+
+ results = discover_finetuned_models(Path("/nonexistent/path/models"))
+ assert results == []
+
+
+def test_discover_finetuned_models_skips_dirs_without_training_info(tmp_path):
+ """Subdirs without training_info.json are silently skipped."""
+ from scripts.benchmark_classifier import discover_finetuned_models
+
+ # A dir WITHOUT training_info.json
+ (tmp_path / "some-other-dir").mkdir()
+
+ results = discover_finetuned_models(tmp_path)
+ assert results == []
+
+
+def test_active_models_includes_discovered_finetuned(tmp_path):
+ """The active models dict must include FineTunedAdapter entries for discovered models."""
+ import json
+ from unittest.mock import patch
+ from scripts.benchmark_classifier import _active_models
+ from scripts.classifier_adapters import FineTunedAdapter
+
+ model_dir = tmp_path / "avocet-deberta-small"
+ model_dir.mkdir()
+ (model_dir / "training_info.json").write_text(json.dumps({
+ "name": "avocet-deberta-small",
+ "base_model_id": "cross-encoder/nli-deberta-v3-small",
+ "val_macro_f1": 0.72,
+ "sample_count": 401,
+ }))
+
+ with patch("scripts.benchmark_classifier._MODELS_DIR", tmp_path):
+ models = _active_models(include_slow=False)
+
+ assert "avocet-deberta-small" in models
+ assert isinstance(models["avocet-deberta-small"]["adapter_instance"], FineTunedAdapter)
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_benchmark_classifier.py -k "discover or active_models" -v
+```
+
+Expected: `ImportError` — `discover_finetuned_models` and `_MODELS_DIR` not yet defined.
+
+---
+
+### Task 5: Auto-discovery — implement in benchmark_classifier.py
+
+**Files:**
+- Modify: `scripts/benchmark_classifier.py`
+
+- [ ] **Step 1: Add imports and _MODELS_DIR**
+
+Near the top of `scripts/benchmark_classifier.py`, after the existing imports, add:
+
+```python
+from scripts.classifier_adapters import FineTunedAdapter
+```
+
+And define `_MODELS_DIR` (after `_ROOT` is defined — find where `_ROOT = Path(__file__).parent.parent` is, or add it):
+
+```python
+_ROOT = Path(__file__).parent.parent
+_MODELS_DIR = _ROOT / "models"
+```
+
+(If `_ROOT` already exists in the file, only add `_MODELS_DIR`.)
+
+- [ ] **Step 2: Add discover_finetuned_models()**
+
+Add after the `MODEL_REGISTRY` dict:
+
+```python
+def discover_finetuned_models(models_dir: Path | None = None) -> list[dict]:
+ """Scan models/ for subdirs containing training_info.json.
+
+ Returns a list of training_info dicts, each with an added 'model_dir' key.
+ Returns [] silently if models_dir does not exist.
+ """
+ if models_dir is None:
+ models_dir = _MODELS_DIR
+ if not models_dir.exists():
+ return []
+ found = []
+ for sub in models_dir.iterdir():
+ if not sub.is_dir():
+ continue
+ info_path = sub / "training_info.json"
+ if not info_path.exists():
+ continue
+ info = json.loads(info_path.read_text(encoding="utf-8"))
+ info["model_dir"] = str(sub)
+ found.append(info)
+ return found
+```
+
+- [ ] **Step 3: Add _active_models() function**
+
+Add after `discover_finetuned_models()`:
+
+```python
+def _active_models(include_slow: bool = False) -> dict[str, dict]:
+ """Return the active model registry, merged with any discovered fine-tuned models."""
+ active = {
+ key: {**entry, "adapter_instance": entry["adapter"](
+ key,
+ entry["model_id"],
+ **entry.get("kwargs", {}),
+ )}
+ for key, entry in MODEL_REGISTRY.items()
+ if include_slow or entry.get("default", False)
+ }
+ for info in discover_finetuned_models():
+ name = info["name"]
+ active[name] = {
+ "adapter_instance": FineTunedAdapter(name, info["model_dir"]),
+ "params": "fine-tuned",
+ "default": True,
+ }
+ return active
+```
+
+- [ ] **Step 4: Run tests to verify they pass**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_benchmark_classifier.py -k "discover or active_models" -v
+```
+
+Expected: 4 tests PASS.
+
+- [ ] **Step 5: Run full benchmark test suite**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_benchmark_classifier.py -v
+```
+
+Expected: All tests PASS. (Existing tests that construct adapters directly from `MODEL_REGISTRY` still work because we only added new functions.)
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add scripts/benchmark_classifier.py tests/test_benchmark_classifier.py
+git commit -m "feat(avocet): auto-discover fine-tuned models in benchmark harness"
+```
+
+---
+
+## Chunk 2: Training Script — finetune_classifier.py
+
+### Task 6: Data loading and class weights — write failing tests
+
+**Files:**
+- Create: `tests/test_finetune.py`
+
+- [ ] **Step 1: Create test file with data pipeline tests**
+
+Create `tests/test_finetune.py`:
+
+```python
+"""Tests for finetune_classifier — no model downloads required."""
+from __future__ import annotations
+
+import json
+import pytest
+
+
+# ---- Data loading tests ----
+
+def test_load_and_prepare_data_drops_non_canonical_labels(tmp_path):
+ """Rows with labels not in LABELS must be silently dropped."""
+ from scripts.finetune_classifier import load_and_prepare_data
+ from scripts.classifier_adapters import LABELS
+
+ rows = [
+ {"subject": "s1", "body": "b1", "label": "digest"},
+ {"subject": "s2", "body": "b2", "label": "profile_alert"}, # non-canonical
+ {"subject": "s3", "body": "b3", "label": "neutral"},
+ ]
+ score_file = tmp_path / "email_score.jsonl"
+ score_file.write_text("\n".join(json.dumps(r) for r in rows))
+
+ texts, labels = load_and_prepare_data(score_file)
+ assert len(texts) == 2
+ assert all(l in LABELS for l in labels)
+
+
+def test_load_and_prepare_data_formats_input_as_sep():
+ """Input text must be 'subject [SEP] body[:400]'."""
+ import json
+ from pathlib import Path
+ from scripts.finetune_classifier import load_and_prepare_data
+
+ import tempfile, os
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
+ f.write(json.dumps({"subject": "Hello", "body": "World" * 100, "label": "neutral"}) + "\n")
+ fname = f.name
+
+ try:
+ texts, labels = load_and_prepare_data(Path(fname))
+ finally:
+ os.unlink(fname)
+
+ assert texts[0].startswith("Hello [SEP] ")
+ assert len(texts[0]) <= len("Hello [SEP] ") + 400 + 5 # small buffer for truncation
+
+
+def test_load_and_prepare_data_raises_on_missing_file():
+ """FileNotFoundError must be raised with actionable message."""
+ from pathlib import Path
+ from scripts.finetune_classifier import load_and_prepare_data
+
+ with pytest.raises(FileNotFoundError, match="email_score.jsonl"):
+ load_and_prepare_data(Path("/nonexistent/email_score.jsonl"))
+
+
+def test_load_and_prepare_data_drops_class_with_fewer_than_2_samples(tmp_path, capsys):
+ """Classes with < 2 total samples must be dropped with a warning."""
+ from scripts.finetune_classifier import load_and_prepare_data
+
+ rows = [
+ {"subject": "s1", "body": "b", "label": "digest"},
+ {"subject": "s2", "body": "b", "label": "digest"},
+ {"subject": "s3", "body": "b", "label": "new_lead"}, # only 1 sample — drop
+ ]
+ score_file = tmp_path / "email_score.jsonl"
+ score_file.write_text("\n".join(json.dumps(r) for r in rows))
+
+ texts, labels = load_and_prepare_data(score_file)
+ captured = capsys.readouterr()
+
+ assert "new_lead" not in labels
+ assert "new_lead" in captured.out # warning printed
+
+
+# ---- Class weights tests ----
+
+def test_compute_class_weights_returns_tensor_for_each_class():
+ """compute_class_weights must return a float tensor of length n_classes."""
+ import torch
+ from scripts.finetune_classifier import compute_class_weights
+
+ label_ids = [0, 0, 0, 1, 1, 2] # 3 classes, imbalanced
+ weights = compute_class_weights(label_ids, n_classes=3)
+
+ assert isinstance(weights, torch.Tensor)
+ assert weights.shape == (3,)
+ assert all(w > 0 for w in weights)
+
+
+def test_compute_class_weights_upweights_minority():
+ """Minority classes must receive higher weight than majority classes."""
+ from scripts.finetune_classifier import compute_class_weights
+
+ # Class 0: 10 samples, Class 1: 2 samples
+ label_ids = [0] * 10 + [1] * 2
+ weights = compute_class_weights(label_ids, n_classes=2)
+
+ assert weights[1] > weights[0]
+
+
+# ---- compute_metrics_for_trainer tests ----
+
+def test_compute_metrics_for_trainer_returns_macro_f1_key():
+ """Must return a dict with 'macro_f1' key."""
+ import numpy as np
+ from scripts.finetune_classifier import compute_metrics_for_trainer
+ from transformers import EvalPrediction
+
+ logits = np.array([[2.0, 0.1], [0.1, 2.0], [2.0, 0.1]])
+ labels = np.array([0, 1, 0])
+ pred = EvalPrediction(predictions=logits, label_ids=labels)
+
+ result = compute_metrics_for_trainer(pred)
+ assert "macro_f1" in result
+ assert result["macro_f1"] == pytest.approx(1.0)
+
+
+def test_compute_metrics_for_trainer_returns_accuracy_key():
+ """Must also return 'accuracy' key."""
+ import numpy as np
+ from scripts.finetune_classifier import compute_metrics_for_trainer
+ from transformers import EvalPrediction
+
+ logits = np.array([[2.0, 0.1], [0.1, 2.0]])
+ labels = np.array([0, 1])
+ pred = EvalPrediction(predictions=logits, label_ids=labels)
+
+ result = compute_metrics_for_trainer(pred)
+ assert "accuracy" in result
+ assert result["accuracy"] == pytest.approx(1.0)
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_finetune.py -k "load_and_prepare or class_weights or compute_metrics_for_trainer" -v
+```
+
+Expected: `ModuleNotFoundError` — `scripts.finetune_classifier` not yet created.
+
+---
+
+### Task 7: Implement data loading and class weights in finetune_classifier.py
+
+**Files:**
+- Create: `scripts/finetune_classifier.py`
+
+- [ ] **Step 1: Create finetune_classifier.py with data loading + class weights**
+
+Create `scripts/finetune_classifier.py`:
+
+```python
+"""Fine-tune email classifiers on the labeled dataset.
+
+CLI entry point. All prints use flush=True so stdout is SSE-streamable.
+
+Usage:
+ python scripts/finetune_classifier.py --model deberta-small [--epochs 5]
+
+Supported --model values: deberta-small, bge-m3
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import torch
+import torch.nn.functional as F
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score, accuracy_score
+from transformers import (
+ AutoTokenizer,
+ AutoModelForSequenceClassification,
+ EvalPrediction,
+ Trainer,
+ TrainingArguments,
+ EarlyStoppingCallback,
+)
+
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from scripts.classifier_adapters import LABELS
+
+_ROOT = Path(__file__).parent.parent
+
+# ---------------------------------------------------------------------------
+# Model registry
+# ---------------------------------------------------------------------------
+
+_MODEL_CONFIG: dict[str, dict[str, Any]] = {
+ "deberta-small": {
+ "base_model_id": "cross-encoder/nli-deberta-v3-small",
+ "max_tokens": 512,
+ "fp16": False,
+ "batch_size": 16,
+ "grad_accum": 1,
+ "gradient_checkpointing": False,
+ },
+ "bge-m3": {
+ "base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
+ "max_tokens": 512,
+ "fp16": True,
+ "batch_size": 4,
+ "grad_accum": 4,
+ "gradient_checkpointing": True,
+ },
+}
+
+# ---------------------------------------------------------------------------
+# Data preparation
+# ---------------------------------------------------------------------------
+
+def load_and_prepare_data(score_file: Path) -> tuple[list[str], list[str]]:
+ """Load email_score.jsonl and return (texts, labels) ready for training.
+
+ - Drops rows with non-canonical labels (warns).
+ - Drops classes with < 2 total samples (warns).
+ - Warns (but continues) for classes with < 5 training samples.
+ - Input text format: 'subject [SEP] body[:400]'
+ """
+ if not score_file.exists():
+ raise FileNotFoundError(
+ f"Score file not found: {score_file}\n"
+ "Run the label tool first to create email_score.jsonl"
+ )
+
+ lines = score_file.read_text(encoding="utf-8").splitlines()
+ rows = [json.loads(l) for l in lines if l.strip()]
+
+ # Drop non-canonical labels
+ canonical = set(LABELS)
+ kept = []
+ for r in rows:
+ lbl = r.get("label", "")
+ if lbl not in canonical:
+ print(f"[data] Dropping row with non-canonical label: {lbl!r}", flush=True)
+ continue
+ kept.append(r)
+
+ # Count samples per class
+ from collections import Counter
+ counts = Counter(r["label"] for r in kept)
+
+ # Drop classes with < 2 total samples
+ drop_classes = {lbl for lbl, cnt in counts.items() if cnt < 2}
+ for lbl in sorted(drop_classes):
+ print(
+ f"[data] WARNING: Dropping class {lbl!r} — only {counts[lbl]} total sample(s). "
+ "Need at least 2 for stratified split.",
+ flush=True,
+ )
+ kept = [r for r in kept if r["label"] not in drop_classes]
+
+ # Warn for classes with < 5 samples (after drops)
+ counts = Counter(r["label"] for r in kept)
+ for lbl, cnt in sorted(counts.items()):
+ if cnt < 5:
+ print(
+ f"[data] WARNING: Class {lbl!r} has only {cnt} sample(s). "
+ "Eval F1 for this class will be unreliable.",
+ flush=True,
+ )
+
+ texts = [f"{r['subject']} [SEP] {r['body'][:400]}" for r in kept]
+ labels = [r["label"] for r in kept]
+ return texts, labels
+
+
+# ---------------------------------------------------------------------------
+# Class weights
+# ---------------------------------------------------------------------------
+
+def compute_class_weights(label_ids: list[int], n_classes: int) -> torch.Tensor:
+ """Compute per-class weights: total / (n_classes * class_count).
+
+ Returns a CPU float tensor of shape (n_classes,).
+ """
+ from collections import Counter
+ counts = Counter(label_ids)
+ total = len(label_ids)
+ weights = []
+ for i in range(n_classes):
+ cnt = counts.get(i, 1) # avoid division by zero for unseen classes
+ weights.append(total / (n_classes * cnt))
+ return torch.tensor(weights, dtype=torch.float32)
+
+
+# ---------------------------------------------------------------------------
+# compute_metrics callback for Trainer
+# ---------------------------------------------------------------------------
+
+def compute_metrics_for_trainer(eval_pred: EvalPrediction) -> dict:
+ """Trainer callback: EvalPrediction → {macro_f1, accuracy}.
+
+ Distinct from compute_metrics() in classifier_adapters.py (which operates
+ on string predictions). This one operates on numpy logits + label_ids.
+ """
+ logits, labels = eval_pred
+ preds = logits.argmax(axis=-1)
+ return {
+ "macro_f1": f1_score(labels, preds, average="macro", zero_division=0),
+ "accuracy": accuracy_score(labels, preds),
+ }
+```
+
+- [ ] **Step 2: Run data pipeline tests**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_finetune.py -k "load_and_prepare or class_weights or compute_metrics_for_trainer" -v
+```
+
+Expected: All 8 tests PASS. (Note: `compute_metrics_for_trainer` test requires transformers — run in `job-seeker-classifiers` env if needed.)
+
+```bash
+/devl/miniconda3/envs/job-seeker-classifiers/bin/pytest tests/test_finetune.py -k "load_and_prepare or class_weights or compute_metrics_for_trainer" -v
+```
+
+Expected: All 8 tests PASS.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add scripts/finetune_classifier.py tests/test_finetune.py
+git commit -m "feat(avocet): add finetune data pipeline + class weights + compute_metrics"
+```
+
+---
+
+### Task 8: WeightedTrainer — write failing tests
+
+**Files:**
+- Modify: `tests/test_finetune.py`
+
+- [ ] **Step 1: Append WeightedTrainer tests**
+
+Append to `tests/test_finetune.py`:
+
+```python
+
+# ---- WeightedTrainer tests ----
+
+def test_weighted_trainer_compute_loss_returns_scalar():
+ """compute_loss must return a scalar tensor when return_outputs=False."""
+ import torch
+ from unittest.mock import MagicMock
+ from scripts.finetune_classifier import WeightedTrainer
+
+ # Minimal mock model that returns logits
+ n_classes = 3
+ batch = 4
+ logits = torch.randn(batch, n_classes)
+
+ mock_outputs = MagicMock()
+ mock_outputs.logits = logits
+
+ mock_model = MagicMock(return_value=mock_outputs)
+
+ # Build a trainer with class weights
+ weights = torch.ones(n_classes)
+ trainer = WeightedTrainer.__new__(WeightedTrainer)
+ trainer.class_weights = weights
+
+ inputs = {
+ "input_ids": torch.zeros(batch, 10, dtype=torch.long),
+ "labels": torch.randint(0, n_classes, (batch,)),
+ }
+
+ loss = trainer.compute_loss(mock_model, inputs, return_outputs=False)
+ assert isinstance(loss, torch.Tensor)
+ assert loss.ndim == 0 # scalar
+
+
+def test_weighted_trainer_compute_loss_accepts_kwargs():
+ """compute_loss must not raise TypeError when called with num_items_in_batch kwarg.
+
+ Transformers 4.46+ passes this extra kwarg — **kwargs absorbs it.
+ """
+ import torch
+ from unittest.mock import MagicMock
+ from scripts.finetune_classifier import WeightedTrainer
+
+ n_classes = 3
+ batch = 2
+ logits = torch.randn(batch, n_classes)
+
+ mock_outputs = MagicMock()
+ mock_outputs.logits = logits
+ mock_model = MagicMock(return_value=mock_outputs)
+
+ trainer = WeightedTrainer.__new__(WeightedTrainer)
+ trainer.class_weights = torch.ones(n_classes)
+
+ inputs = {
+ "input_ids": torch.zeros(batch, 5, dtype=torch.long),
+ "labels": torch.randint(0, n_classes, (batch,)),
+ }
+
+ # Must not raise TypeError
+ loss = trainer.compute_loss(mock_model, inputs, return_outputs=False,
+ num_items_in_batch=batch)
+ assert isinstance(loss, torch.Tensor)
+
+
+def test_weighted_trainer_weighted_loss_differs_from_unweighted():
+ """Weighted loss must differ from uniform-weight loss for imbalanced inputs."""
+ import torch
+ from unittest.mock import MagicMock
+ from scripts.finetune_classifier import WeightedTrainer
+
+ n_classes = 2
+ batch = 4
+ # All labels are class 0 (majority class scenario)
+ labels = torch.zeros(batch, dtype=torch.long)
+ logits = torch.zeros(batch, n_classes) # neutral logits
+
+ mock_outputs = MagicMock()
+ mock_outputs.logits = logits
+
+ # Uniform weights
+ trainer_uniform = WeightedTrainer.__new__(WeightedTrainer)
+ trainer_uniform.class_weights = torch.ones(n_classes)
+ inputs_uniform = {"input_ids": torch.zeros(batch, 5, dtype=torch.long), "labels": labels.clone()}
+ loss_uniform = trainer_uniform.compute_loss(MagicMock(return_value=mock_outputs),
+ inputs_uniform)
+
+ # Heavily imbalanced weights: class 1 much more important
+ trainer_weighted = WeightedTrainer.__new__(WeightedTrainer)
+ trainer_weighted.class_weights = torch.tensor([0.1, 10.0])
+ inputs_weighted = {"input_ids": torch.zeros(batch, 5, dtype=torch.long), "labels": labels.clone()}
+
+ mock_outputs2 = MagicMock()
+ mock_outputs2.logits = logits.clone()
+ loss_weighted = trainer_weighted.compute_loss(MagicMock(return_value=mock_outputs2),
+ inputs_weighted)
+
+ assert not torch.isclose(loss_uniform, loss_weighted)
+
+
+def test_weighted_trainer_compute_loss_returns_outputs_when_requested():
+ """compute_loss with return_outputs=True must return (loss, outputs) tuple."""
+ import torch
+ from unittest.mock import MagicMock
+ from scripts.finetune_classifier import WeightedTrainer
+
+ n_classes = 3
+ batch = 2
+ logits = torch.randn(batch, n_classes)
+
+ mock_outputs = MagicMock()
+ mock_outputs.logits = logits
+ mock_model = MagicMock(return_value=mock_outputs)
+
+ trainer = WeightedTrainer.__new__(WeightedTrainer)
+ trainer.class_weights = torch.ones(n_classes)
+
+ inputs = {
+ "input_ids": torch.zeros(batch, 5, dtype=torch.long),
+ "labels": torch.randint(0, n_classes, (batch,)),
+ }
+
+ result = trainer.compute_loss(mock_model, inputs, return_outputs=True)
+ assert isinstance(result, tuple)
+ loss, outputs = result
+ assert isinstance(loss, torch.Tensor)
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+/devl/miniconda3/envs/job-seeker-classifiers/bin/pytest tests/test_finetune.py -k "weighted_trainer" -v
+```
+
+Expected: `ImportError` — `WeightedTrainer` not yet defined.
+
+---
+
+### Task 9: Implement WeightedTrainer
+
+**Files:**
+- Modify: `scripts/finetune_classifier.py`
+
+- [ ] **Step 1: Add WeightedTrainer class**
+
+Append to `scripts/finetune_classifier.py` after `compute_metrics_for_trainer`:
+
+```python
+# ---------------------------------------------------------------------------
+# Weighted Trainer
+# ---------------------------------------------------------------------------
+
+class WeightedTrainer(Trainer):
+ """Trainer subclass that applies per-class weights to cross-entropy loss.
+
+ Handles class imbalance by down-weighting majority classes and up-weighting
+ minority classes. Attach class_weights (CPU float tensor) before training.
+ """
+
+ def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
+ # **kwargs is required — absorbs num_items_in_batch added in Transformers 4.38.
+ # Do not remove it; removing it causes TypeError on the first training step.
+ labels = inputs.pop("labels")
+ outputs = model(**inputs)
+ # Move class_weights to the same device as logits — required for GPU training.
+ # class_weights is created on CPU; logits are on cuda:0 during training.
+ weight = self.class_weights.to(outputs.logits.device)
+ loss = F.cross_entropy(outputs.logits, labels, weight=weight)
+ return (loss, outputs) if return_outputs else loss
+```
+
+- [ ] **Step 2: Run WeightedTrainer tests**
+
+```bash
+/devl/miniconda3/envs/job-seeker-classifiers/bin/pytest tests/test_finetune.py -k "weighted_trainer" -v
+```
+
+Expected: 4 tests PASS.
+
+- [ ] **Step 3: Run full test_finetune.py**
+
+```bash
+/devl/miniconda3/envs/job-seeker-classifiers/bin/pytest tests/test_finetune.py -v
+```
+
+Expected: All tests PASS.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add scripts/finetune_classifier.py tests/test_finetune.py
+git commit -m "feat(avocet): add WeightedTrainer with device-aware class weights"
+```
+
+---
+
+### Task 10: Implement run_finetune() and CLI
+
+**Files:**
+- Modify: `scripts/finetune_classifier.py`
+
+- [ ] **Step 1: Add run_finetune() and CLI to finetune_classifier.py**
+
+Append to `scripts/finetune_classifier.py`:
+
+```python
+# ---------------------------------------------------------------------------
+# Training dataset wrapper
+# ---------------------------------------------------------------------------
+
+from torch.utils.data import Dataset as TorchDataset
+
+
+class _EmailDataset(TorchDataset):
+ def __init__(self, encodings: dict, label_ids: list[int]) -> None:
+ self.encodings = encodings
+ self.label_ids = label_ids
+
+ def __len__(self) -> int:
+ return len(self.label_ids)
+
+ def __getitem__(self, idx: int) -> dict:
+ item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
+ item["labels"] = torch.tensor(self.label_ids[idx], dtype=torch.long)
+ return item
+
+
+# ---------------------------------------------------------------------------
+# Main training function
+# ---------------------------------------------------------------------------
+
+def run_finetune(model_key: str, epochs: int = 5) -> None:
+ """Fine-tune the specified model on data/email_score.jsonl.
+
+ Saves model + tokenizer + training_info.json to models/avocet-{model_key}/.
+ All prints use flush=True for SSE streaming.
+ """
+ if model_key not in _MODEL_CONFIG:
+ raise ValueError(f"Unknown model key: {model_key!r}. Choose from: {list(_MODEL_CONFIG)}")
+
+ config = _MODEL_CONFIG[model_key]
+ base_model_id = config["base_model_id"]
+ output_dir = _ROOT / "models" / f"avocet-{model_key}"
+
+ print(f"[finetune] Model: {model_key} ({base_model_id})", flush=True)
+ print(f"[finetune] Output: {output_dir}", flush=True)
+ if output_dir.exists():
+ print(f"[finetune] WARNING: {output_dir} already exists — will overwrite.", flush=True)
+
+ # --- Data ---
+ score_file = _ROOT / "data" / "email_score.jsonl"
+ print(f"[finetune] Loading data from {score_file} ...", flush=True)
+ texts, str_labels = load_and_prepare_data(score_file)
+
+ present_labels = sorted(set(str_labels))
+ label2id = {l: i for i, l in enumerate(present_labels)}
+ id2label = {i: l for l, i in label2id.items()}
+ n_classes = len(present_labels)
+ label_ids = [label2id[l] for l in str_labels]
+
+ print(f"[finetune] {len(texts)} samples, {n_classes} classes", flush=True)
+
+ # Stratified 80/20 split
+ (train_texts, val_texts,
+ train_label_ids, val_label_ids) = train_test_split(
+ texts, label_ids,
+ test_size=0.2,
+ stratify=label_ids,
+ random_state=42,
+ )
+ print(f"[finetune] Train: {len(train_texts)}, Val: {len(val_texts)}", flush=True)
+
+ # Warn for classes with < 5 training samples
+ from collections import Counter
+ train_counts = Counter(train_label_ids)
+ for cls_id, cnt in train_counts.items():
+ if cnt < 5:
+ print(
+ f"[finetune] WARNING: Class {id2label[cls_id]!r} has {cnt} training sample(s). "
+ "Eval F1 for this class will be unreliable.",
+ flush=True,
+ )
+
+ # --- Tokenize ---
+ print(f"[finetune] Loading tokenizer ...", flush=True)
+ tokenizer = AutoTokenizer.from_pretrained(base_model_id)
+
+ train_enc = tokenizer(train_texts, truncation=True,
+ max_length=config["max_tokens"], padding=True)
+ val_enc = tokenizer(val_texts, truncation=True,
+ max_length=config["max_tokens"], padding=True)
+
+ train_dataset = _EmailDataset(train_enc, train_label_ids)
+ val_dataset = _EmailDataset(val_enc, val_label_ids)
+
+ # --- Class weights ---
+ class_weights = compute_class_weights(train_label_ids, n_classes)
+ print(f"[finetune] Class weights: {dict(zip(present_labels, class_weights.tolist()))}", flush=True)
+
+ # --- Model ---
+ print(f"[finetune] Loading model ...", flush=True)
+ model = AutoModelForSequenceClassification.from_pretrained(
+ base_model_id,
+ num_labels=n_classes,
+ ignore_mismatched_sizes=True, # NLI head (3-class) → new head (n_classes)
+ id2label=id2label,
+ label2id=label2id,
+ )
+ if config["gradient_checkpointing"]:
+ model.gradient_checkpointing_enable()
+
+ # --- TrainingArguments ---
+ training_args = TrainingArguments(
+ output_dir=str(output_dir),
+ num_train_epochs=epochs,
+ per_device_train_batch_size=config["batch_size"],
+ per_device_eval_batch_size=config["batch_size"],
+ gradient_accumulation_steps=config["grad_accum"],
+ learning_rate=2e-5,
+ lr_scheduler_type="linear",
+ warmup_ratio=0.1,
+ fp16=config["fp16"],
+ eval_strategy="epoch",
+ save_strategy="epoch",
+ load_best_model_at_end=True,
+ metric_for_best_model="macro_f1",
+ greater_is_better=True,
+ logging_steps=10,
+ report_to="none",
+ save_total_limit=2,
+ )
+
+ trainer = WeightedTrainer(
+ model=model,
+ args=training_args,
+ train_dataset=train_dataset,
+ eval_dataset=val_dataset,
+ compute_metrics=compute_metrics_for_trainer,
+ callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
+ )
+ trainer.class_weights = class_weights
+
+ # --- Train ---
+ print(f"[finetune] Starting training ({epochs} epochs) ...", flush=True)
+ train_result = trainer.train()
+ print(f"[finetune] Training complete. Steps: {train_result.global_step}", flush=True)
+
+ # --- Evaluate ---
+ print(f"[finetune] Evaluating best checkpoint ...", flush=True)
+ metrics = trainer.evaluate()
+ val_macro_f1 = metrics.get("eval_macro_f1", 0.0)
+ val_accuracy = metrics.get("eval_accuracy", 0.0)
+ print(f"[finetune] Val macro-F1: {val_macro_f1:.4f}, Accuracy: {val_accuracy:.4f}", flush=True)
+
+ # --- Save model + tokenizer ---
+ print(f"[finetune] Saving model to {output_dir} ...", flush=True)
+ trainer.save_model(str(output_dir))
+ tokenizer.save_pretrained(str(output_dir))
+
+    # --- Write training_info.json ---
+    # Counter is already imported above in this function
+    label_counts = dict(Counter(str_labels))
+ info = {
+ "name": f"avocet-{model_key}",
+ "base_model_id": base_model_id,
+ "timestamp": datetime.now(timezone.utc).isoformat(),
+ "epochs_run": epochs,
+ "val_macro_f1": round(val_macro_f1, 4),
+ "val_accuracy": round(val_accuracy, 4),
+ "sample_count": len(train_texts),
+ "label_counts": label_counts,
+ }
+ info_path = output_dir / "training_info.json"
+ info_path.write_text(json.dumps(info, indent=2), encoding="utf-8")
+ print(f"[finetune] Saved training_info.json: val_macro_f1={val_macro_f1:.4f}", flush=True)
+ print(f"[finetune] Done.", flush=True)
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Fine-tune an email classifier")
+ parser.add_argument(
+ "--model",
+ choices=list(_MODEL_CONFIG),
+ required=True,
+ help="Model key to fine-tune",
+ )
+ parser.add_argument(
+ "--epochs",
+ type=int,
+ default=5,
+ help="Number of training epochs (default: 5)",
+ )
+ args = parser.parse_args()
+ run_finetune(args.model, args.epochs)
+```
+
+- [ ] **Step 2: Run all finetune tests**
+
+```bash
+/devl/miniconda3/envs/job-seeker-classifiers/bin/pytest tests/test_finetune.py -v
+```
+
+Expected: All tests PASS (run_finetune itself is tested in the integration test — Task 11).
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add scripts/finetune_classifier.py
+git commit -m "feat(avocet): add run_finetune() training loop and CLI"
+```
+
+---
+
+### Task 11: Integration test — finetune on example data
+
+**Files:**
+- Modify: `tests/test_finetune.py`
+
+The example file `data/email_score.jsonl.example` has 8 samples with 5 of 10 labels represented. The 5 missing labels trigger the `< 2 total samples` drop path.
+
+- [ ] **Step 1: Append integration test**
+
+Append to `tests/test_finetune.py`:
+
+```python
+
+# ---- Integration test ----
+
+def test_integration_finetune_on_example_data(tmp_path):
+ """Fine-tune deberta-small on example data for 1 epoch.
+
+ Uses data/email_score.jsonl.example (8 samples, 5 labels represented).
+ The 5 missing labels must trigger the < 2 samples drop warning.
+ Verifies training_info.json is written with correct keys.
+
+ NOTE: This test requires the job-seeker-classifiers conda env and downloads
+ the deberta-small model on first run (~100MB). Skip in CI if model not cached.
+ Mark with @pytest.mark.slow to exclude from default runs.
+ """
+ import shutil
+ from scripts.finetune_classifier import run_finetune, _ROOT
+ from scripts import finetune_classifier as ft_mod
+
+ example_file = _ROOT / "data" / "email_score.jsonl.example"
+ if not example_file.exists():
+ pytest.skip("email_score.jsonl.example not found")
+
+ # Patch _ROOT to use tmp_path so model saves there, not production models/
+ orig_root = ft_mod._ROOT
+ ft_mod._ROOT = tmp_path
+
+ # Also copy the example file to tmp_path/data/
+ (tmp_path / "data").mkdir()
+ shutil.copy(example_file, tmp_path / "data" / "email_score.jsonl")
+
+ try:
+ import io
+ from contextlib import redirect_stdout
+ captured = io.StringIO()
+ with redirect_stdout(captured):
+ run_finetune("deberta-small", epochs=1)
+ output = captured.getvalue()
+ finally:
+ ft_mod._ROOT = orig_root
+
+    # 5 missing labels should each trigger a drop warning
+    assert "< 2 total samples" in output or "WARNING: Dropping class" in output
+
+ # training_info.json must exist with correct keys
+ info_path = tmp_path / "models" / "avocet-deberta-small" / "training_info.json"
+ assert info_path.exists(), "training_info.json not written"
+
+ import json
+ info = json.loads(info_path.read_text())
+ for key in ("name", "base_model_id", "timestamp", "epochs_run",
+ "val_macro_f1", "val_accuracy", "sample_count", "label_counts"):
+ assert key in info, f"Missing key: {key}"
+
+ assert info["name"] == "avocet-deberta-small"
+ assert info["epochs_run"] == 1
+```
+
+- [ ] **Step 2: Run unit tests only (fast path, no model download)**
+
+```bash
+/devl/miniconda3/envs/job-seeker-classifiers/bin/pytest tests/test_finetune.py -v -k "not integration"
+```
+
+Expected: All non-integration tests PASS.
+
+- [ ] **Step 3: Run integration test (requires model download ~100MB)**
+
+```bash
+/devl/miniconda3/envs/job-seeker-classifiers/bin/pytest tests/test_finetune.py::test_integration_finetune_on_example_data -v -s
+```
+
+Expected: PASS. Check output for drop warnings for missing labels.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add tests/test_finetune.py
+git commit -m "test(avocet): add integration test for finetune_classifier on example data"
+```
+
+---
+
+## Chunk 3: API Endpoints + BenchmarkView UI
+
+### Task 12: API endpoints — write failing tests
+
+**Files:**
+- Modify: `tests/test_api.py`
+
+- [ ] **Step 1: Append finetune endpoint tests**
+
+Append to `tests/test_api.py`:
+
+```python
+
+# ---- /api/finetune/status tests ----
+
+def test_finetune_status_returns_empty_when_no_models_dir(client):
+ """GET /api/finetune/status must return [] if models/ does not exist."""
+ r = client.get("/api/finetune/status")
+ assert r.status_code == 200
+ assert r.json() == []
+
+
+def test_finetune_status_returns_training_info(client, tmp_path):
+ """GET /api/finetune/status must return one entry per training_info.json found."""
+ import json
+ from app import api as api_module
+
+ # Create a fake models dir under tmp_path (data dir)
+ models_dir = api_module._DATA_DIR.parent / "models"
+ model_dir = models_dir / "avocet-deberta-small"
+ model_dir.mkdir(parents=True)
+ info = {
+ "name": "avocet-deberta-small",
+ "base_model_id": "cross-encoder/nli-deberta-v3-small",
+ "val_macro_f1": 0.712,
+ "timestamp": "2026-03-15T12:00:00Z",
+ "sample_count": 401,
+ }
+ (model_dir / "training_info.json").write_text(json.dumps(info))
+
+ r = client.get("/api/finetune/status")
+ assert r.status_code == 200
+ data = r.json()
+ assert len(data) == 1
+ assert data[0]["name"] == "avocet-deberta-small"
+ assert data[0]["val_macro_f1"] == pytest.approx(0.712)
+
+
+def test_finetune_run_streams_sse_events(client):
+ """GET /api/finetune/run must return text/event-stream content type."""
+ import subprocess
+ from unittest.mock import patch, MagicMock
+
+ mock_proc = MagicMock()
+ mock_proc.stdout = iter(["Training epoch 1\n", "Done\n"])
+ mock_proc.returncode = 0
+ mock_proc.wait = MagicMock()
+
+ with patch("subprocess.Popen", return_value=mock_proc):
+ r = client.get("/api/finetune/run?model=deberta-small&epochs=1")
+
+ assert r.status_code == 200
+ assert "text/event-stream" in r.headers.get("content-type", "")
+
+
+def test_finetune_run_emits_complete_on_success(client):
+ """GET /api/finetune/run must emit a complete event on clean exit."""
+ import subprocess
+ from unittest.mock import patch, MagicMock
+
+ mock_proc = MagicMock()
+ mock_proc.stdout = iter(["progress line\n"])
+ mock_proc.returncode = 0
+ mock_proc.wait = MagicMock()
+
+ with patch("subprocess.Popen", return_value=mock_proc):
+ r = client.get("/api/finetune/run?model=deberta-small&epochs=1")
+
+ assert '{"type": "complete"}' in r.text
+
+
+def test_finetune_run_emits_error_on_nonzero_exit(client):
+ """GET /api/finetune/run must emit an error event on non-zero exit."""
+ import subprocess
+ from unittest.mock import patch, MagicMock
+
+ mock_proc = MagicMock()
+ mock_proc.stdout = iter([])
+ mock_proc.returncode = 1
+ mock_proc.wait = MagicMock()
+
+ with patch("subprocess.Popen", return_value=mock_proc):
+ r = client.get("/api/finetune/run?model=deberta-small&epochs=1")
+
+ assert '"type": "error"' in r.text
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_api.py -k "finetune" -v
+```
+
+Expected: 404 or connection errors — endpoints not yet defined.
+
+---
+
+### Task 13: Implement finetune API endpoints
+
+**Files:**
+- Modify: `app/api.py`
+
+- [ ] **Step 1: Add finetune endpoints to api.py**
+
+In `app/api.py`, add after the benchmark endpoints section (after the `run_benchmark` function, before the `fetch_stream` function):
+
+```python
+# ---------------------------------------------------------------------------
+# Fine-tune endpoints
+# ---------------------------------------------------------------------------
+
+@app.get("/api/finetune/status")
+def get_finetune_status():
+ """Scan models/ for training_info.json files. Returns [] if none exist."""
+ models_dir = _ROOT / "models"
+ if not models_dir.exists():
+ return []
+ results = []
+ for sub in models_dir.iterdir():
+ if not sub.is_dir():
+ continue
+ info_path = sub / "training_info.json"
+ if not info_path.exists():
+ continue
+ try:
+ info = json.loads(info_path.read_text(encoding="utf-8"))
+ results.append(info)
+ except Exception:
+ pass
+ return results
+
+
+@app.get("/api/finetune/run")
+def run_finetune(model: str = "deberta-small", epochs: int = 5):
+ """Spawn finetune_classifier.py and stream stdout as SSE progress events."""
+ import subprocess
+
+ python_bin = "/devl/miniconda3/envs/job-seeker-classifiers/bin/python"
+ script = str(_ROOT / "scripts" / "finetune_classifier.py")
+ cmd = [python_bin, script, "--model", model, "--epochs", str(epochs)]
+
+ def generate():
+ try:
+ proc = subprocess.Popen(
+ cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ text=True,
+ bufsize=1,
+ cwd=str(_ROOT),
+ )
+ for line in proc.stdout:
+ line = line.rstrip()
+ if line:
+ yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
+ proc.wait()
+ if proc.returncode == 0:
+ yield f"data: {json.dumps({'type': 'complete'})}\n\n"
+ else:
+ yield f"data: {json.dumps({'type': 'error', 'message': f'Process exited with code {proc.returncode}'})}\n\n"
+ except Exception as exc:
+ yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
+
+ return StreamingResponse(
+ generate(),
+ media_type="text/event-stream",
+ headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
+ )
+```
+
+- [ ] **Step 2: Run finetune API tests**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_api.py -k "finetune" -v
+```
+
+Expected: All 5 finetune tests PASS.
+
+- [ ] **Step 3: Run full API test suite**
+
+```bash
+/devl/miniconda3/envs/job-seeker/bin/pytest tests/test_api.py -v
+```
+
+Expected: All tests PASS.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add app/api.py tests/test_api.py
+git commit -m "feat(avocet): add /api/finetune/status and /api/finetune/run endpoints"
+```
+
+---
+
+### Task 14: BenchmarkView.vue — trained models badge row + fine-tune section
+
+**Files:**
+- Modify: `web/src/views/BenchmarkView.vue`
+
+The BenchmarkView already has:
+- Macro-F1 bar chart
+- Latency bar chart
+- Per-label F1 heatmap
+- Benchmark run button with SSE log
+
+Add:
+1. **Trained models badge row** at the top (conditional on `fineTunedModels.length > 0`)
+2. **Fine-tune section** (collapsible, at the bottom): model dropdown, epoch input, run button → SSE log, on `complete` auto-trigger benchmark run
+
+- [ ] **Step 1: Read current BenchmarkView.vue**
+
+```bash
+cat web/src/views/BenchmarkView.vue
+```
+
+(Use this to understand the existing structure before editing — identify where to insert each new section.)
+
+- [ ] **Step 2: Add fineTunedModels state and fetch logic**
+
+In the `
diff --git a/web/src/views/LabelView.vue b/web/src/views/LabelView.vue
index 7eabbcf..98dd2fb 100644
--- a/web/src/views/LabelView.vue
+++ b/web/src/views/LabelView.vue
@@ -103,7 +103,7 @@