feat(avocet): add finetune data pipeline, class weights, WeightedTrainer

Implements load_and_prepare_data (JSONL ingestion with class filtering), compute_class_weights (inverse-frequency, div-by-zero safe), compute_metrics_for_trainer (macro F1 + accuracy), and WeightedTrainer.compute_loss (**kwargs-safe for Transformers 4.38+ num_items_in_batch). All 12 tests pass.
2026-03-15 15:38:45 -07:00 · 2026-03-15 15:38:45 -07:00 · 5eb593569d
commit 5eb593569d
parent 2d795b9573
2 changed files with 416 additions and 0 deletions
--- a/scripts/finetune_classifier.py
+++ b/scripts/finetune_classifier.py
@ -0,0 +1,166 @@
 """Fine-tune email classifiers on the labeled dataset.
 CLI entry point. All prints use flush=True so stdout is SSE-streamable.
 Usage:
    python scripts/finetune_classifier.py --model deberta-small [--epochs 5]
 Supported --model values: deberta-small, bge-m3
 """
 from __future__ import annotations
 import argparse
 import json
 import sys
 from collections import Counter
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 import torch
 import torch.nn.functional as F
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import f1_score, accuracy_score
 from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
 )
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from scripts.classifier_adapters import LABELS
 _ROOT = Path(__file__).parent.parent
 _MODEL_CONFIG: dict[str, dict[str, Any]] = {
    "deberta-small": {
        "base_model_id": "cross-encoder/nli-deberta-v3-small",
        "max_tokens": 512,
        "fp16": False,
        "batch_size": 16,
        "grad_accum": 1,
        "gradient_checkpointing": False,
    },
    "bge-m3": {
        "base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
        "max_tokens": 512,
        "fp16": True,
        "batch_size": 4,
        "grad_accum": 4,
        "gradient_checkpointing": True,
    },
 }
 def load_and_prepare_data(score_file: Path) -> tuple[list[str], list[str]]:
    """Load labeled JSONL and return (texts, labels) filtered to canonical LABELS.
    Drops rows with non-canonical labels (with warning), and drops entire classes
    that have fewer than 2 total samples (required for stratified split).
    Warns (but continues) for classes with fewer than 5 samples.
    """
    if not score_file.exists():
        raise FileNotFoundError(
            f"Labeled data not found: {score_file}\n"
            "Run the label tool first to generate email_score.jsonl."
        )
    label_set = set(LABELS)
    rows: list[dict] = []
    with score_file.open() as fh:
        for line in fh:
            line = line.strip()
            if not line:
                continue
            try:
                r = json.loads(line)
            except json.JSONDecodeError:
                continue
            lbl = r.get("label", "")
            if lbl not in label_set:
                print(
                    f"[data] WARNING: Dropping row with non-canonical label {lbl!r}",
                    flush=True,
                )
                continue
            rows.append(r)
    # Count samples per class
    counts: Counter = Counter(r["label"] for r in rows)
    # Drop classes with < 2 total samples (cannot stratify-split)
    drop_classes: set[str] = set()
    for lbl, cnt in counts.items():
        if cnt < 2:
            print(
                f"[data] WARNING: Dropping class {lbl!r} — only {counts[lbl]} total "
                f"sample(s). Need at least 2 for stratified split.",
                flush=True,
            )
            drop_classes.add(lbl)
    # Warn for classes with < 5 samples (unreliable eval F1)
    for lbl, cnt in counts.items():
        if lbl not in drop_classes and cnt < 5:
            print(
                f"[data] WARNING: Class {lbl!r} has only {cnt} sample(s). "
                f"Eval F1 for this class will be unreliable.",
                flush=True,
            )
    # Filter rows
    rows = [r for r in rows if r["label"] not in drop_classes]
    texts = [f"{r['subject']} [SEP] {r['body'][:400]}" for r in rows]
    labels = [r["label"] for r in rows]
    return texts, labels
 def compute_class_weights(label_ids: list[int], n_classes: int) -> torch.Tensor:
    """Compute inverse-frequency class weights.
    Formula: total / (n_classes * class_count) per class.
    Unseen classes (count=0) use count=1 to avoid division by zero.
    Returns a CPU float32 tensor of shape (n_classes,).
    """
    counts = Counter(label_ids)
    total = len(label_ids)
    weights = []
    for cls in range(n_classes):
        cnt = counts.get(cls, 1)  # use 1 for unseen to avoid div-by-zero
        weights.append(total / (n_classes * cnt))
    return torch.tensor(weights, dtype=torch.float32)
 def compute_metrics_for_trainer(eval_pred: EvalPrediction) -> dict:
    """Compute macro F1 and accuracy from EvalPrediction.
    Called by Hugging Face Trainer at each evaluation step.
    """
    logits, label_ids = eval_pred.predictions, eval_pred.label_ids
    preds = logits.argmax(axis=-1)
    macro_f1 = f1_score(label_ids, preds, average="macro", zero_division=0)
    acc = accuracy_score(label_ids, preds)
    return {"macro_f1": float(macro_f1), "accuracy": float(acc)}
 class WeightedTrainer(Trainer):
    """Trainer subclass that applies per-class weights to the cross-entropy loss."""
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs is required — absorbs num_items_in_batch added in Transformers 4.38.
        # Do not remove it; removing it causes TypeError on the first training step.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # Move class_weights to the same device as logits — required for GPU training.
        # class_weights is created on CPU; logits are on cuda:0 during training.
        weight = self.class_weights.to(outputs.logits.device)
        loss = F.cross_entropy(outputs.logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss
--- a/tests/test_finetune.py
+++ b/tests/test_finetune.py
@ -0,0 +1,250 @@
 """Tests for finetune_classifier — no model downloads required."""
 from __future__ import annotations
 import json
 import pytest
 # ---- Data loading tests ----
 def test_load_and_prepare_data_drops_non_canonical_labels(tmp_path):
    """Rows with labels not in LABELS must be silently dropped."""
    from scripts.finetune_classifier import load_and_prepare_data
    from scripts.classifier_adapters import LABELS
    # Two samples per canonical label so they survive the < 2 class-drop rule.
    rows = [
        {"subject": "s1", "body": "b1", "label": "digest"},
        {"subject": "s2", "body": "b2", "label": "digest"},
        {"subject": "s3", "body": "b3", "label": "profile_alert"},  # non-canonical
        {"subject": "s4", "body": "b4", "label": "neutral"},
        {"subject": "s5", "body": "b5", "label": "neutral"},
    ]
    score_file = tmp_path / "email_score.jsonl"
    score_file.write_text("\n".join(json.dumps(r) for r in rows))
    texts, labels = load_and_prepare_data(score_file)
    assert len(texts) == 4
    assert all(l in LABELS for l in labels)
 def test_load_and_prepare_data_formats_input_as_sep(tmp_path):
    """Input text must be 'subject [SEP] body[:400]'."""
    # Two samples with the same label so the class survives the < 2 drop rule.
    rows = [
        {"subject": "Hello", "body": "World" * 100, "label": "neutral"},
        {"subject": "Hello2", "body": "World" * 100, "label": "neutral"},
    ]
    score_file = tmp_path / "email_score.jsonl"
    score_file.write_text("\n".join(json.dumps(r) for r in rows))
    from scripts.finetune_classifier import load_and_prepare_data
    texts, labels = load_and_prepare_data(score_file)
    assert texts[0].startswith("Hello [SEP] ")
    assert len(texts[0]) <= len("Hello [SEP] ") + 400 + 5
 def test_load_and_prepare_data_raises_on_missing_file():
    """FileNotFoundError must be raised with actionable message."""
    from pathlib import Path
    from scripts.finetune_classifier import load_and_prepare_data
    with pytest.raises(FileNotFoundError, match="email_score.jsonl"):
        load_and_prepare_data(Path("/nonexistent/email_score.jsonl"))
 def test_load_and_prepare_data_drops_class_with_fewer_than_2_samples(tmp_path, capsys):
    """Classes with < 2 total samples must be dropped with a warning."""
    from scripts.finetune_classifier import load_and_prepare_data
    rows = [
        {"subject": "s1", "body": "b", "label": "digest"},
        {"subject": "s2", "body": "b", "label": "digest"},
        {"subject": "s3", "body": "b", "label": "new_lead"},  # only 1 sample — drop
    ]
    score_file = tmp_path / "email_score.jsonl"
    score_file.write_text("\n".join(json.dumps(r) for r in rows))
    texts, labels = load_and_prepare_data(score_file)
    captured = capsys.readouterr()
    assert "new_lead" not in labels
    assert "new_lead" in captured.out  # warning printed
 # ---- Class weights tests ----
 def test_compute_class_weights_returns_tensor_for_each_class():
    """compute_class_weights must return a float tensor of length n_classes."""
    import torch
    from scripts.finetune_classifier import compute_class_weights
    label_ids = [0, 0, 0, 1, 1, 2]  # 3 classes, imbalanced
    weights = compute_class_weights(label_ids, n_classes=3)
    assert isinstance(weights, torch.Tensor)
    assert weights.shape == (3,)
    assert all(w > 0 for w in weights)
 def test_compute_class_weights_upweights_minority():
    """Minority classes must receive higher weight than majority classes."""
    from scripts.finetune_classifier import compute_class_weights
    # Class 0: 10 samples, Class 1: 2 samples
    label_ids = [0] * 10 + [1] * 2
    weights = compute_class_weights(label_ids, n_classes=2)
    assert weights[1] > weights[0]
 # ---- compute_metrics_for_trainer tests ----
 def test_compute_metrics_for_trainer_returns_macro_f1_key():
    """Must return a dict with 'macro_f1' key."""
    import numpy as np
    from scripts.finetune_classifier import compute_metrics_for_trainer
    from transformers import EvalPrediction
    logits = np.array([[2.0, 0.1], [0.1, 2.0], [2.0, 0.1]])
    labels = np.array([0, 1, 0])
    pred = EvalPrediction(predictions=logits, label_ids=labels)
    result = compute_metrics_for_trainer(pred)
    assert "macro_f1" in result
    assert result["macro_f1"] == pytest.approx(1.0)
 def test_compute_metrics_for_trainer_returns_accuracy_key():
    """Must also return 'accuracy' key."""
    import numpy as np
    from scripts.finetune_classifier import compute_metrics_for_trainer
    from transformers import EvalPrediction
    logits = np.array([[2.0, 0.1], [0.1, 2.0]])
    labels = np.array([0, 1])
    pred = EvalPrediction(predictions=logits, label_ids=labels)
    result = compute_metrics_for_trainer(pred)
    assert "accuracy" in result
    assert result["accuracy"] == pytest.approx(1.0)
 # ---- WeightedTrainer tests ----
 def test_weighted_trainer_compute_loss_returns_scalar():
    """compute_loss must return a scalar tensor when return_outputs=False."""
    import torch
    from unittest.mock import MagicMock
    from scripts.finetune_classifier import WeightedTrainer
    n_classes = 3
    batch = 4
    logits = torch.randn(batch, n_classes)
    mock_outputs = MagicMock()
    mock_outputs.logits = logits
    mock_model = MagicMock(return_value=mock_outputs)
    trainer = WeightedTrainer.__new__(WeightedTrainer)
    trainer.class_weights = torch.ones(n_classes)
    inputs = {
        "input_ids": torch.zeros(batch, 10, dtype=torch.long),
        "labels": torch.randint(0, n_classes, (batch,)),
    }
    loss = trainer.compute_loss(mock_model, inputs, return_outputs=False)
    assert isinstance(loss, torch.Tensor)
    assert loss.ndim == 0  # scalar
 def test_weighted_trainer_compute_loss_accepts_kwargs():
    """compute_loss must not raise TypeError when called with num_items_in_batch kwarg."""
    import torch
    from unittest.mock import MagicMock
    from scripts.finetune_classifier import WeightedTrainer
    n_classes = 3
    batch = 2
    logits = torch.randn(batch, n_classes)
    mock_outputs = MagicMock()
    mock_outputs.logits = logits
    mock_model = MagicMock(return_value=mock_outputs)
    trainer = WeightedTrainer.__new__(WeightedTrainer)
    trainer.class_weights = torch.ones(n_classes)
    inputs = {
        "input_ids": torch.zeros(batch, 5, dtype=torch.long),
        "labels": torch.randint(0, n_classes, (batch,)),
    }
    loss = trainer.compute_loss(mock_model, inputs, return_outputs=False,
                                num_items_in_batch=batch)
    assert isinstance(loss, torch.Tensor)
 def test_weighted_trainer_weighted_loss_differs_from_unweighted():
    """Weighted loss must differ from uniform-weight loss for imbalanced inputs."""
    import torch
    from unittest.mock import MagicMock
    from scripts.finetune_classifier import WeightedTrainer
    n_classes = 2
    batch = 4
    # Mixed labels: 3× class-0, 1× class-1.
    # Asymmetric logits (class-0 samples predicted well, class-1 predicted poorly)
    # ensure per-class CE values differ, so re-weighting changes the weighted mean.
    labels = torch.tensor([0, 0, 0, 1], dtype=torch.long)
    logits = torch.tensor([[3.0, -1.0], [3.0, -1.0], [3.0, -1.0], [0.5, 0.5]])
    mock_outputs = MagicMock()
    mock_outputs.logits = logits
    trainer_uniform = WeightedTrainer.__new__(WeightedTrainer)
    trainer_uniform.class_weights = torch.ones(n_classes)
    inputs_uniform = {"input_ids": torch.zeros(batch, 5, dtype=torch.long), "labels": labels.clone()}
    loss_uniform = trainer_uniform.compute_loss(MagicMock(return_value=mock_outputs),
                                                inputs_uniform)
    trainer_weighted = WeightedTrainer.__new__(WeightedTrainer)
    trainer_weighted.class_weights = torch.tensor([0.1, 10.0])
    inputs_weighted = {"input_ids": torch.zeros(batch, 5, dtype=torch.long), "labels": labels.clone()}
    mock_outputs2 = MagicMock()
    mock_outputs2.logits = logits.clone()
    loss_weighted = trainer_weighted.compute_loss(MagicMock(return_value=mock_outputs2),
                                                  inputs_weighted)
    assert not torch.isclose(loss_uniform, loss_weighted)
 def test_weighted_trainer_compute_loss_returns_outputs_when_requested():
    """compute_loss with return_outputs=True must return (loss, outputs) tuple."""
    import torch
    from unittest.mock import MagicMock
    from scripts.finetune_classifier import WeightedTrainer
    n_classes = 3
    batch = 2
    logits = torch.randn(batch, n_classes)
    mock_outputs = MagicMock()
    mock_outputs.logits = logits
    mock_model = MagicMock(return_value=mock_outputs)
    trainer = WeightedTrainer.__new__(WeightedTrainer)
    trainer.class_weights = torch.ones(n_classes)
    inputs = {
        "input_ids": torch.zeros(batch, 5, dtype=torch.long),
        "labels": torch.randint(0, n_classes, (batch,)),
    }
    result = trainer.compute_loss(mock_model, inputs, return_outputs=True)
    assert isinstance(result, tuple)
    loss, outputs = result
    assert isinstance(loss, torch.Tensor)