feat(avocet): run_finetune, CLI, multi-score-file merge with last-write-wins dedup

- load_and_prepare_data() now accepts Path | list[Path]; single-Path callers are unchanged
- Dedup by MD5(subject + body[:100]); last file/row wins (lets later runs correct labels)
- Prints a summary line when duplicates are dropped
- Added _EmailDataset (TorchDataset wrapper), run_finetune(), and an argparse CLI
- run_finetune() saves model + tokenizer + training_info.json with score_files provenance
- Stratified split guard: val set size clamped to at least n_classes (handles tiny example data)
- 3 new unit tests (merge, last-write-wins dedup, single-Path compat) + 1 integration test
- All 16 tests pass (15 unit + 1 integration)
2026-03-15 15:52:41 -07:00 · 2026-03-15 15:52:41 -07:00 · 8ba34bb2d1
commit 8ba34bb2d1
parent f262b23cf5
2 changed files with 385 additions and 24 deletions
--- a/scripts/finetune_classifier.py
+++ b/scripts/finetune_classifier.py
@ -10,6 +10,7 @@ Supported --model values: deberta-small, bge-m3
 from __future__ import annotations
 import argparse
 import hashlib
 import json
 import sys
 from collections import Counter
@ -56,39 +57,68 @@ _MODEL_CONFIG: dict[str, dict[str, Any]] = {
 }
-def load_and_prepare_data(score_file: Path) -> tuple[list[str], list[str]]:
+def load_and_prepare_data(score_files: Path | list[Path]) -> tuple[list[str], list[str]]:
    """Load labeled JSONL and return (texts, labels) filtered to canonical LABELS.
    score_files: a single Path or a list of Paths. When multiple files are given,
    rows are merged with last-write-wins deduplication keyed by content hash
    (MD5 of subject + body[:100]).
    Drops rows with non-canonical labels (with warning), and drops entire classes
    that have fewer than 2 total samples (required for stratified split).
    Warns (but continues) for classes with fewer than 5 samples.
    """
-    if not score_file.exists():
+    # Normalise to list — backwards compatible with single-Path callers.
-        raise FileNotFoundError(
+    if isinstance(score_files, Path):
-            f"Labeled data not found: {score_file}\n"
+        score_files = [score_files]
-            "Run the label tool first to generate email_score.jsonl."
+
-        )
+    for score_file in score_files:
        if not score_file.exists():
            raise FileNotFoundError(
                f"Labeled data not found: {score_file}\n"
                "Run the label tool first to generate email_score.jsonl."
            )
    label_set = set(LABELS)
-    rows: list[dict] = []
+    # Use a plain dict keyed by content hash; later entries overwrite earlier ones
    # (last-write wins), which lets later labeling runs correct earlier labels.
    seen: dict[str, dict] = {}
    total = 0
-    with score_file.open() as fh:
+    for score_file in score_files:
-        for line in fh:
+        with score_file.open() as fh:
-            line = line.strip()
+            for line in fh:
-            if not line:
+                line = line.strip()
-                continue
+                if not line:
-            try:
+                    continue
-                r = json.loads(line)
+                try:
-            except json.JSONDecodeError:
+                    r = json.loads(line)
-                continue
+                except json.JSONDecodeError:
-            lbl = r.get("label", "")
+                    continue
-            if lbl not in label_set:
+                lbl = r.get("label", "")
-                print(
+                if lbl not in label_set:
-                    f"[data] WARNING: Dropping row with non-canonical label {lbl!r}",
+                    print(
-                    flush=True,
+                        f"[data] WARNING: Dropping row with non-canonical label {lbl!r}",
-                )
+                        flush=True,
-                continue
+                    )
-            rows.append(r)
+                    continue
                content_hash = hashlib.md5(
                    (r.get("subject", "") + (r.get("body", "") or "")[:100]).encode(
                        "utf-8", errors="replace"
                    )
                ).hexdigest()
                seen[content_hash] = r
                total += 1
    kept = len(seen)
    dropped = total - kept
    if dropped > 0:
        print(
            f"[data] Deduped: kept {kept} of {total} rows (dropped {dropped} duplicates)",
            flush=True,
        )
    rows = list(seen.values())
    # Count samples per class
    counts: Counter = Counter(r["label"] for r in rows)
@ -164,3 +194,214 @@ class WeightedTrainer(Trainer):
        weight = self.class_weights.to(outputs.logits.device)
        loss = F.cross_entropy(outputs.logits, labels, weight=weight)
        return (loss, outputs) if return_outputs else loss
 # ---------------------------------------------------------------------------
 # Training dataset wrapper
 # ---------------------------------------------------------------------------
 from torch.utils.data import Dataset as TorchDataset
 class _EmailDataset(TorchDataset):
    """Map-style dataset pairing tokenizer encodings with integer class ids.

    encodings: mapping of field name -> per-sample sequences (run_finetune
    passes the tokenizer output here); each field is indexed by sample.
    label_ids: one integer class id per sample; its length is the dataset size.
    """

    def __init__(self, encodings: dict, label_ids: list[int]) -> None:
        self.encodings = encodings
        self.label_ids = label_ids

    def __len__(self) -> int:
        # The label list is the source of truth for the number of samples.
        return len(self.label_ids)

    def __getitem__(self, idx: int) -> dict:
        """Return one sample as a dict of tensors, with the target under "labels"."""
        sample: dict = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        # Stored as torch.long explicitly; run_finetune feeds these to a
        # cross-entropy loss via the trainer.
        sample["labels"] = torch.tensor(self.label_ids[idx], dtype=torch.long)
        return sample
 # ---------------------------------------------------------------------------
 # Main training function
 # ---------------------------------------------------------------------------
 def run_finetune(model_key: str, epochs: int = 5, score_files: list[Path] | None = None) -> None:
    """Fine-tune the specified model on labeled data.

    model_key: key into _MODEL_CONFIG selecting the base model and its settings.
    epochs: maximum number of training epochs; early stopping may end sooner.
    score_files: list of score JSONL paths to merge. None or an empty list
    falls back to [_ROOT / "data" / "email_score.jsonl"].

    Saves model + tokenizer + training_info.json to models/avocet-{model_key}/.
    All prints use flush=True for SSE streaming.

    Raises ValueError for an unknown model_key or when no labeled samples are
    available, and FileNotFoundError (from load_and_prepare_data) when a score
    file is missing.
    """
    if model_key not in _MODEL_CONFIG:
        raise ValueError(f"Unknown model key: {model_key!r}. Choose from: {list(_MODEL_CONFIG)}")

    # An explicit empty list is treated like "not given": there is nothing to
    # train on otherwise, and the CLI applies the same fallback.
    if not score_files:
        score_files = [_ROOT / "data" / "email_score.jsonl"]

    config = _MODEL_CONFIG[model_key]
    base_model_id = config["base_model_id"]
    output_dir = _ROOT / "models" / f"avocet-{model_key}"

    print(f"[finetune] Model: {model_key} ({base_model_id})", flush=True)
    print(f"[finetune] Score files: {[str(f) for f in score_files]}", flush=True)
    print(f"[finetune] Output: {output_dir}", flush=True)
    if output_dir.exists():
        print(f"[finetune] WARNING: {output_dir} already exists — will overwrite.", flush=True)

    # --- Data ---
    print("[finetune] Loading data ...", flush=True)
    texts, str_labels = load_and_prepare_data(score_files)
    if not texts:
        # Fail early with a clear message instead of an opaque split error.
        raise ValueError(
            f"No usable labeled samples found in: {[str(f) for f in score_files]}"
        )

    present_labels = sorted(set(str_labels))
    label2id = {label: idx for idx, label in enumerate(present_labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    n_classes = len(present_labels)
    label_ids = [label2id[label] for label in str_labels]

    print(f"[finetune] {len(texts)} samples, {n_classes} classes", flush=True)

    # Stratified 80/20 split — ensure val set has at least n_classes samples.
    # For very small datasets (e.g. example data) we may need to give the val set
    # more than 20% so every class appears at least once in eval.
    desired_test = max(int(len(texts) * 0.2), n_classes)
    # test_size must leave at least n_classes samples for train too
    desired_test = min(desired_test, len(texts) - n_classes)

    (train_texts, val_texts,
     train_label_ids, val_label_ids) = train_test_split(
        texts, label_ids,
        test_size=desired_test,
        stratify=label_ids,
        random_state=42,
    )
    print(f"[finetune] Train: {len(train_texts)}, Val: {len(val_texts)}", flush=True)

    # Warn for classes with < 5 training samples
    train_counts = Counter(train_label_ids)
    for cls_id, cnt in train_counts.items():
        if cnt < 5:
            print(
                f"[finetune] WARNING: Class {id2label[cls_id]!r} has {cnt} training sample(s). "
                "Eval F1 for this class will be unreliable.",
                flush=True,
            )

    # --- Tokenize ---
    print("[finetune] Loading tokenizer ...", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)

    train_enc = tokenizer(train_texts, truncation=True,
                          max_length=config["max_tokens"], padding=True)
    val_enc = tokenizer(val_texts, truncation=True,
                        max_length=config["max_tokens"], padding=True)

    train_dataset = _EmailDataset(train_enc, train_label_ids)
    val_dataset = _EmailDataset(val_enc, val_label_ids)

    # --- Class weights ---
    class_weights = compute_class_weights(train_label_ids, n_classes)
    print("[finetune] Class weights computed", flush=True)

    # --- Model ---
    print("[finetune] Loading model ...", flush=True)
    model = AutoModelForSequenceClassification.from_pretrained(
        base_model_id,
        num_labels=n_classes,
        ignore_mismatched_sizes=True,   # NLI head (3-class) → new head (n_classes)
        id2label=id2label,
        label2id=label2id,
    )
    if config["gradient_checkpointing"]:
        model.gradient_checkpointing_enable()

    # --- TrainingArguments ---
    training_args = TrainingArguments(
        output_dir=str(output_dir),
        num_train_epochs=epochs,
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["grad_accum"],
        learning_rate=2e-5,
        lr_scheduler_type="linear",
        warmup_ratio=0.1,
        fp16=config["fp16"],
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="macro_f1",
        greater_is_better=True,
        logging_steps=10,
        report_to="none",
        save_total_limit=2,
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics_for_trainer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    # WeightedTrainer.compute_loss reads self.class_weights, so attach them
    # before training starts.
    trainer.class_weights = class_weights

    # --- Train ---
    print(f"[finetune] Starting training ({epochs} epochs) ...", flush=True)
    train_result = trainer.train()
    print(f"[finetune] Training complete. Steps: {train_result.global_step}", flush=True)

    # --- Evaluate ---
    print("[finetune] Evaluating best checkpoint ...", flush=True)
    metrics = trainer.evaluate()
    val_macro_f1 = metrics.get("eval_macro_f1", 0.0)
    val_accuracy = metrics.get("eval_accuracy", 0.0)
    print(f"[finetune] Val macro-F1: {val_macro_f1:.4f}, Accuracy: {val_accuracy:.4f}", flush=True)

    # --- Save model + tokenizer ---
    print(f"[finetune] Saving model to {output_dir} ...", flush=True)
    trainer.save_model(str(output_dir))
    tokenizer.save_pretrained(str(output_dir))

    # --- Write training_info.json ---
    # Early stopping can end training before the requested epoch count, so
    # record what the trainer reports as completed. Fall back to the requested
    # value if the trainer state does not expose an epoch.
    completed_epoch = getattr(getattr(trainer, "state", None), "epoch", None)
    epochs_run = round(completed_epoch) if completed_epoch is not None else epochs

    label_counts = dict(Counter(str_labels))
    info = {
        "name": f"avocet-{model_key}",
        "base_model_id": base_model_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "epochs_run": epochs_run,
        "val_macro_f1": round(val_macro_f1, 4),
        "val_accuracy": round(val_accuracy, 4),
        "sample_count": len(train_texts),
        "label_counts": label_counts,
        "score_files": [str(f) for f in score_files],
    }
    info_path = output_dir / "training_info.json"
    info_path.write_text(json.dumps(info, indent=2), encoding="utf-8")
    print(f"[finetune] Saved training_info.json: val_macro_f1={val_macro_f1:.4f}", flush=True)
    print("[finetune] Done.", flush=True)
 # ---------------------------------------------------------------------------
 # CLI
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
    # Command-line entry point: parse options, then hand off to run_finetune().
    cli = argparse.ArgumentParser(description="Fine-tune an email classifier")
    cli.add_argument(
        "--model", required=True, choices=list(_MODEL_CONFIG),
        help="Model key to fine-tune",
    )
    cli.add_argument(
        "--epochs", type=int, default=5,
        help="Number of training epochs (default: 5)",
    )
    # action="append" collects every --score occurrence into a list; the
    # attribute stays None when the flag is never given.
    cli.add_argument(
        "--score", dest="score_files", type=Path, action="append", metavar="FILE",
        help="Score JSONL file to include (repeatable; defaults to data/email_score.jsonl)",
    )
    opts = cli.parse_args()

    # None → run_finetune uses its default score file.
    chosen_files = opts.score_files if opts.score_files else None
    run_finetune(opts.model, opts.epochs, score_files=chosen_files)
--- a/tests/test_finetune.py
+++ b/tests/test_finetune.py
@ -249,3 +249,123 @@ def test_weighted_trainer_compute_loss_returns_outputs_when_requested():
    assert isinstance(result, tuple)
    loss, outputs = result
    assert isinstance(loss, torch.Tensor)
 # ---- Multi-file merge / dedup tests ----
 def test_load_and_prepare_data_merges_multiple_files(tmp_path):
    """Rows from every score file passed in end up in one combined dataset."""
    from scripts.finetune_classifier import load_and_prepare_data

    def _write_jsonl(path, records):
        # One JSON object per line, each line newline-terminated.
        path.write_text("".join(json.dumps(rec) + "\n" for rec in records))

    first = tmp_path / "run1.jsonl"
    second = tmp_path / "run2.jsonl"
    _write_jsonl(first, [
        {"subject": "s1", "body": "b1", "label": "digest"},
        {"subject": "s2", "body": "b2", "label": "digest"},
    ])
    _write_jsonl(second, [
        {"subject": "s3", "body": "b3", "label": "neutral"},
        {"subject": "s4", "body": "b4", "label": "neutral"},
    ])

    texts, labels = load_and_prepare_data([first, second])

    # Four distinct rows, two per class, nothing deduplicated.
    assert len(texts) == 4
    assert labels.count("digest") == 2
    assert labels.count("neutral") == 2
 def test_load_and_prepare_data_deduplicates_last_write_wins(tmp_path, capsys):
    """When two rows share a content hash, the later one replaces the earlier."""
    from scripts.finetune_classifier import load_and_prepare_data

    # Identical subject + body[:100] → identical hash; only the label differs.
    original_row = {"subject": "Hello", "body": "World", "label": "neutral"}
    relabeled_row = {"subject": "Hello", "body": "World", "label": "digest"}

    # Each file also carries one unrelated row so both classes keep >= 2
    # samples and survive the small-class filter.
    earlier_rows = [
        original_row,
        {"subject": "Other1", "body": "Other", "label": "neutral"},
    ]
    later_rows = [
        relabeled_row,
        {"subject": "Other2", "body": "Stuff", "label": "digest"},
    ]

    earlier_file = tmp_path / "run1.jsonl"
    later_file = tmp_path / "run2.jsonl"
    earlier_file.write_text("".join(json.dumps(row) + "\n" for row in earlier_rows))
    later_file.write_text("".join(json.dumps(row) + "\n" for row in later_rows))

    texts, labels = load_and_prepare_data([earlier_file, later_file])
    stdout_text = capsys.readouterr().out

    # The dropped duplicate must be reported in the summary line.
    assert "Deduped" in stdout_text

    # The surviving "Hello" row carries the later label, not the earlier one.
    hello_positions = (pos for pos, text in enumerate(texts) if text.startswith("Hello [SEP]"))
    hello_idx = next(hello_positions)
    assert labels[hello_idx] == "digest"
 def test_load_and_prepare_data_single_path_still_works(tmp_path):
    """A bare Path argument (the pre-merge calling convention) is still accepted."""
    from scripts.finetune_classifier import load_and_prepare_data

    score_file = tmp_path / "email_score.jsonl"
    serialized = [
        json.dumps({"subject": "s1", "body": "b1", "label": "digest"}),
        json.dumps({"subject": "s2", "body": "b2", "label": "digest"}),
    ]
    # No trailing newline on purpose: the last line has no terminator.
    score_file.write_text("\n".join(serialized))

    # Deliberately pass the Path itself rather than wrapping it in a list.
    texts, labels = load_and_prepare_data(score_file)
    assert len(texts) == 2
 # ---- Integration test ----
 def test_integration_finetune_on_example_data(tmp_path):
    """Fine-tune deberta-small on example data for 1 epoch.

    Uses data/email_score.jsonl.example (8 samples, 5 labels represented).
    The 5 missing labels must trigger the < 2 samples drop warning.
    Verifies training_info.json is written with correct keys.
    Requires job-seeker-classifiers env and downloads deberta-small (~100MB on first run).
    """
    import io
    import shutil
    from contextlib import redirect_stdout

    from scripts import finetune_classifier as ft_mod
    from scripts.finetune_classifier import run_finetune

    example_file = ft_mod._ROOT / "data" / "email_score.jsonl.example"
    if not example_file.exists():
        pytest.skip("email_score.jsonl.example not found")

    # Stage the fixture BEFORE patching the module global: if staging fails,
    # _ROOT has not been touched and nothing leaks into other tests.
    (tmp_path / "data").mkdir()
    shutil.copy(example_file, tmp_path / "data" / "email_score.jsonl")

    # Redirect all of run_finetune's input/output paths into tmp_path. The
    # patch is followed immediately by try/finally so it is always undone.
    orig_root = ft_mod._ROOT
    ft_mod._ROOT = tmp_path
    try:
        captured = io.StringIO()
        with redirect_stdout(captured):
            run_finetune("deberta-small", epochs=1)
        output = captured.getvalue()
    finally:
        ft_mod._ROOT = orig_root

    # Missing labels should trigger the < 2 samples drop warning
    assert "WARNING: Dropping class" in output

    # training_info.json must exist with correct keys
    info_path = tmp_path / "models" / "avocet-deberta-small" / "training_info.json"
    assert info_path.exists(), "training_info.json not written"
    info = json.loads(info_path.read_text())
    for key in ("name", "base_model_id", "timestamp", "epochs_run",
                "val_macro_f1", "val_accuracy", "sample_count",
                "label_counts", "score_files"):
        assert key in info, f"Missing key: {key}"
    assert info["name"] == "avocet-deberta-small"
    assert info["epochs_run"] == 1
    assert isinstance(info["score_files"], list)