fix(avocet): reduce deberta-small VRAM + auto-select freest GPU for training
- deberta-small: batch_size 16→8 + grad_accum 1→2 (same effective batch), gradient_checkpointing=True (fp16 stays off: DeBERTa v3 disentangled attention overflows fp16 at the gather step)
- api: _best_cuda_device() picks highest free-VRAM GPU via nvidia-smi; sets CUDA_VISIBLE_DEVICES in subprocess env to prevent DataParallel replication across both GPUs; adds PYTORCH_ALLOC_CONF=expandable_segments
- SSE log now reports which GPU was selected
This commit is contained in:
parent
606917f90f
commit
5dee23f53c
2 changed files with 49 additions and 3 deletions
42
app/api.py
42
app/api.py
|
|
@ -7,6 +7,8 @@ from __future__ import annotations
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess as _subprocess
|
||||||
import yaml
|
import yaml
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
@ -27,6 +29,34 @@ def set_data_dir(path: Path) -> None:
|
||||||
_DATA_DIR = path
|
_DATA_DIR = path
|
||||||
|
|
||||||
|
|
||||||
|
def _best_cuda_device() -> str:
|
||||||
|
"""Return the index of the GPU with the most free VRAM as a string.
|
||||||
|
|
||||||
|
Uses nvidia-smi so it works in the job-seeker env (no torch). Returns ""
|
||||||
|
if nvidia-smi is unavailable or no GPUs are found. Restricting the
|
||||||
|
training subprocess to a single GPU via CUDA_VISIBLE_DEVICES prevents
|
||||||
|
PyTorch DataParallel from replicating the model across all GPUs, which
|
||||||
|
would OOM the GPU with less headroom.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
out = _subprocess.check_output(
|
||||||
|
["nvidia-smi", "--query-gpu=index,memory.free",
|
||||||
|
"--format=csv,noheader,nounits"],
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
best_idx, best_free = "", 0
|
||||||
|
for line in out.strip().splitlines():
|
||||||
|
parts = line.strip().split(", ")
|
||||||
|
if len(parts) == 2:
|
||||||
|
idx, free = parts[0].strip(), int(parts[1].strip())
|
||||||
|
if free > best_free:
|
||||||
|
best_free, best_idx = free, idx
|
||||||
|
return best_idx
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def set_models_dir(path: Path) -> None:
|
def set_models_dir(path: Path) -> None:
|
||||||
"""Override models directory — used by tests."""
|
"""Override models directory — used by tests."""
|
||||||
global _MODELS_DIR
|
global _MODELS_DIR
|
||||||
|
|
@ -391,7 +421,18 @@ def run_finetune_endpoint(
|
||||||
raise HTTPException(400, f"Invalid score path: {score_file!r}")
|
raise HTTPException(400, f"Invalid score path: {score_file!r}")
|
||||||
cmd.extend(["--score", str(resolved)])
|
cmd.extend(["--score", str(resolved)])
|
||||||
|
|
||||||
|
# Pick the GPU with the most free VRAM. Setting CUDA_VISIBLE_DEVICES to a
|
||||||
|
# single device prevents DataParallel from replicating the model across all
|
||||||
|
# GPUs, which would force a full copy onto the more memory-constrained device.
|
||||||
|
proc_env = {**os.environ, "PYTORCH_ALLOC_CONF": "expandable_segments:True"}
|
||||||
|
best_gpu = _best_cuda_device()
|
||||||
|
if best_gpu:
|
||||||
|
proc_env["CUDA_VISIBLE_DEVICES"] = best_gpu
|
||||||
|
|
||||||
|
gpu_note = f"GPU {best_gpu}" if best_gpu else "CPU (no GPU found)"
|
||||||
|
|
||||||
def generate():
|
def generate():
|
||||||
|
yield f"data: {json.dumps({'type': 'progress', 'message': f'[api] Using {gpu_note} (most free VRAM)'})}\n\n"
|
||||||
try:
|
try:
|
||||||
proc = subprocess.Popen(
|
proc = subprocess.Popen(
|
||||||
cmd,
|
cmd,
|
||||||
|
|
@ -400,6 +441,7 @@ def run_finetune_endpoint(
|
||||||
text=True,
|
text=True,
|
||||||
bufsize=1,
|
bufsize=1,
|
||||||
cwd=str(_ROOT),
|
cwd=str(_ROOT),
|
||||||
|
env=proc_env,
|
||||||
)
|
)
|
||||||
for line in proc.stdout:
|
for line in proc.stdout:
|
||||||
line = line.rstrip()
|
line = line.rstrip()
|
||||||
|
|
|
||||||
|
|
@ -42,10 +42,14 @@ _MODEL_CONFIG: dict[str, dict[str, Any]] = {
|
||||||
"deberta-small": {
|
"deberta-small": {
|
||||||
"base_model_id": "cross-encoder/nli-deberta-v3-small",
|
"base_model_id": "cross-encoder/nli-deberta-v3-small",
|
||||||
"max_tokens": 512,
|
"max_tokens": 512,
|
||||||
|
# fp16 must stay OFF — DeBERTa-v3 disentangled attention overflows fp16.
|
||||||
"fp16": False,
|
"fp16": False,
|
||||||
"batch_size": 16,
|
# batch_size=8 + grad_accum=2 keeps effective batch of 16 while halving
|
||||||
"grad_accum": 1,
|
# per-step activation memory. gradient_checkpointing recomputes activations
|
||||||
"gradient_checkpointing": False,
|
# on backward instead of storing them — ~60% less activation VRAM.
|
||||||
|
"batch_size": 8,
|
||||||
|
"grad_accum": 2,
|
||||||
|
"gradient_checkpointing": True,
|
||||||
},
|
},
|
||||||
"bge-m3": {
|
"bge-m3": {
|
||||||
"base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
|
"base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue