fix(avocet): reduce deberta-small VRAM + auto-select freest GPU for training

- deberta-small: batch_size 16→8 + grad_accum 1→2 (same effective batch),
  gradient_checkpointing=True (fp16 stays off: DeBERTa v3 disentangled
  attention overflows fp16 at the gather step)
- api: _best_cuda_device() picks highest free-VRAM GPU via nvidia-smi;
  sets CUDA_VISIBLE_DEVICES in subprocess env to prevent DataParallel
  replication across both GPUs; adds PYTORCH_ALLOC_CONF=expandable_segments
- SSE log now reports which GPU was selected
This commit is contained in:
pyr0ball 2026-03-15 17:09:06 -07:00
parent 606917f90f
commit 5dee23f53c
2 changed files with 49 additions and 3 deletions

View file

@ -7,6 +7,8 @@ from __future__ import annotations
import hashlib
import json
import os
import subprocess as _subprocess
import yaml
from pathlib import Path
@ -27,6 +29,34 @@ def set_data_dir(path: Path) -> None:
_DATA_DIR = path
def _best_cuda_device() -> str:
    """Return the index of the GPU with the most free VRAM as a string.

    Shells out to nvidia-smi (so no torch dependency is needed in this
    process) and picks the device reporting the largest free-memory figure.
    Returns "" when nvidia-smi is missing, times out, or reports no usable
    GPU. Pinning the training subprocess to one device via
    CUDA_VISIBLE_DEVICES stops PyTorch DataParallel from mirroring the model
    onto every visible GPU, which would OOM the one with less headroom.
    """
    try:
        report = _subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,memory.free",
             "--format=csv,noheader,nounits"],
            text=True,
            timeout=5,
        )
        chosen = ""
        headroom = 0
        for row in report.strip().splitlines():
            fields = row.strip().split(", ")
            if len(fields) != 2:
                continue
            free_mib = int(fields[1].strip())
            if free_mib > headroom:
                headroom = free_mib
                chosen = fields[0].strip()
        return chosen
    except Exception:
        # Best-effort: any failure (no driver, parse error, timeout) means
        # "no GPU selected" and the caller falls back to default visibility.
        return ""
def set_models_dir(path: Path) -> None:
"""Override models directory — used by tests."""
global _MODELS_DIR
@ -391,7 +421,18 @@ def run_finetune_endpoint(
raise HTTPException(400, f"Invalid score path: {score_file!r}")
cmd.extend(["--score", str(resolved)])
# Pick the GPU with the most free VRAM. Setting CUDA_VISIBLE_DEVICES to a
# single device prevents DataParallel from replicating the model across all
# GPUs, which would force a full copy onto the more memory-constrained device.
proc_env = {**os.environ, "PYTORCH_ALLOC_CONF": "expandable_segments:True"}
best_gpu = _best_cuda_device()
if best_gpu:
proc_env["CUDA_VISIBLE_DEVICES"] = best_gpu
gpu_note = f"GPU {best_gpu}" if best_gpu else "CPU (no GPU found)"
def generate():
yield f"data: {json.dumps({'type': 'progress', 'message': f'[api] Using {gpu_note} (most free VRAM)'})}\n\n"
try:
proc = subprocess.Popen(
cmd,
@ -400,6 +441,7 @@ def run_finetune_endpoint(
text=True,
bufsize=1,
cwd=str(_ROOT),
env=proc_env,
)
for line in proc.stdout:
line = line.rstrip()

View file

@ -42,10 +42,14 @@ _MODEL_CONFIG: dict[str, dict[str, Any]] = {
"deberta-small": {
"base_model_id": "cross-encoder/nli-deberta-v3-small",
"max_tokens": 512,
# fp16 must stay OFF — DeBERTa-v3 disentangled attention overflows fp16.
"fp16": False,
"batch_size": 16,
"grad_accum": 1,
"gradient_checkpointing": False,
# batch_size=8 + grad_accum=2 keeps effective batch of 16 while halving
# per-step activation memory. gradient_checkpointing recomputes activations
# on backward instead of storing them — ~60% less activation VRAM.
"batch_size": 8,
"grad_accum": 2,
"gradient_checkpointing": True,
},
"bge-m3": {
"base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",