fix(avocet): reduce deberta-small VRAM + auto-select freest GPU for training
- deberta-small: batch_size 16→8 and grad_accum 1→2 (same effective batch of 16); gradient_checkpointing=True. fp16 stays off: DeBERTa-v3's disentangled attention overflows fp16 at the gather step.
- api: _best_cuda_device() picks the GPU with the most free VRAM via nvidia-smi and sets CUDA_VISIBLE_DEVICES in the subprocess env, preventing DataParallel from replicating the model across both GPUs; also sets PYTORCH_ALLOC_CONF=expandable_segments.
- The SSE log now reports which GPU was selected.
This commit is contained in:
parent
606917f90f
commit
5dee23f53c
2 changed files with 49 additions and 3 deletions
42
app/api.py
42
app/api.py
|
|
@ -7,6 +7,8 @@ from __future__ import annotations
|
|||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import subprocess as _subprocess
|
||||
import yaml
|
||||
from pathlib import Path
|
||||
|
||||
|
|
@ -27,6 +29,34 @@ def set_data_dir(path: Path) -> None:
|
|||
_DATA_DIR = path
|
||||
|
||||
|
||||
def _best_cuda_device() -> str:
|
||||
"""Return the index of the GPU with the most free VRAM as a string.
|
||||
|
||||
Uses nvidia-smi so it works in the job-seeker env (no torch). Returns ""
|
||||
if nvidia-smi is unavailable or no GPUs are found. Restricting the
|
||||
training subprocess to a single GPU via CUDA_VISIBLE_DEVICES prevents
|
||||
PyTorch DataParallel from replicating the model across all GPUs, which
|
||||
would OOM the GPU with less headroom.
|
||||
"""
|
||||
try:
|
||||
out = _subprocess.check_output(
|
||||
["nvidia-smi", "--query-gpu=index,memory.free",
|
||||
"--format=csv,noheader,nounits"],
|
||||
text=True,
|
||||
timeout=5,
|
||||
)
|
||||
best_idx, best_free = "", 0
|
||||
for line in out.strip().splitlines():
|
||||
parts = line.strip().split(", ")
|
||||
if len(parts) == 2:
|
||||
idx, free = parts[0].strip(), int(parts[1].strip())
|
||||
if free > best_free:
|
||||
best_free, best_idx = free, idx
|
||||
return best_idx
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def set_models_dir(path: Path) -> None:
|
||||
"""Override models directory — used by tests."""
|
||||
global _MODELS_DIR
|
||||
|
|
@ -391,7 +421,18 @@ def run_finetune_endpoint(
|
|||
raise HTTPException(400, f"Invalid score path: {score_file!r}")
|
||||
cmd.extend(["--score", str(resolved)])
|
||||
|
||||
# Pick the GPU with the most free VRAM. Setting CUDA_VISIBLE_DEVICES to a
|
||||
# single device prevents DataParallel from replicating the model across all
|
||||
# GPUs, which would force a full copy onto the more memory-constrained device.
|
||||
proc_env = {**os.environ, "PYTORCH_ALLOC_CONF": "expandable_segments:True"}
|
||||
best_gpu = _best_cuda_device()
|
||||
if best_gpu:
|
||||
proc_env["CUDA_VISIBLE_DEVICES"] = best_gpu
|
||||
|
||||
gpu_note = f"GPU {best_gpu}" if best_gpu else "CPU (no GPU found)"
|
||||
|
||||
def generate():
|
||||
yield f"data: {json.dumps({'type': 'progress', 'message': f'[api] Using {gpu_note} (most free VRAM)'})}\n\n"
|
||||
try:
|
||||
proc = subprocess.Popen(
|
||||
cmd,
|
||||
|
|
@ -400,6 +441,7 @@ def run_finetune_endpoint(
|
|||
text=True,
|
||||
bufsize=1,
|
||||
cwd=str(_ROOT),
|
||||
env=proc_env,
|
||||
)
|
||||
for line in proc.stdout:
|
||||
line = line.rstrip()
|
||||
|
|
|
|||
|
|
@ -42,10 +42,14 @@ _MODEL_CONFIG: dict[str, dict[str, Any]] = {
|
|||
"deberta-small": {
|
||||
"base_model_id": "cross-encoder/nli-deberta-v3-small",
|
||||
"max_tokens": 512,
|
||||
# fp16 must stay OFF — DeBERTa-v3 disentangled attention overflows fp16.
|
||||
"fp16": False,
|
||||
"batch_size": 16,
|
||||
"grad_accum": 1,
|
||||
"gradient_checkpointing": False,
|
||||
# batch_size=8 + grad_accum=2 keeps effective batch of 16 while halving
|
||||
# per-step activation memory. gradient_checkpointing recomputes activations
|
||||
# on backward instead of storing them — ~60% less activation VRAM.
|
||||
"batch_size": 8,
|
||||
"grad_accum": 2,
|
||||
"gradient_checkpointing": True,
|
||||
},
|
||||
"bge-m3": {
|
||||
"base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
|
||||
|
|
|
|||
Loading…
Reference in a new issue