fix(avocet): reduce deberta-small VRAM + auto-select freest GPU for training
- deberta-small: batch_size 16→8 + grad_accum 1→2 (same effective batch), gradient_checkpointing=True (fp16 stays off: DeBERTa v3 disentangled attention overflows fp16 at the gather step)
- api: _best_cuda_device() picks highest free-VRAM GPU via nvidia-smi; sets CUDA_VISIBLE_DEVICES in subprocess env to prevent DataParallel replication across both GPUs; adds PYTORCH_ALLOC_CONF=expandable_segments
- SSE log now reports which GPU was selected
This commit is contained in:
parent
606917f90f
commit
5dee23f53c
2 changed files with 49 additions and 3 deletions
42
app/api.py
42
app/api.py
|
|
@ -7,6 +7,8 @@ from __future__ import annotations
|
||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess as _subprocess
|
||||||
import yaml
|
import yaml
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
@ -27,6 +29,34 @@ def set_data_dir(path: Path) -> None:
|
||||||
_DATA_DIR = path
|
_DATA_DIR = path
|
||||||
|
|
||||||
|
|
||||||
|
def _best_cuda_device() -> str:
|
||||||
|
"""Return the index of the GPU with the most free VRAM as a string.
|
||||||
|
|
||||||
|
Uses nvidia-smi so it works in the job-seeker env (no torch). Returns ""
|
||||||
|
if nvidia-smi is unavailable or no GPUs are found. Restricting the
|
||||||
|
training subprocess to a single GPU via CUDA_VISIBLE_DEVICES prevents
|
||||||
|
PyTorch DataParallel from replicating the model across all GPUs, which
|
||||||
|
would OOM the GPU with less headroom.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
out = _subprocess.check_output(
|
||||||
|
["nvidia-smi", "--query-gpu=index,memory.free",
|
||||||
|
"--format=csv,noheader,nounits"],
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
best_idx, best_free = "", 0
|
||||||
|
for line in out.strip().splitlines():
|
||||||
|
parts = line.strip().split(", ")
|
||||||
|
if len(parts) == 2:
|
||||||
|
idx, free = parts[0].strip(), int(parts[1].strip())
|
||||||
|
if free > best_free:
|
||||||
|
best_free, best_idx = free, idx
|
||||||
|
return best_idx
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
def set_models_dir(path: Path) -> None:
|
def set_models_dir(path: Path) -> None:
|
||||||
"""Override models directory — used by tests."""
|
"""Override models directory — used by tests."""
|
||||||
global _MODELS_DIR
|
global _MODELS_DIR
|
||||||
|
|
@ -391,7 +421,18 @@ def run_finetune_endpoint(
|
||||||
raise HTTPException(400, f"Invalid score path: {score_file!r}")
|
raise HTTPException(400, f"Invalid score path: {score_file!r}")
|
||||||
cmd.extend(["--score", str(resolved)])
|
cmd.extend(["--score", str(resolved)])
|
||||||
|
|
||||||
|
# Pick the GPU with the most free VRAM. Setting CUDA_VISIBLE_DEVICES to a
|
||||||
|
# single device prevents DataParallel from replicating the model across all
|
||||||
|
# GPUs, which would force a full copy onto the more memory-constrained device.
|
||||||
|
proc_env = {**os.environ, "PYTORCH_ALLOC_CONF": "expandable_segments:True"}
|
||||||
|
best_gpu = _best_cuda_device()
|
||||||
|
if best_gpu:
|
||||||
|
proc_env["CUDA_VISIBLE_DEVICES"] = best_gpu
|
||||||
|
|
||||||
|
gpu_note = f"GPU {best_gpu}" if best_gpu else "CPU (no GPU found)"
|
||||||
|
|
||||||
def generate():
|
def generate():
|
||||||
|
yield f"data: {json.dumps({'type': 'progress', 'message': f'[api] Using {gpu_note} (most free VRAM)'})}\n\n"
|
||||||
try:
|
try:
|
||||||
proc = subprocess.Popen(
|
proc = subprocess.Popen(
|
||||||
cmd,
|
cmd,
|
||||||
|
|
@ -400,6 +441,7 @@ def run_finetune_endpoint(
|
||||||
text=True,
|
text=True,
|
||||||
bufsize=1,
|
bufsize=1,
|
||||||
cwd=str(_ROOT),
|
cwd=str(_ROOT),
|
||||||
|
env=proc_env,
|
||||||
)
|
)
|
||||||
for line in proc.stdout:
|
for line in proc.stdout:
|
||||||
line = line.rstrip()
|
line = line.rstrip()
|
||||||
|
|
|
||||||
|
|
@ -42,10 +42,14 @@ _MODEL_CONFIG: dict[str, dict[str, Any]] = {
|
||||||
"deberta-small": {
|
"deberta-small": {
|
||||||
"base_model_id": "cross-encoder/nli-deberta-v3-small",
|
"base_model_id": "cross-encoder/nli-deberta-v3-small",
|
||||||
"max_tokens": 512,
|
"max_tokens": 512,
|
||||||
|
# fp16 must stay OFF — DeBERTa-v3 disentangled attention overflows fp16.
|
||||||
"fp16": False,
|
"fp16": False,
|
||||||
"batch_size": 16,
|
# batch_size=8 + grad_accum=2 keeps effective batch of 16 while halving
|
||||||
"grad_accum": 1,
|
# per-step activation memory. gradient_checkpointing recomputes activations
|
||||||
"gradient_checkpointing": False,
|
# on backward instead of storing them — ~60% less activation VRAM.
|
||||||
|
"batch_size": 8,
|
||||||
|
"grad_accum": 2,
|
||||||
|
"gradient_checkpointing": True,
|
||||||
},
|
},
|
||||||
"bge-m3": {
|
"bge-m3": {
|
||||||
"base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
|
"base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue