diff --git a/app/api.py b/app/api.py
index a437f2f..83490b5 100644
--- a/app/api.py
+++ b/app/api.py
@@ -7,6 +7,8 @@ from __future__ import annotations
 
 import hashlib
 import json
+import os
+import subprocess as _subprocess
 import yaml
 
 from pathlib import Path
@@ -27,6 +29,34 @@ def set_data_dir(path: Path) -> None:
     _DATA_DIR = path
 
 
+def _best_cuda_device() -> str:
+    """Return the index of the GPU with the most free VRAM as a string.
+
+    Uses nvidia-smi so it works in the job-seeker env (no torch). Returns ""
+    if nvidia-smi is unavailable or no GPUs are found. Restricting the
+    training subprocess to a single GPU via CUDA_VISIBLE_DEVICES prevents
+    PyTorch DataParallel from replicating the model across all GPUs, which
+    would OOM the GPU with less headroom.
+    """
+    try:
+        out = _subprocess.check_output(
+            ["nvidia-smi", "--query-gpu=index,memory.free",
+             "--format=csv,noheader,nounits"],
+            text=True,
+            timeout=5,
+        )
+        best_idx, best_free = "", 0
+        for line in out.strip().splitlines():
+            parts = line.strip().split(", ")
+            if len(parts) == 2:
+                idx, free = parts[0].strip(), int(parts[1].strip())
+                if free > best_free:
+                    best_free, best_idx = free, idx
+        return best_idx
+    except Exception:
+        return ""
+
+
 def set_models_dir(path: Path) -> None:
     """Override models directory — used by tests."""
     global _MODELS_DIR
@@ -391,7 +421,18 @@ def run_finetune_endpoint(
             raise HTTPException(400, f"Invalid score path: {score_file!r}")
         cmd.extend(["--score", str(resolved)])
 
+    # Pick the GPU with the most free VRAM. Setting CUDA_VISIBLE_DEVICES to a
+    # single device prevents DataParallel from replicating the model across all
+    # GPUs, which would force a full copy onto the more memory-constrained device.
+    proc_env = {**os.environ, "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"}
+    best_gpu = _best_cuda_device()
+    if best_gpu:
+        proc_env["CUDA_VISIBLE_DEVICES"] = best_gpu
+
+    gpu_note = f"GPU {best_gpu}" if best_gpu else "CPU (no GPU found)"
+
     def generate():
+        yield f"data: {json.dumps({'type': 'progress', 'message': f'[api] Using {gpu_note} (most free VRAM)'})}\n\n"
         try:
             proc = subprocess.Popen(
                 cmd,
@@ -400,6 +441,7 @@ def run_finetune_endpoint(
                 text=True,
                 bufsize=1,
                 cwd=str(_ROOT),
+                env=proc_env,
             )
             for line in proc.stdout:
                 line = line.rstrip()
diff --git a/scripts/finetune_classifier.py b/scripts/finetune_classifier.py
index 9bd832e..c70929e 100644
--- a/scripts/finetune_classifier.py
+++ b/scripts/finetune_classifier.py
@@ -42,10 +42,14 @@ _MODEL_CONFIG: dict[str, dict[str, Any]] = {
     "deberta-small": {
         "base_model_id": "cross-encoder/nli-deberta-v3-small",
         "max_tokens": 512,
+        # fp16 must stay OFF — DeBERTa-v3 disentangled attention overflows fp16.
         "fp16": False,
-        "batch_size": 16,
-        "grad_accum": 1,
-        "gradient_checkpointing": False,
+        # batch_size=8 + grad_accum=2 keeps effective batch of 16 while halving
+        # per-step activation memory. gradient_checkpointing recomputes activations
+        # on backward instead of storing them — ~60% less activation VRAM.
+        "batch_size": 8,
+        "grad_accum": 2,
+        "gradient_checkpointing": True,
     },
     "bge-m3": {
         "base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",