diff --git a/app/api.py b/app/api.py
index a437f2f..83490b5 100644
--- a/app/api.py
+++ b/app/api.py
@@ -7,6 +7,8 @@ from __future__ import annotations
 
 import hashlib
 import json
+import os
+import subprocess as _subprocess
 import yaml
 
 from pathlib import Path
@@ -27,6 +29,34 @@ def set_data_dir(path: Path) -> None:
     _DATA_DIR = path
 
 
+def _best_cuda_device() -> str:
+    """Return the index of the GPU with the most free VRAM as a string.
+
+    Uses nvidia-smi so it works in the job-seeker env (no torch). Returns ""
+    if nvidia-smi is unavailable or no GPUs are found. Restricting the
+    training subprocess to a single GPU via CUDA_VISIBLE_DEVICES prevents
+    PyTorch DataParallel from replicating the model across all GPUs, which
+    would OOM the GPU with less headroom.
+    """
+    try:
+        out = _subprocess.check_output(
+            ["nvidia-smi", "--query-gpu=index,memory.free",
+             "--format=csv,noheader,nounits"],
+            text=True,
+            timeout=5,
+        )
+        best_idx, best_free = "", 0
+        for line in out.strip().splitlines():
+            parts = line.strip().split(", ")
+            if len(parts) == 2:
+                idx, free = parts[0].strip(), int(parts[1].strip())
+                if free > best_free:
+                    best_free, best_idx = free, idx
+        return best_idx
+    except Exception:
+        return ""
+
+
 def set_models_dir(path: Path) -> None:
     """Override models directory — used by tests."""
     global _MODELS_DIR
@@ -391,7 +421,18 @@ def run_finetune_endpoint(
             raise HTTPException(400, f"Invalid score path: {score_file!r}")
         cmd.extend(["--score", str(resolved)])
 
+    # Pick the GPU with the most free VRAM. Setting CUDA_VISIBLE_DEVICES to a
+    # single device prevents DataParallel from replicating the model across all
+    # GPUs, which would force a full copy onto the more memory-constrained device.
+    proc_env = {**os.environ, "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True"}
+    best_gpu = _best_cuda_device()
+    if best_gpu:
+        proc_env["CUDA_VISIBLE_DEVICES"] = best_gpu
+
+    gpu_note = f"GPU {best_gpu}" if best_gpu else "CPU (no GPU found)"
+
     def generate():
+        yield f"data: {json.dumps({'type': 'progress', 'message': f'[api] Using {gpu_note} (most free VRAM)'})}\n\n"
         try:
             proc = subprocess.Popen(
                 cmd,
@@ -400,6 +441,7 @@ def run_finetune_endpoint(
                 text=True,
                 bufsize=1,
                 cwd=str(_ROOT),
+                env=proc_env,
             )
             for line in proc.stdout:
                 line = line.rstrip()
diff --git a/scripts/finetune_classifier.py b/scripts/finetune_classifier.py
index 9bd832e..c70929e 100644
--- a/scripts/finetune_classifier.py
+++ b/scripts/finetune_classifier.py
@@ -42,10 +42,14 @@ _MODEL_CONFIG: dict[str, dict[str, Any]] = {
     "deberta-small": {
         "base_model_id": "cross-encoder/nli-deberta-v3-small",
         "max_tokens": 512,
+        # fp16 must stay OFF — DeBERTa-v3 disentangled attention overflows fp16.
         "fp16": False,
-        "batch_size": 16,
-        "grad_accum": 1,
-        "gradient_checkpointing": False,
+        # batch_size=8 + grad_accum=2 keeps effective batch of 16 while halving
+        # per-step activation memory. gradient_checkpointing recomputes activations
+        # on backward instead of storing them — ~60% less activation VRAM.
+        "batch_size": 8,
+        "grad_accum": 2,
+        "gradient_checkpointing": True,
     },
     "bge-m3": {
         "base_model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",