feat(resources): add GpuMonitor for nvidia-smi polling

This commit is contained in:
pyr0ball 2026-03-30 20:42:57 -07:00
parent 6b239b76e3
commit 3dcbe801f1
3 changed files with 112 additions and 0 deletions

View file

@@ -0,0 +1,52 @@
from __future__ import annotations
import logging
import subprocess
from circuitforge_core.resources.models import GpuInfo
# Argument vector passed to subprocess: asks nvidia-smi for each GPU's index,
# name and total/used/free memory, formatted as CSV with no header row and no
# unit suffixes.
_NVIDIA_SMI_CMD = [
    "nvidia-smi",
    "--query-gpu=index,name,memory.total,memory.used,memory.free",
    "--format=csv,noheader,nounits",
]

# Logger named after this module.
logger = logging.getLogger(__name__)


class GpuMonitor:
    """Query ``nvidia-smi`` and expose the result as ``GpuInfo`` records."""

    def poll(self) -> list[GpuInfo]:
        """Run ``nvidia-smi`` once and return one ``GpuInfo`` per parsed row.

        Returns:
            The parsed GPUs, or an empty list when ``nvidia-smi`` cannot be
            launched, does not finish within 5 seconds, or exits with a
            non-zero status. Each failure is logged as a warning.
        """
        try:
            result = subprocess.run(
                _NVIDIA_SMI_CMD,
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            # OSError covers FileNotFoundError (binary absent) as well as
            # PermissionError and other launch failures, so any inability to
            # start the tool degrades to "no GPUs" instead of raising.
            logger.warning("nvidia-smi unavailable: %s", exc)
            return []
        if result.returncode != 0:
            logger.warning("nvidia-smi exited %d", result.returncode)
            return []
        return self._parse(result.stdout)

    def _parse(self, output: str) -> list[GpuInfo]:
        """Convert ``nvidia-smi`` CSV text into a list of ``GpuInfo``.

        Every non-blank line must contain exactly five comma-separated
        fields: index, name, then total, used and free memory. Lines with a
        different field count, or whose numeric fields are not integers, are
        logged at debug level and skipped.

        Args:
            output: Raw standard output captured from ``nvidia-smi``.

        Returns:
            The successfully parsed rows, in input order.
        """
        gpus: list[GpuInfo] = []
        for line in output.strip().splitlines():
            if not line.strip():
                # Blank separator lines carry no data; skip them quietly.
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) != 5:
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
                continue
            try:
                gpus.append(GpuInfo(
                    gpu_id=int(parts[0]),
                    name=parts[1],
                    vram_total_mb=int(parts[2]),
                    vram_used_mb=int(parts[3]),
                    vram_free_mb=int(parts[4]),
                ))
            except ValueError:
                # int() rejected a field (e.g. a non-numeric placeholder).
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
        return gpus



View file

@@ -0,0 +1,60 @@
from unittest.mock import patch
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
# Canned nvidia-smi CSV output: two rows, each newline-terminated, with the
# fields index, name, total, used and free memory.
SAMPLE_NVIDIA_SMI_OUTPUT = "".join(
    [
        "0, Quadro RTX 4000, 8192, 6843, 1349\n",
        "1, Quadro RTX 4000, 8192, 721, 7471\n",
    ]
)


def test_parse_returns_list_of_gpu_info():
    """poll() yields two records for the sample output; verify the first."""
    with patch("subprocess.run") as fake_run:
        fake_run.return_value.configure_mock(
            returncode=0,
            stdout=SAMPLE_NVIDIA_SMI_OUTPUT,
        )
        gpus = GpuMonitor().poll()

    assert len(gpus) == 2
    first = gpus[0]
    assert first.gpu_id == 0
    assert first.name == "Quadro RTX 4000"
    memory = (first.vram_total_mb, first.vram_used_mb, first.vram_free_mb)
    assert memory == (8192, 6843, 1349)


def test_parse_second_gpu():
    """The second row of the sample output is parsed into the second record."""
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        gpus = GpuMonitor().poll()

    second = gpus[1]
    assert second.gpu_id == 1
    assert (second.vram_used_mb, second.vram_free_mb) == (721, 7471)


def test_poll_returns_empty_list_when_nvidia_smi_unavailable():
    """A missing nvidia-smi binary makes poll() return an empty list."""
    with patch("subprocess.run", side_effect=FileNotFoundError):
        result = GpuMonitor().poll()

    assert result == []


def test_poll_returns_empty_list_on_nonzero_exit():
    """A non-zero nvidia-smi exit status makes poll() return an empty list."""
    with patch("subprocess.run") as fake_run:
        fake_run.return_value.configure_mock(returncode=1, stdout="")
        result = GpuMonitor().poll()

    assert result == []


def test_poll_skips_malformed_lines():
    """A row with a non-integer memory field is dropped; the valid row is kept."""
    # First row has a non-numeric "used" field; second row is well-formed.
    raw_output = "0, RTX 4000, 8192, not_a_number, 1024\n1, RTX 4000, 8192, 512, 7680\n"
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = raw_output
        gpus = GpuMonitor().poll()

    assert len(gpus) == 1
    survivor = gpus[0]
    assert survivor.gpu_id == 1