feat(resources): add GpuMonitor for nvidia-smi polling
This commit is contained in:
parent
6b239b76e3
commit
3dcbe801f1
3 changed files with 112 additions and 0 deletions
0
circuitforge_core/resources/agent/__init__.py
Normal file
0
circuitforge_core/resources/agent/__init__.py
Normal file
52
circuitforge_core/resources/agent/gpu_monitor.py
Normal file
52
circuitforge_core/resources/agent/gpu_monitor.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
from circuitforge_core.resources.models import GpuInfo
|
||||
|
||||
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Argument vector for a single nvidia-smi query.  The column order in
# --query-gpu is the order the CSV fields come back in, so any parser of the
# output must stay in sync with it.  "noheader,nounits" asks for bare values
# with no header row and no unit suffixes.
_NVIDIA_SMI_CMD = [
    "nvidia-smi",
    "--query-gpu=index,name,memory.total,memory.used,memory.free",
    "--format=csv,noheader,nounits",
]
|
||||
|
||||
|
||||
class GpuMonitor:
    """Poll ``nvidia-smi`` and report per-GPU memory figures.

    The monitor keeps no state: every :meth:`poll` call launches a fresh
    ``nvidia-smi`` process and parses whatever it prints.
    """

    def poll(self) -> list[GpuInfo]:
        """Run ``nvidia-smi`` once and return one ``GpuInfo`` per GPU row.

        Returns:
            The parsed GPUs in the order ``nvidia-smi`` printed them, or an
            empty list when the tool cannot be launched, does not finish
            within 5 seconds, or exits with a non-zero status.
        """
        try:
            result = subprocess.run(
                _NVIDIA_SMI_CMD,
                capture_output=True,
                text=True,
                timeout=5,
                check=False,  # the exit status is inspected explicitly below
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            # OSError (not just FileNotFoundError) so that a binary that
            # exists but cannot be executed -- e.g. PermissionError -- is
            # reported as "unavailable" instead of crashing the caller.
            logger.warning("nvidia-smi unavailable: %s", exc)
            return []

        if result.returncode != 0:
            logger.warning("nvidia-smi exited %d", result.returncode)
            return []

        return self._parse(result.stdout)

    def _parse(self, output: str) -> list[GpuInfo]:
        """Convert ``nvidia-smi`` CSV text into ``GpuInfo`` records.

        Each non-blank line is expected to hold exactly five comma-separated
        fields in the order requested by the query: index, name,
        memory.total, memory.used, memory.free.  Lines that do not match
        (wrong field count, or a non-integer numeric field) are skipped and
        logged at debug level; they never raise.

        Args:
            output: Raw standard output captured from ``nvidia-smi``.

        Returns:
            One ``GpuInfo`` per well-formed line, in input order.
        """
        gpus: list[GpuInfo] = []
        for line in output.strip().splitlines():
            if not line.strip():
                # Blank separator lines carry no data and are not "malformed".
                continue

            parts = [p.strip() for p in line.split(",")]
            if len(parts) != 5:
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
                continue

            index, name, total, used, free = parts
            try:
                gpus.append(GpuInfo(
                    gpu_id=int(index),
                    name=name,
                    vram_total_mb=int(total),
                    vram_used_mb=int(used),
                    vram_free_mb=int(free),
                ))
            except ValueError:
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
        return gpus
|
||||
60
tests/test_resources/test_gpu_monitor.py
Normal file
60
tests/test_resources/test_gpu_monitor.py
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
from unittest.mock import patch
|
||||
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
|
||||
|
||||
|
||||
# Two-GPU sample in the "csv,noheader,nounits" layout; columns are
# index, name, memory.total, memory.used, memory.free.
SAMPLE_NVIDIA_SMI_OUTPUT = "".join(
    [
        "0, Quadro RTX 4000, 8192, 6843, 1349\n",
        "1, Quadro RTX 4000, 8192, 721, 7471\n",
    ]
)
|
||||
|
||||
|
||||
def test_parse_returns_list_of_gpu_info():
    """poll() yields one record per CSV row; verify every field of row 0."""
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        gpus = GpuMonitor().poll()

    assert len(gpus) == 2
    first = gpus[0]
    assert first.gpu_id == 0
    assert first.name == "Quadro RTX 4000"
    assert first.vram_total_mb == 8192
    assert first.vram_used_mb == 6843
    assert first.vram_free_mb == 1349
|
||||
|
||||
|
||||
def test_parse_second_gpu():
    """The second CSV row is parsed into its own record with its own values."""
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        gpus = GpuMonitor().poll()

    second = gpus[1]
    assert second.gpu_id == 1
    assert second.vram_used_mb == 721
    assert second.vram_free_mb == 7471
|
||||
|
||||
|
||||
def test_poll_returns_empty_list_when_nvidia_smi_unavailable():
    """A missing nvidia-smi binary results in an empty list, not an error."""
    with patch("subprocess.run", side_effect=FileNotFoundError):
        result = GpuMonitor().poll()

    assert result == []
|
||||
|
||||
|
||||
def test_poll_returns_empty_list_on_nonzero_exit():
    """A non-zero nvidia-smi exit status results in an empty list."""
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 1
        completed.stdout = ""
        result = GpuMonitor().poll()

    assert result == []
|
||||
|
||||
|
||||
def test_poll_skips_malformed_lines():
    """A row with a non-numeric field is dropped; the valid row survives."""
    malformed = "0, RTX 4000, 8192, not_a_number, 1024\n1, RTX 4000, 8192, 512, 7680\n"
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = malformed
        gpus = GpuMonitor().poll()

    # Only the second row (GPU index 1) is well-formed.
    assert len(gpus) == 1
    assert gpus[0].gpu_id == 1
|
||||
Loading…
Reference in a new issue