feat(resources): add GpuMonitor for nvidia-smi polling
This commit is contained in:
parent
6b239b76e3
commit
3dcbe801f1
3 changed files with 112 additions and 0 deletions
0
circuitforge_core/resources/agent/__init__.py
Normal file
0
circuitforge_core/resources/agent/__init__.py
Normal file
52
circuitforge_core/resources/agent/gpu_monitor.py
Normal file
52
circuitforge_core/resources/agent/gpu_monitor.py
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
from circuitforge_core.resources.models import GpuInfo
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_NVIDIA_SMI_CMD = [
|
||||||
|
"nvidia-smi",
|
||||||
|
"--query-gpu=index,name,memory.total,memory.used,memory.free",
|
||||||
|
"--format=csv,noheader,nounits",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class GpuMonitor:
    """Poll ``nvidia-smi`` and report each GPU's memory figures as ``GpuInfo``."""

    def poll(self) -> list[GpuInfo]:
        """Run ``nvidia-smi`` once and parse its CSV output.

        Returns:
            One ``GpuInfo`` per well-formed output row, or an empty list when
            ``nvidia-smi`` cannot be launched, does not finish within the
            5 second timeout, or exits with a non-zero status.
        """
        try:
            result = subprocess.run(
                _NVIDIA_SMI_CMD,
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            # OSError covers FileNotFoundError (binary not installed) as well
            # as PermissionError and other launch failures, so a broken
            # install degrades to "no GPUs" instead of raising.
            logger.warning("nvidia-smi unavailable: %s", exc)
            return []

        if result.returncode != 0:
            logger.warning("nvidia-smi exited %d", result.returncode)
            if result.stderr:
                # Keep the tool's own explanation available for diagnosis.
                logger.debug("nvidia-smi stderr: %s", result.stderr.strip())
            return []

        return self._parse(result.stdout)

    def _parse(self, output: str) -> list[GpuInfo]:
        """Convert ``nvidia-smi`` CSV rows into ``GpuInfo`` objects.

        Each row is expected to be ``index, name, total, used, free``. Rows
        that do not fit that shape, or whose numeric columns are not
        integers, are skipped and logged at debug level.

        Args:
            output: Raw stdout captured from ``nvidia-smi``.

        Returns:
            The successfully parsed rows, in output order.
        """
        gpus: list[GpuInfo] = []
        for line in output.strip().splitlines():
            if not line.strip():
                # Blank separator lines carry no data; skip them quietly.
                continue
            try:
                # Peel the index off the front and the three memory columns
                # off the back so a comma inside the GPU name does not shift
                # the numeric fields. Too few fields raises ValueError on
                # unpacking and is handled like any other malformed row.
                index, rest = line.split(",", 1)
                name, total, used, free = rest.rsplit(",", 3)
                gpus.append(GpuInfo(
                    gpu_id=int(index),
                    name=name.strip(),
                    vram_total_mb=int(total),
                    vram_used_mb=int(used),
                    vram_free_mb=int(free),
                ))
            except ValueError:
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
        return gpus
|
||||||
60
tests/test_resources/test_gpu_monitor.py
Normal file
60
tests/test_resources/test_gpu_monitor.py
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
from unittest.mock import patch
|
||||||
|
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
|
||||||
|
|
||||||
|
|
||||||
|
# Two newline-terminated CSV rows in the column order the monitor queries:
# index, name, memory total, memory used, memory free.
SAMPLE_NVIDIA_SMI_OUTPUT = "".join(
    row + "\n"
    for row in (
        "0, Quadro RTX 4000, 8192, 6843, 1349",
        "1, Quadro RTX 4000, 8192, 721, 7471",
    )
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_returns_list_of_gpu_info():
    """poll() yields one record per CSV row; check every field of the first."""
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        found = GpuMonitor().poll()

    assert len(found) == 2
    first = found[0]
    assert first.gpu_id == 0
    assert first.name == "Quadro RTX 4000"
    assert first.vram_total_mb == 8192
    assert first.vram_used_mb == 6843
    assert first.vram_free_mb == 1349
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_second_gpu():
    """The second CSV row is parsed into its own record with its own values."""
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        found = GpuMonitor().poll()

    second = found[1]
    assert second.gpu_id == 1
    assert second.vram_used_mb == 721
    assert second.vram_free_mb == 7471
|
||||||
|
|
||||||
|
|
||||||
|
def test_poll_returns_empty_list_when_nvidia_smi_unavailable():
    """A missing nvidia-smi binary makes poll() return an empty list."""
    with patch("subprocess.run", side_effect=FileNotFoundError):
        found = GpuMonitor().poll()

    assert found == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_poll_returns_empty_list_on_nonzero_exit():
    """A non-zero nvidia-smi exit status makes poll() return an empty list."""
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 1
        completed.stdout = ""
        found = GpuMonitor().poll()

    assert found == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_poll_skips_malformed_lines():
    """A row with a non-numeric memory column is dropped; valid rows survive."""
    # First row has a non-integer "used" column; second row is well-formed.
    malformed = "0, RTX 4000, 8192, not_a_number, 1024\n1, RTX 4000, 8192, 512, 7680\n"
    with patch("subprocess.run") as fake_run:
        completed = fake_run.return_value
        completed.returncode = 0
        completed.stdout = malformed
        found = GpuMonitor().poll()

    assert len(found) == 1
    assert found[0].gpu_id == 1
|
||||||
Loading…
Reference in a new issue