logger = logging.getLogger(__name__)

# One CSV row per GPU, with no header row and no unit suffixes, so every row
# is exactly "index, name, memory.total, memory.used, memory.free".
_NVIDIA_SMI_CMD = [
    "nvidia-smi",
    "--query-gpu=index,name,memory.total,memory.used,memory.free",
    "--format=csv,noheader,nounits",
]

# Number of comma-separated fields requested by the --query-gpu list above.
_FIELD_COUNT = 5


class GpuMonitor:
    """Report per-GPU memory usage by running ``nvidia-smi`` and parsing its CSV.

    Polling is best-effort: when the tool cannot be run, or a row cannot be
    parsed, the problem is logged and the affected data is left out instead
    of raising to the caller.
    """

    def __init__(
        self,
        *,
        command: list[str] | None = None,
        timeout: float = 5.0,
    ) -> None:
        """Configure how the GPU query is executed.

        Args:
            command: Argument vector to run instead of the default
                ``nvidia-smi`` query. It must print rows in the same
                five-column CSV layout. ``None`` selects the default.
            timeout: Seconds to wait for the command before giving up.
        """
        self._command = _NVIDIA_SMI_CMD if command is None else list(command)
        self._timeout = timeout

    def poll(self) -> list[GpuInfo]:
        """Run the query command once and return one ``GpuInfo`` per parsed row.

        Returns:
            The parsed GPUs in output order. The list is empty when the
            command cannot be started, times out, or exits with a non-zero
            status.
        """
        try:
            result = subprocess.run(
                self._command,
                capture_output=True,
                text=True,
                timeout=self._timeout,
            )
        except (OSError, subprocess.TimeoutExpired) as exc:
            # OSError covers a missing binary (FileNotFoundError) as well as
            # one that exists but cannot be executed (e.g. PermissionError).
            logger.warning("nvidia-smi unavailable: %s", exc)
            return []

        if result.returncode != 0:
            logger.warning("nvidia-smi exited %d", result.returncode)
            return []

        return self._parse(result.stdout)

    def _parse(self, output: str) -> list[GpuInfo]:
        """Convert ``nvidia-smi`` CSV text into ``GpuInfo`` records.

        Args:
            output: Raw stdout of the query command, one GPU per line.

        Returns:
            One ``GpuInfo`` per well-formed row. Blank rows are ignored; rows
            with the wrong number of fields or non-integer numeric fields
            are skipped with a debug log entry.
        """
        gpus: list[GpuInfo] = []
        for line in output.strip().splitlines():
            if not line.strip():
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) != _FIELD_COUNT:
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
                continue
            index, name, total, used, free = parts
            try:
                # Convert the numbers first so a bad field is rejected before
                # a record is built; the constructor stays inside the try so
                # a ValueError it raises skips the row too.
                gpu_id, total_mb, used_mb, free_mb = map(
                    int, (index, total, used, free)
                )
                gpus.append(GpuInfo(
                    gpu_id=gpu_id,
                    name=name,
                    vram_total_mb=total_mb,
                    vram_used_mb=used_mb,
                    vram_free_mb=free_mb,
                ))
            except ValueError:
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
        return gpus