feat: implement GET /api/nodes-mgmt/nodes with coordinator proxy and profile merge
This commit is contained in:
parent
c039ea4698
commit
c2de9e53da
2 changed files with 197 additions and 6 deletions
84
app/nodes.py
84
app/nodes.py
|
|
@ -105,12 +105,84 @@ def _get_ollama_url(node_id: str) -> str:
|
|||
|
||||
@router.get("/nodes")
def list_nodes() -> list:
    """Return all coordinator nodes with live GPU stats merged with profile YAML.

    Queries ``{coordinator_url}/api/nodes`` for the node list and
    ``{coordinator_url}/api/services`` for the running services, then merges
    each node with whatever ``_load_profile`` returns for its node id.

    Returns:
        One dict per node with the keys ``node_id``, ``online``,
        ``agent_url``, ``gpus``, ``profile_loaded`` and ``services_catalog``.
        An empty list is returned when no ``coordinator_url`` is configured,
        or when the node list cannot be fetched or decoded.
    """
    import httpx

    cfg = _load_config()
    coordinator_url = cfg.get("coordinator_url", "") or ""
    if not coordinator_url:
        return []

    try:
        r = httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0)
        r.raise_for_status()
        coord_nodes = r.json()
    except (httpx.HTTPError, httpx.InvalidURL, ValueError) as exc:
        # HTTPError already covers ConnectError and bad status codes.
        # InvalidURL covers a malformed coordinator_url, and ValueError a
        # response body that is not valid JSON; neither derives from
        # HTTPError, so they would otherwise surface as a 500.
        logger.warning("Coordinator unreachable: %s", exc)
        return []
    if not isinstance(coord_nodes, list):
        logger.warning(
            "Coordinator returned a non-list nodes payload: %s",
            type(coord_nodes).__name__,
        )
        return []

    # The services lookup is deliberately best-effort: a failure here only
    # empties ``services_running``; it never fails the whole request.
    try:
        sr = httpx.get(f"{coordinator_url}/api/services", timeout=5.0)
        sr.raise_for_status()
        services_data = sr.json()
    except Exception as exc:
        logger.warning("Coordinator services unavailable: %s", exc)
        services_data = []
    if not isinstance(services_data, list):
        services_data = []

    running = _running_services_by_gpu(services_data)

    result = []
    for node in coord_nodes:
        if not isinstance(node, dict):
            continue
        node_id = node.get("node_id", "") or node.get("id", "")
        profile = _load_profile(node_id) if node_id else None
        # Resolve the profile's GPU entries once per node, not once per GPU.
        profile_gpus = _profile_gpu_entries(profile, node_id)

        gpus = []
        for gpu in (node.get("gpus", []) or []):
            if not isinstance(gpu, dict):
                continue
            gpu_id = gpu.get("gpu_id", gpu.get("id", 0))
            gpus.append({
                "gpu_id": gpu_id,
                "card": gpu.get("card", ""),
                "vram_total_mb": gpu.get("vram_total_mb", 0),
                "vram_used_mb": gpu.get("vram_used_mb", 0),
                "vram_free_mb": gpu.get("vram_free_mb", 0),
                "temp_c": gpu.get("temp_c"),
                "utilization_pct": gpu.get("utilization_pct"),
                "compute_cap": gpu.get("compute_cap"),
                "services_assigned": _assigned_services(profile_gpus, gpu_id),
                "services_running": running.get(node_id, {}).get(gpu_id, []),
            })

        result.append({
            "node_id": node_id,
            "online": node.get("online", True),
            "agent_url": node.get("agent_url", ""),
            "gpus": gpus,
            "profile_loaded": profile is not None,
            "services_catalog": _services_catalog_summary(profile),
        })

    return result


def _running_services_by_gpu(services_data: list) -> dict:
    """Group running service names by node id, then by GPU id."""
    running: dict = {}
    for svc in services_data:
        if not isinstance(svc, dict):
            continue
        nid = svc.get("node_id", "")
        gid = svc.get("gpu_id")
        svc_name = svc.get("service", "")
        # GPU id 0 is valid, so only a missing (None) id is rejected.
        if nid and gid is not None and svc_name:
            running.setdefault(nid, {}).setdefault(gid, []).append(svc_name)
    return running


def _profile_gpu_entries(profile, node_id: str) -> list:
    """Return the dict-shaped GPU entries the profile declares for ``node_id``."""
    if not profile:
        return []
    node_entry = (profile.get("nodes", {}) or {}).get(node_id, {}) or {}
    return [g for g in (node_entry.get("gpus", []) or []) if isinstance(g, dict)]


def _assigned_services(profile_gpus: list, gpu_id) -> list:
    """Return the services of the first profile GPU whose ``id`` equals ``gpu_id``."""
    for entry in profile_gpus:
        if entry.get("id") == gpu_id:
            return entry.get("services", []) or []
    return []


def _services_catalog_summary(profile) -> dict:
    """Summarise each profile service as compute-cap, max_mb and catalog size."""
    if not profile:
        return {}
    summary: dict = {}
    for svc_name, svc_info in (profile.get("services", {}) or {}).items():
        # A service declared with an empty/null body still gets an entry.
        if not isinstance(svc_info, dict):
            svc_info = {}
        catalog = svc_info.get("catalog", {}) or {}
        summary[svc_name] = {
            "min_compute_cap": svc_info.get("min_compute_cap", 0.0),
            "max_mb": svc_info.get("max_mb", 0),
            "catalog_size": len(catalog),
        }
    return summary
|
||||
|
|
|
|||
|
|
@ -45,3 +45,122 @@ def test_list_nodes_returns_empty_when_no_coordinator(client):
|
|||
r = client.get("/api/nodes-mgmt/nodes")
|
||||
assert r.status_code == 200
|
||||
assert r.json() == []
|
||||
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
|
||||
def _fake_nodes_response(nodes_json: list, services_json: list | None = None):
|
||||
"""Build side_effect list for two httpx.get calls: nodes then services."""
|
||||
mock_nodes = MagicMock()
|
||||
mock_nodes.raise_for_status = MagicMock()
|
||||
mock_nodes.json.return_value = nodes_json
|
||||
|
||||
mock_services = MagicMock()
|
||||
mock_services.raise_for_status = MagicMock()
|
||||
mock_services.json.return_value = services_json or []
|
||||
|
||||
return [mock_nodes, mock_services]
|
||||
|
||||
|
||||
def test_list_nodes_coordinator_unreachable_returns_empty(client, tmp_path):
    """A refused coordinator connection yields HTTP 200 and an empty list."""
    import httpx

    _write_config(tmp_path, {"coordinator_url": "http://fake-coord:7700"})

    refused = httpx.ConnectError("refused")
    with patch("httpx.get", side_effect=refused):
        response = client.get("/api/nodes-mgmt/nodes")

    assert response.status_code == 200
    assert response.json() == []
|
||||
|
||||
|
||||
def test_list_nodes_merges_profile_data(client, tmp_path):
    """Assigned services from the profile YAML are merged into live GPU stats."""
    profiles_dir = tmp_path / "profiles"
    config = {
        "coordinator_url": "http://fake-coord:7700",
        "profiles_dir": str(profiles_dir),
    }
    _write_config(tmp_path, config)

    profile_gpu = {
        "id": 0,
        "vram_mb": 24576,
        "compute_cap": 8.6,
        "services": ["cf-text"],
        "role": "primary",
        "card": "RTX 3090",
        "always_on": True,
    }
    profile = {
        "services": {
            "cf-text": {"min_compute_cap": 7.0, "max_mb": 8192, "catalog": {}},
        },
        "nodes": {
            "heimdall": {
                "gpus": [profile_gpu],
                "agent_url": "http://10.1.10.71:7701",
            },
        },
    }
    _write_profile(profiles_dir, "heimdall", profile)

    live_gpu = {
        "gpu_id": 0,
        "card": "RTX 3090",
        "vram_total_mb": 24576,
        "vram_used_mb": 4096,
        "vram_free_mb": 20480,
        "temp_c": 42.0,
        "utilization_pct": 15.0,
        "compute_cap": 8.6,
    }
    coord_nodes = [{
        "node_id": "heimdall",
        "online": True,
        "agent_url": "http://10.1.10.71:7701",
        "gpus": [live_gpu],
    }]

    fake_responses = _fake_nodes_response(coord_nodes)
    with patch("httpx.get", side_effect=fake_responses):
        response = client.get("/api/nodes-mgmt/nodes")

    assert response.status_code == 200
    payload = response.json()
    assert len(payload) == 1

    merged = payload[0]
    first_gpu = merged["gpus"][0]
    assert merged["node_id"] == "heimdall"
    assert merged["profile_loaded"] is True
    assert first_gpu["services_assigned"] == ["cf-text"]
    assert first_gpu["vram_total_mb"] == 24576
    assert "cf-text" in merged["services_catalog"]
|
||||
|
||||
|
||||
def test_list_nodes_no_profile_returns_profile_loaded_false(client, tmp_path):
    """Without a profile YAML the node reports profile_loaded false but keeps GPU stats."""
    _write_config(tmp_path, {"coordinator_url": "http://fake-coord:7700"})

    live_gpu = {
        "gpu_id": 0,
        "card": "RTX 5060 Ti",
        "vram_total_mb": 16384,
        "vram_used_mb": 0,
        "vram_free_mb": 16384,
        "temp_c": None,
        "utilization_pct": None,
        "compute_cap": 10.0,
    }
    coord_nodes = [{
        "node_id": "sif",
        "online": True,
        "agent_url": "http://10.1.10.158:7701",
        "gpus": [live_gpu],
    }]

    fake_responses = _fake_nodes_response(coord_nodes)
    with patch("httpx.get", side_effect=fake_responses):
        response = client.get("/api/nodes-mgmt/nodes")

    assert response.status_code == 200
    first_node = response.json()[0]
    assert first_node["profile_loaded"] is False
    assert first_node["gpus"][0]["card"] == "RTX 5060 Ti"
    assert first_node["services_catalog"] == {}
|
||||
|
||||
|
||||
def test_list_nodes_marks_running_services(client, tmp_path):
    """services_running is populated from the coordinator /api/services response."""
    profiles_dir = tmp_path / "profiles"
    _write_config(tmp_path, {
        "coordinator_url": "http://fake-coord:7700",
        "profiles_dir": str(profiles_dir),
    })
    _write_profile(profiles_dir, "heimdall", {
        "services": {},
        "nodes": {"heimdall": {"gpus": [{"id": 0, "vram_mb": 24576, "compute_cap": 8.6,
                                         "services": ["cf-text"], "role": "p",
                                         "card": "RTX 3090", "always_on": True}],
                               "agent_url": "http://10.1.10.71:7701"}}
    })

    coord_nodes = [{"node_id": "heimdall", "online": True,
                    "agent_url": "http://10.1.10.71:7701",
                    "gpus": [{"gpu_id": 0, "card": "RTX 3090", "vram_total_mb": 24576,
                              "vram_used_mb": 8192, "vram_free_mb": 16384,
                              "temp_c": 55.0, "utilization_pct": 80.0, "compute_cap": 8.6}]}]
    coord_services = [{"service": "cf-text", "node_id": "heimdall", "gpu_id": 0}]

    with patch("httpx.get", side_effect=_fake_nodes_response(coord_nodes, coord_services)):
        r = client.get("/api/nodes-mgmt/nodes")

    # Check the status first, as the sibling tests do, so a server error is
    # reported as such rather than as an indexing failure on the error body.
    assert r.status_code == 200
    data = r.json()
    assert data[0]["gpus"][0]["services_running"] == ["cf-text"]
|
||||
|
|
|
|||
Loading…
Reference in a new issue