feat!: strip resources/ from MIT core — moves to circuitforge-orch (v0.8.0)
BREAKING CHANGE: circuitforge_core.resources is no longer available. Import CFOrchClient from circuitforge_orch.client instead. cf-orch CLI entry point is now in the circuitforge-orch package.
This commit is contained in:
parent
2259382d0b
commit
c244260d1c
63 changed files with 34 additions and 6571 deletions
22
README.md
22
README.md
|
|
@ -2,15 +2,29 @@
|
|||
|
||||
Shared scaffold for CircuitForge products.
|
||||
|
||||
**Current version: 0.8.0**
|
||||
|
||||
## Modules
|
||||
|
||||
### Implemented
|
||||
|
||||
- `circuitforge_core.db` — SQLite connection factory and migration runner
|
||||
- `circuitforge_core.llm` — LLM router with fallback chain
|
||||
- `circuitforge_core.llm` — LLM router with fallback chain (Ollama, vLLM, Anthropic, OpenAI-compatible)
|
||||
- `circuitforge_core.tiers` — Tier system with BYOK and local vision unlocks
|
||||
- `circuitforge_core.config` — Env validation and .env loader
|
||||
- `circuitforge_core.vision` — Vision router stub (v0.2+)
|
||||
- `circuitforge_core.wizard` — First-run wizard base class stub
|
||||
- `circuitforge_core.pipeline` — Staging queue stub (v0.2+)
|
||||
- `circuitforge_core.hardware` — Hardware detection and LLM backend profile generation (VRAM tiers, GPU/CPU auto-select)
|
||||
- `circuitforge_core.documents` — Document ingestion pipeline: PDF, DOCX, and image OCR → `StructuredDocument`
|
||||
- `circuitforge_core.affiliates` — Affiliate URL wrapping with opt-out, BYOK user IDs, and CF env-var fallback (`wrap_url`)
|
||||
- `circuitforge_core.preferences` — User preference store (local YAML file, pluggable backend); dot-path get/set API
|
||||
- `circuitforge_core.tasks` — VRAM-aware LLM task scheduler; shared slot manager across services (`TaskScheduler`)
|
||||
- `circuitforge_core.manage` — Cross-platform product process manager (Docker and native modes)
|
||||
- `circuitforge_core.resources` — moved to the `circuitforge-orch` package in v0.8.0 (import `CFOrchClient` from `circuitforge_orch.client`)
|
||||
|
||||
### Stubs (in-tree, not yet implemented)
|
||||
|
||||
- `circuitforge_core.vision` — Vision router base class (planned: moondream2 / Claude vision dispatch)
|
||||
- `circuitforge_core.wizard` — First-run wizard base class (products subclass `BaseWizard`)
|
||||
- `circuitforge_core.pipeline` — Staging queue base (`StagingDB`; products provide concrete schema)
|
||||
|
||||
## Install
|
||||
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
__version__ = "0.7.0"
|
||||
__version__ = "0.8.0"
|
||||
|
|
|
|||
|
|
@ -56,6 +56,12 @@ def _build_ebay_url(url: str, affiliate_id: str) -> str:
|
|||
return f"{url}{sep}{params}"
|
||||
|
||||
|
||||
def _build_instacart_url(url: str, affiliate_id: str) -> str:
|
||||
"""Append Instacart affiliate parameter to a search URL."""
|
||||
sep = "&" if "?" in url else "?"
|
||||
return f"{url}{sep}aff={affiliate_id}"
|
||||
|
||||
|
||||
def _build_amazon_url(url: str, affiliate_id: str) -> str:
|
||||
"""Merge an Amazon Associates tag into a product URL's query string."""
|
||||
parsed = urlparse(url)
|
||||
|
|
@ -101,3 +107,10 @@ register_program(AffiliateProgram(
|
|||
env_var="AMAZON_ASSOCIATES_TAG",
|
||||
build_url=_build_amazon_url,
|
||||
))
|
||||
|
||||
# Instacart: simple query-parameter program — the affiliate ID comes from
# INSTACART_AFFILIATE_ID and is appended as `aff=<id>` by _build_instacart_url.
register_program(AffiliateProgram(
    name="Instacart",
    retailer_key="instacart",
    env_var="INSTACART_AFFILIATE_ID",
    build_url=_build_instacart_url,
))
|
||||
|
|
|
|||
|
|
@ -1 +0,0 @@
|
|||
from circuitforge_core.resources.client import CFOrchClient, Allocation # noqa: F401
|
||||
|
|
@ -1,105 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor
|
||||
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
|
||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EvictRequest(BaseModel):
    """Request body for POST /evict: target PID plus SIGTERM grace window."""

    # PID of the process to evict from the GPU.
    pid: int
    # Seconds to wait for a clean exit after SIGTERM before escalating.
    grace_period_s: float = 5.0
|
||||
|
||||
|
||||
class ServiceStartRequest(BaseModel):
    """Request body for POST /services/{service}/start."""

    # GPU index the service should be started on.
    gpu_id: int = 0
    # Template parameters substituted into the service's command/args template.
    params: dict[str, str] = {}
|
||||
|
||||
|
||||
def create_agent_app(
    node_id: str,
    monitor: GpuMonitor | None = None,
    executor: EvictionExecutor | None = None,
    service_manager: ServiceManager | None = None,
) -> FastAPI:
    """Build the FastAPI app for a cf-orch node agent.

    Routes: /health, /gpu-info, /evict, /resident-info always; the
    /services/* management routes only when a ServiceManager is supplied.

    Args:
        node_id: Identifier reported by /health and embedded in the app title.
        monitor: GPU poller; a default GpuMonitor is created when omitted.
        executor: Eviction backend; a default EvictionExecutor when omitted.
        service_manager: Optional manager enabling the /services routes.
    """
    _monitor = monitor or GpuMonitor()
    _executor = executor or EvictionExecutor()

    app = FastAPI(title=f"cf-orch-agent [{node_id}]")

    @app.get("/health")
    def health() -> dict[str, Any]:
        return {"status": "ok", "node_id": node_id}

    @app.get("/gpu-info")
    def gpu_info() -> dict[str, Any]:
        # Fresh nvidia-smi poll on every request — no caching here.
        gpus = _monitor.poll()
        return {
            "node_id": node_id,
            "gpus": [
                {
                    "gpu_id": g.gpu_id,
                    "name": g.name,
                    "vram_total_mb": g.vram_total_mb,
                    "vram_used_mb": g.vram_used_mb,
                    "vram_free_mb": g.vram_free_mb,
                }
                for g in gpus
            ],
        }

    @app.post("/evict")
    def evict(req: EvictRequest) -> dict[str, Any]:
        result = _executor.evict_pid(pid=req.pid, grace_period_s=req.grace_period_s)
        return {
            "success": result.success,
            "method": result.method,
            "message": result.message,
        }

    @app.get("/resident-info")
    def resident_info() -> dict[str, Any]:
        """Return which models are currently loaded in each running managed service."""
        if service_manager is None:
            return {"residents": []}
        # Imported lazily: probe_all is only needed when a manager exists.
        from circuitforge_core.resources.agent.service_probe import probe_all
        return {"residents": probe_all(service_manager)}

    if service_manager is not None:
        @app.get("/services")
        def list_services() -> dict:
            return {"running": service_manager.list_running()}

        @app.get("/services/{service}")
        def service_status(service: str) -> dict:
            running = service_manager.is_running(service)
            url = service_manager.get_url(service) if running else None
            return {"service": service, "running": running, "url": url}

        @app.post("/services/{service}/start")
        def start_service(service: str, req: ServiceStartRequest) -> dict:
            try:
                already_running = service_manager.is_running(service)
                url = service_manager.start(service, req.gpu_id, req.params)
                # adopted=True signals the coordinator to treat this instance as
                # immediately running rather than waiting for the probe loop.
                adopted = already_running and service_manager.is_running(service)
                return {"service": service, "url": url, "running": True, "adopted": adopted}
            except (ValueError, NotImplementedError) as exc:
                # Unknown service / unsupported spec → client error. Chain the
                # cause (PEP 3134) so the original traceback isn't lost.
                raise HTTPException(status_code=422, detail=str(exc)) from exc
            except Exception as exc:
                raise HTTPException(
                    status_code=500, detail=f"Failed to start {service}: {exc}"
                ) from exc

        @app.post("/services/{service}/stop")
        def stop_service(service: str) -> dict:
            stopped = service_manager.stop(service)
            return {"service": service, "stopped": stopped}

    return app
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
import psutil
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default seconds to wait after SIGTERM before escalating to SIGKILL.
_DEFAULT_GRACE_S = 5.0


@dataclass(frozen=True)
class EvictionResult:
    """Immutable outcome of one eviction attempt."""

    # True when the target process is gone (or was already gone).
    success: bool
    method: str  # "sigterm", "sigkill", "already_gone", "not_found", "error"
    # Human-readable explanation of what happened.
    message: str
|
||||
|
||||
|
||||
class EvictionExecutor:
    """Terminates processes with SIGTERM, escalating to SIGKILL after a grace period."""

    def __init__(self, grace_period_s: float = _DEFAULT_GRACE_S) -> None:
        # Default grace window used when evict_pid() is called without one.
        self._default_grace = grace_period_s

    def evict_pid(
        self,
        pid: int,
        grace_period_s: float | None = None,
    ) -> EvictionResult:
        """Signal *pid* to exit, SIGKILL-ing it if still alive after the grace period.

        Args:
            pid: Target process id; values <= 0 are rejected outright.
            grace_period_s: Seconds to wait after SIGTERM; falls back to the
                constructor default when None.

        Returns:
            EvictionResult describing which method ended the process (or why
            the attempt failed). Never raises.
        """
        grace = grace_period_s if grace_period_s is not None else self._default_grace

        # Guard: pid 0 / negative values signal process groups — never do that.
        if pid <= 0:
            return EvictionResult(
                success=False, method="error",
                message=f"Refusing to signal invalid PID {pid}"
            )

        if not psutil.pid_exists(pid):
            return EvictionResult(
                success=False, method="not_found",
                message=f"PID {pid} not found"
            )

        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # Raced with the process exiting between the existence check and the kill.
            return EvictionResult(
                success=True, method="already_gone",
                message=f"PID {pid} vanished before SIGTERM"
            )
        except PermissionError as exc:
            return EvictionResult(
                success=False, method="error",
                message=f"Permission denied terminating PID {pid}: {exc}"
            )

        # Wait for grace period
        deadline = time.monotonic() + grace
        while time.monotonic() < deadline:
            if not psutil.pid_exists(pid):
                logger.info("PID %d exited cleanly after SIGTERM", pid)
                return EvictionResult(
                    success=True, method="sigterm",
                    message=f"PID {pid} exited after SIGTERM"
                )
            time.sleep(0.05)

        # Escalate to SIGKILL
        if psutil.pid_exists(pid):
            try:
                os.kill(pid, signal.SIGKILL)
                logger.warning("PID %d required SIGKILL", pid)
                return EvictionResult(
                    success=True, method="sigkill",
                    message=f"PID {pid} killed with SIGKILL"
                )
            except ProcessLookupError:
                # Died on its own right at the deadline — fall through to success.
                pass

        return EvictionResult(
            success=True, method="sigkill",
            message=f"PID {pid} is gone"
        )
|
||||
|
|
@ -1,52 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
from circuitforge_core.resources.models import GpuInfo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Query command producing one CSV row per GPU: index,name,total,used,free (MB).
_NVIDIA_SMI_CMD = [
    "nvidia-smi",
    "--query-gpu=index,name,memory.total,memory.used,memory.free",
    "--format=csv,noheader,nounits",
]


class GpuMonitor:
    """Polls nvidia-smi for per-GPU VRAM usage on this node."""

    def poll(self) -> list[GpuInfo]:
        """Run nvidia-smi and return one GpuInfo per GPU; [] when unavailable.

        All failure modes (missing binary, timeout, nonzero exit) degrade to an
        empty list so callers can treat the node as GPU-less.
        """
        try:
            result = subprocess.run(
                _NVIDIA_SMI_CMD,
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired) as exc:
            # No NVIDIA tooling installed, or the driver is hung.
            logger.warning("nvidia-smi unavailable: %s", exc)
            return []

        if result.returncode != 0:
            logger.warning("nvidia-smi exited %d", result.returncode)
            return []

        return self._parse(result.stdout)

    def _parse(self, output: str) -> list[GpuInfo]:
        """Parse `csv,noheader,nounits` output into GpuInfo records, skipping bad rows."""
        gpus: list[GpuInfo] = []
        for line in output.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            if len(parts) != 5:
                # Not an index,name,total,used,free row — ignore.
                continue
            try:
                gpus.append(GpuInfo(
                    gpu_id=int(parts[0]),
                    name=parts[1],
                    vram_total_mb=int(parts[2]),
                    vram_used_mb=int(parts[3]),
                    vram_free_mb=int(parts[4]),
                ))
            except ValueError:
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
        return gpus
|
||||
|
|
@ -1,186 +0,0 @@
|
|||
"""
|
||||
ServiceManager — start/stop Docker containers and processes for cf-orch managed services.
|
||||
|
||||
Container naming convention: cf-orch-{service}-{node_id}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
from circuitforge_core.resources.profiles.schema import DockerSpec, GpuProfile, ProcessSpec
|
||||
|
||||
|
||||
def _expand_volume(v: str) -> str:
|
||||
"""Expand bash-style volume strings including ${VAR:-default} and $VAR."""
|
||||
def _sub(m: re.Match) -> str: # type: ignore[type-arg]
|
||||
var, default = m.group(1), m.group(2) or ""
|
||||
return os.environ.get(var) or default
|
||||
v = re.sub(r"\$\{(\w+)(?::-(.*?))?\}", _sub, v)
|
||||
v = re.sub(r"\$(\w+)", lambda m: os.environ.get(m.group(1), m.group(0)), v)
|
||||
return v
|
||||
|
||||
|
||||
class ServiceManager:
    """Start/stop Docker containers and native processes for cf-orch managed services.

    Service specs come from the GpuProfile; Docker containers follow the naming
    convention cf-orch-{service}-{node_id}.
    """

    def __init__(
        self,
        node_id: str,
        profile: GpuProfile,
        advertise_host: str = "127.0.0.1",
    ) -> None:
        self.node_id = node_id
        self.profile = profile
        # Host embedded in the URLs handed back to callers.
        self.advertise_host = advertise_host
        # service name → Popen handle for processes we spawned ourselves.
        self._procs: dict[str, Any] = {}

    def container_name(self, service: str) -> str:
        """Docker container name for *service* on this node."""
        return f"cf-orch-{service}-{self.node_id}"

    def _get_spec(self, service: str) -> DockerSpec | ProcessSpec | None:
        """Look up the managed spec for *service*; None when absent from the profile."""
        svc = self.profile.services.get(service)
        if svc is None:
            return None
        return svc.managed

    def is_running(self, service: str) -> bool:
        """True when *service* is observably up (container running / port open / healthy)."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                result = subprocess.run(
                    [
                        "docker",
                        "inspect",
                        "--format",
                        "{{.State.Running}}",
                        self.container_name(service),
                    ],
                    capture_output=True,
                    text=True,
                    check=True,
                )
                return result.stdout.strip() == "true"
            except subprocess.CalledProcessError:
                # docker inspect fails when the container doesn't exist.
                return False
        if isinstance(spec, ProcessSpec):
            # For adopt=True services, check the health endpoint regardless of whether
            # we spawned the process (it may be a system daemon we didn't start).
            if spec.adopt:
                return self._probe_health(spec.host_port, spec.health_path)
            proc = self._procs.get(service)
            if proc is None or proc.poll() is not None:
                return False
            # Process is alive — confirm it is actually listening.
            import socket
            try:
                with socket.create_connection(("127.0.0.1", spec.host_port), timeout=1):
                    return True
            except OSError:
                return False
        return False

    def _probe_health(self, port: int, health_path: str = "/health") -> bool:
        """Return True if the service at localhost:port responds 200 on health_path."""
        import urllib.request
        try:
            url = f"http://127.0.0.1:{port}{health_path}"
            with urllib.request.urlopen(url, timeout=2.0) as resp:
                return resp.status == 200
        except Exception:
            return False

    def start(self, service: str, gpu_id: int, params: dict[str, str]) -> str:
        """Start *service* on *gpu_id* and return its base URL.

        Idempotent: returns the URL immediately when the service already runs.

        Raises:
            ValueError: service is not in the profile or has no managed spec.
            NotImplementedError: the spec type is unrecognized.
            subprocess.CalledProcessError: `docker run` failed.
        """
        spec = self._get_spec(service)
        if spec is None:
            raise ValueError(f"Service {service!r} not in profile or has no managed spec")

        if self.is_running(service):
            return f"http://{self.advertise_host}:{spec.host_port}"

        if isinstance(spec, DockerSpec):
            expanded_volumes = [_expand_volume(v) for v in spec.volumes]

            # defaultdict(str) makes unresolved {placeholders} expand to "".
            filler: dict[str, str] = defaultdict(str, params)
            expanded_command = spec.command_template.format_map(filler).split()

            cmd = [
                "docker", "run", "-d", "--rm",
                "--name", self.container_name(service),
                "--runtime", spec.runtime,
                "--gpus", f"device={gpu_id}",
                "--ipc", spec.ipc,
                "-p", f"{spec.host_port}:{spec.port}",
            ]
            for vol in expanded_volumes:
                cmd += ["-v", vol]
            for key, val in spec.env.items():
                cmd += ["-e", f"{key}={val}"]
            cmd.append(spec.image)
            cmd.extend(expanded_command)

            subprocess.run(cmd, check=True, capture_output=True, text=True)
            return f"http://{self.advertise_host}:{spec.host_port}"

        if isinstance(spec, ProcessSpec):
            # adopt=True: if the service is already healthy, claim it without spawning.
            if spec.adopt and self._probe_health(spec.host_port, spec.health_path):
                return f"http://{self.advertise_host}:{spec.host_port}"

            filler = defaultdict(str, params)
            filler.setdefault("port", str(spec.port))
            filler.setdefault("gpu_id", str(gpu_id))
            args_expanded = spec.args_template.format_map(filler).split()

            cmd = [spec.exec_path] + args_expanded
            # Inherit the parent environment (os is imported at module level).
            env = dict(os.environ)
            proc = subprocess.Popen(
                cmd,
                cwd=spec.cwd or None,
                env=env,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            self._procs[service] = proc
            return f"http://{self.advertise_host}:{spec.host_port}"

        raise NotImplementedError(f"Unknown spec type: {type(spec)}")

    def stop(self, service: str) -> bool:
        """Stop *service*; True when something was actually stopped."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                subprocess.run(
                    ["docker", "stop", self.container_name(service)],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                return True
            except subprocess.CalledProcessError:
                return False
        if isinstance(spec, ProcessSpec):
            proc = self._procs.pop(service, None)
            if proc is not None:
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except Exception:
                    # Didn't exit in time — force it.
                    proc.kill()
                return True
        return False

    def list_running(self) -> list[str]:
        """Names of all profile services currently observed running."""
        return [svc for svc in self.profile.services if self.is_running(svc)]

    def get_url(self, service: str) -> str | None:
        """Base URL for *service*, or None when unknown or not running."""
        spec = self._get_spec(service)
        if spec is None or not self.is_running(service):
            return None
        return f"http://{self.advertise_host}:{spec.host_port}"
|
||||
|
|
@ -1,123 +0,0 @@
|
|||
"""
|
||||
Probe running services to detect which models are currently loaded in VRAM.
|
||||
|
||||
Two probe strategies run together:
|
||||
|
||||
1. Well-known ports — always checked, regardless of who started the service.
|
||||
Catches ollama, vLLM, etc. running outside cf-orch management.
|
||||
|
||||
2. Managed services — services cf-orch started via ServiceManager.
|
||||
Checked on their configured host_port, deduplicates with well-known results.
|
||||
|
||||
Each service exposes a different introspection API:
|
||||
- vllm: GET /v1/models → {"data": [{"id": "<model-name>"}]}
|
||||
- ollama: GET /api/ps → {"models": [{"name": "<model>", "size_vram": <bytes>}]}
|
||||
|
||||
ollama can have multiple models loaded simultaneously; each is reported as a
|
||||
separate entry so the dashboard shows per-model residency.
|
||||
|
||||
The probe is best-effort: a timeout or connection refusal means model_name=None
|
||||
but the service is still reported as resident.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import urllib.request
|
||||
from typing import Any
|
||||
|
||||
from circuitforge_core.resources.profiles.schema import DockerSpec
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Per-request timeout for probe HTTP calls.
_PROBE_TIMEOUT_S = 2.0

# Well-known service ports probed on every heartbeat.
# port → service name (used as the key into _PROBERS).
_WELL_KNOWN_PORTS: dict[int, str] = {
    11434: "ollama",
    8000: "vllm",
    8080: "vllm",  # common alt vLLM port
}
|
||||
|
||||
|
||||
def _fetch_json(url: str) -> dict[str, Any] | None:
    """GET a URL and parse JSON; returns None on any error."""
    try:
        with urllib.request.urlopen(url, timeout=_PROBE_TIMEOUT_S) as response:
            body = response.read()
        return json.loads(body)
    except Exception as exc:
        # Best-effort probe: any failure (refused, timeout, bad JSON) → None.
        logger.debug("Probe %s: %s", url, exc)
        return None
|
||||
|
||||
|
||||
def _probe_vllm(port: int) -> list[str]:
    """List model IDs served by a vLLM instance on *port* (empty on failure)."""
    payload = _fetch_json(f"http://127.0.0.1:{port}/v1/models")
    if not payload or not payload.get("data"):
        return []
    return [entry["id"] for entry in payload["data"] if entry.get("id")]
|
||||
|
||||
|
||||
def _probe_ollama(port: int) -> list[str]:
    """List models an ollama instance currently has loaded (empty on failure)."""
    # /api/ps lists models currently *loaded in memory*, not just downloaded.
    payload = _fetch_json(f"http://127.0.0.1:{port}/api/ps")
    if not payload or not payload.get("models"):
        return []
    return [entry["name"] for entry in payload["models"] if entry.get("name")]
|
||||
|
||||
|
||||
# Dispatch table: service name → probe function returning loaded model names.
_PROBERS: dict[str, Any] = {
    "vllm": _probe_vllm,
    "ollama": _probe_ollama,
}
|
||||
|
||||
|
||||
def probe_all(service_manager: Any) -> list[dict[str, Any]]:
    """
    Probe all services — both well-known ports and cf-orch managed services.

    Returns a list of dicts: [{"service": str, "model_name": str | None}].
    Multiple loaded models in one service (e.g. two ollama models) each get
    their own entry, disambiguated as "ollama/0", "ollama/1", etc.
    """
    results: list[dict[str, Any]] = []
    # Ports already reported, so the managed pass doesn't duplicate entries.
    seen_ports: set[int] = set()

    # ── 1. Well-known ports ──────────────────────────────────────────
    for port, service in _WELL_KNOWN_PORTS.items():
        prober = _PROBERS.get(service)
        if prober is None:
            continue
        models = prober(port)
        if not models:
            continue  # nothing on this port right now
        seen_ports.add(port)
        if len(models) == 1:
            results.append({"service": service, "model_name": models[0]})
        else:
            # Disambiguate multiple resident models as "<service>/<i>".
            for i, model in enumerate(models):
                results.append({"service": f"{service}/{i}", "model_name": model})

    # ── 2. Managed services (cf-orch started) ───────────────────────
    if service_manager is not None:
        for service in service_manager.list_running():
            spec = service_manager._get_spec(service)
            if not isinstance(spec, DockerSpec):
                continue
            if spec.host_port in seen_ports:
                continue  # already captured by well-known probe
            prober = _PROBERS.get(service)
            if prober is None:
                # Running but unprobeable: report residency without a model name.
                results.append({"service": service, "model_name": None})
                continue
            models = prober(spec.host_port)
            seen_ports.add(spec.host_port)
            if not models:
                results.append({"service": service, "model_name": None})
            elif len(models) == 1:
                results.append({"service": service, "model_name": models[0]})
            else:
                for i, model in enumerate(models):
                    results.append({"service": f"{service}/{i}", "model_name": model})

    return results
|
||||
|
|
@ -1,234 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Optional
|
||||
|
||||
import typer
|
||||
import uvicorn
|
||||
|
||||
logger = logging.getLogger(__name__)

# Top-level typer application; commands register themselves via @app.command().
app = typer.Typer(name="cf-orch", help="CircuitForge GPU resource orchestrator")

# Target path for the unit file written by `cf-orch install-service`.
_SYSTEMD_UNIT_PATH = Path("/etc/systemd/system/cf-orch.service")

# systemd unit template; {python} is filled with sys.executable at install time.
_SYSTEMD_UNIT_TEMPLATE = """\
[Unit]
Description=CircuitForge GPU Resource Orchestrator
After=network.target

[Service]
Type=simple
ExecStart={python} -m circuitforge_core.resources.cli start
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
"""
|
||||
|
||||
|
||||
@app.command()
def start(
    profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
    host: str = "0.0.0.0",
    port: int = 7700,
    node_id: str = "local",
    agent_port: int = 7701,
) -> None:
    """Start the cf-orch coordinator (auto-detects GPU profile if not specified).

    Automatically pre-registers the local agent so its GPUs appear on the
    dashboard immediately. Remote nodes self-register via POST /api/nodes.
    """
    # Coordinator dependencies are imported inside the command, so importing
    # this module (e.g. for --help) stays cheap.
    from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
    from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
    from circuitforge_core.resources.coordinator.app import create_coordinator_app
    from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
    from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor

    from circuitforge_core.resources.coordinator.node_store import NodeStore

    lease_manager = LeaseManager()
    profile_registry = ProfileRegistry()
    service_registry = ServiceRegistry()
    node_store = NodeStore()
    supervisor = AgentSupervisor(
        lease_manager=lease_manager,
        service_registry=service_registry,
        profile_registry=profile_registry,
        node_store=node_store,
    )
    # Re-load nodes persisted by a previous coordinator run.
    restored = supervisor.restore_from_store()
    if restored:
        typer.echo(f"Restored {restored} known node(s) from previous session")

    monitor = GpuMonitor()
    gpus = monitor.poll()
    if not gpus:
        typer.echo(
            "Warning: no GPUs detected via nvidia-smi — coordinator running with 0 VRAM"
        )
    else:
        typer.echo(f"Detected {len(gpus)} GPU(s)")

    if profile:
        active_profile = profile_registry.load(profile)
        typer.echo(f"Using profile: {active_profile.name} (from {profile})")
    else:
        # No GPUs → fall back to the last public profile as a default.
        active_profile = (
            profile_registry.auto_detect(gpus)
            if gpus
            else profile_registry.list_public()[-1]
        )
        typer.echo(f"Auto-selected profile: {active_profile.name}")
    # NOTE(review): active_profile is only echoed here and never passed to
    # create_coordinator_app — confirm the coordinator resolves its own profile.

    # Pre-register the local agent — the heartbeat loop will poll it for live GPU data.
    local_agent_url = f"http://127.0.0.1:{agent_port}"
    supervisor.register(node_id, local_agent_url)
    typer.echo(f"Registered local node '{node_id}' → {local_agent_url}")

    coordinator_app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=supervisor,
        service_registry=service_registry,
    )

    typer.echo(f"Starting cf-orch coordinator on {host}:{port}")
    # Blocks until the server is shut down.
    uvicorn.run(coordinator_app, host=host, port=port)
|
||||
|
||||
|
||||
@app.command()
def agent(
    coordinator: str = "http://localhost:7700",
    node_id: str = "local",
    host: str = "0.0.0.0",
    port: int = 7701,
    advertise_host: Optional[str] = None,
    profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
) -> None:
    """Start a cf-orch node agent and self-register with the coordinator.

    The agent starts its HTTP server, then POSTs its URL to the coordinator
    so it appears on the dashboard without manual configuration.

    Use --advertise-host to override the IP the coordinator should use to
    reach this agent (e.g. on a multi-homed or NATted host).
    """
    import threading

    import httpx

    from circuitforge_core.resources.agent.app import create_agent_app
    from circuitforge_core.resources.agent.service_manager import ServiceManager
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry

    # The URL the coordinator should use to reach this agent.
    reach_host = advertise_host or ("127.0.0.1" if host in ("0.0.0.0", "::") else host)
    agent_url = f"http://{reach_host}:{port}"

    _RECONNECT_INTERVAL_S = 30.0

    def _reconnect_loop() -> None:
        """
        Persistently re-register this agent with the coordinator.

        Runs as a daemon thread for the lifetime of the agent process:
        - Waits 2 s on first run (uvicorn needs time to bind)
        - Re-registers every 30 s thereafter
        - If the coordinator is down, silently retries — no crashing
        - When the coordinator restarts, the agent re-appears within one cycle

        This means coordinator restarts require no manual intervention on agent hosts.
        """
        import time
        first = True
        while True:
            time.sleep(2.0 if first else _RECONNECT_INTERVAL_S)
            first = False
            try:
                resp = httpx.post(
                    f"{coordinator}/api/nodes",
                    json={"node_id": node_id, "agent_url": agent_url},
                    timeout=5.0,
                )
                if resp.is_success:
                    logger.debug("Registered with coordinator at %s as '%s'", coordinator, node_id)
                else:
                    logger.warning(
                        "Coordinator registration returned %s", resp.status_code
                    )
            except Exception as exc:
                # Expected when the coordinator is down — retry next cycle.
                logger.debug("Coordinator at %s unreachable, will retry: %s", coordinator, exc)

    # Fire reconnect loop in a daemon thread so uvicorn.run() can start blocking immediately.
    threading.Thread(target=_reconnect_loop, daemon=True, name="cf-orch-reconnect").start()
    typer.echo(f"Reconnect loop started — will register with {coordinator} every {int(_RECONNECT_INTERVAL_S)}s")

    # Best-effort: the agent still serves /health and /gpu-info without a
    # ServiceManager, so any failure here only disables the /services routes.
    service_manager = None
    try:
        from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
        pr = ProfileRegistry()
        gpus = GpuMonitor().poll()
        p = pr.load(Path(profile)) if profile else pr.auto_detect(gpus)
        service_manager = ServiceManager(node_id=node_id, profile=p, advertise_host=reach_host)
        typer.echo(f"ServiceManager ready with profile: {p.name}")
    except Exception as exc:
        typer.echo(f"Warning: ServiceManager unavailable ({exc})", err=True)

    agent_app = create_agent_app(node_id=node_id, service_manager=service_manager)
    typer.echo(f"Starting cf-orch agent [{node_id}] on {host}:{port}")
    # Blocks until the server is shut down.
    uvicorn.run(agent_app, host=host, port=port)
|
||||
|
||||
|
||||
@app.command()
def status(coordinator: str = "http://localhost:7700") -> None:
    """Show GPU and lease status from the coordinator."""
    import httpx

    try:
        resp = httpx.get(f"{coordinator}/api/nodes", timeout=5.0)
        resp.raise_for_status()
        nodes = resp.json().get("nodes", [])
        # One section per node, one line per GPU.
        for node in nodes:
            typer.echo(f"\nNode: {node['node_id']}")
            for gpu in node.get("gpus", []):
                typer.echo(
                    f" GPU {gpu['gpu_id']}: {gpu['name']} — "
                    f"{gpu['vram_used_mb']}/{gpu['vram_total_mb']} MB used"
                )
    except Exception as exc:
        # Any failure (network, HTTP error, bad payload) → nonzero exit.
        typer.echo(f"Coordinator unreachable at {coordinator}: {exc}", err=True)
        raise typer.Exit(1)
|
||||
|
||||
|
||||
@app.command("install-service")
|
||||
def install_service(
|
||||
dry_run: bool = typer.Option(
|
||||
False, "--dry-run", help="Print unit file without writing"
|
||||
),
|
||||
) -> None:
|
||||
"""Write a systemd unit file for cf-orch (requires root)."""
|
||||
python = sys.executable
|
||||
unit_content = _SYSTEMD_UNIT_TEMPLATE.format(python=python)
|
||||
if dry_run:
|
||||
typer.echo(f"Would write to {_SYSTEMD_UNIT_PATH}:\n")
|
||||
typer.echo(unit_content)
|
||||
return
|
||||
try:
|
||||
_SYSTEMD_UNIT_PATH.write_text(unit_content)
|
||||
typer.echo(f"Written: {_SYSTEMD_UNIT_PATH}")
|
||||
typer.echo(
|
||||
"Run: sudo systemctl daemon-reload && sudo systemctl enable --now cf-orch"
|
||||
)
|
||||
except PermissionError:
|
||||
typer.echo(
|
||||
f"Permission denied writing to {_SYSTEMD_UNIT_PATH}. Run as root.", err=True
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from contextlib import contextmanager, asynccontextmanager
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class Allocation:
    """A GPU allocation granted by the cf-orch coordinator."""

    allocation_id: str
    # Service the allocation is for (e.g. "vllm").
    service: str
    node_id: str
    gpu_id: int
    # Model chosen by the coordinator, when one was requested/resolved.
    model: str | None
    # Inference endpoint for the allocated service.
    url: str
    # Coordinator-reported flags; presumably "service was (re)started for this
    # request" and "requested model already resident" — confirm against the
    # coordinator's allocation response schema.
    started: bool
    warm: bool
|
||||
|
||||
|
||||
class CFOrchClient:
    """
    HTTP client for the cf-orch coordinator's allocation API.

    Synchronous callers (e.g. an LLM router) use the context manager:

        client = CFOrchClient(os.environ["CF_ORCH_URL"])
        with client.allocate("vllm", model_candidates=["Ouro-1.4B"]) as alloc:
            ...  # alloc.url is the inference endpoint

    Async callers (e.g. FastAPI apps) use the async variant:

        async with client.allocate_async("vllm", model_candidates=["Ouro-1.4B"]) as alloc:
            ...

    Authentication: pass api_key explicitly or set the CF_LICENSE_KEY env var.
    When a key is present, every request carries
    "Authorization: Bearer <key>". The hosted CircuitForge coordinator
    (orch.circuitforge.tech) requires it; local self-hosted coordinators
    may run without it.

    Raises ValueError immediately if coordinator_url is empty.
    """

    def __init__(self, coordinator_url: str, api_key: str | None = None) -> None:
        if not coordinator_url:
            raise ValueError("coordinator_url is empty — cf-orch not configured")
        self._url = coordinator_url.rstrip("/")
        self._api_key = api_key or os.environ.get("CF_LICENSE_KEY", "")

    def _allocate_endpoint(self, service: str) -> str:
        # POST target for creating an allocation.
        return f"{self._url}/api/services/{service}/allocate"

    def _release_endpoint(self, service: str, allocation_id: str) -> str:
        # DELETE target for releasing an existing allocation.
        return f"{self._url}/api/services/{service}/allocations/{allocation_id}"

    def _headers(self) -> dict[str, str]:
        # Bearer auth only when a key is configured; otherwise no extra headers.
        return {"Authorization": f"Bearer {self._api_key}"} if self._api_key else {}

    def _build_body(self, model_candidates: list[str] | None, ttl_s: float, caller: str) -> dict:
        # JSON body shared by the sync and async allocation paths.
        return {
            "model_candidates": model_candidates or [],
            "ttl_s": ttl_s,
            "caller": caller,
        }

    def _parse_allocation(self, data: dict, service: str) -> Allocation:
        # Map the coordinator's JSON payload onto the Allocation dataclass.
        fields = {
            "allocation_id": data["allocation_id"],
            "service": service,
            "node_id": data["node_id"],
            "gpu_id": data["gpu_id"],
            "model": data.get("model"),
            "url": data["url"],
            "started": data.get("started", False),
            "warm": data.get("warm", False),
        }
        return Allocation(**fields)

    def _allocation_failure(self, service: str, resp) -> RuntimeError:
        # Build the error raised when the coordinator refuses an allocation.
        return RuntimeError(
            f"cf-orch allocation failed for {service!r}: "
            f"HTTP {resp.status_code} — {resp.text[:200]}"
        )

    @contextmanager
    def allocate(
        self,
        service: str,
        *,
        model_candidates: list[str] | None = None,
        ttl_s: float = 3600.0,
        caller: str = "",
    ):
        """Sync context manager: allocate on enter, best-effort release on exit."""
        resp = httpx.post(
            self._allocate_endpoint(service),
            json=self._build_body(model_candidates, ttl_s, caller),
            headers=self._headers(),
            timeout=120.0,
        )
        if not resp.is_success:
            raise self._allocation_failure(service, resp)
        alloc = self._parse_allocation(resp.json(), service)
        try:
            yield alloc
        finally:
            # Release failures are logged but never raised to the caller.
            try:
                httpx.delete(
                    self._release_endpoint(service, alloc.allocation_id),
                    headers=self._headers(),
                    timeout=10.0,
                )
            except Exception as exc:
                logger.debug("cf-orch release failed (non-fatal): %s", exc)

    @asynccontextmanager
    async def allocate_async(
        self,
        service: str,
        *,
        model_candidates: list[str] | None = None,
        ttl_s: float = 3600.0,
        caller: str = "",
    ):
        """Async context manager: allocate on enter, best-effort release on exit."""
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                self._allocate_endpoint(service),
                json=self._build_body(model_candidates, ttl_s, caller),
                headers=self._headers(),
            )
            if not resp.is_success:
                raise self._allocation_failure(service, resp)
            alloc = self._parse_allocation(resp.json(), service)
            try:
                yield alloc
            finally:
                # Release failures are logged but never raised to the caller.
                try:
                    await client.delete(
                        self._release_endpoint(service, alloc.allocation_id),
                        headers=self._headers(),
                        timeout=10.0,
                    )
                except Exception as exc:
                    logger.debug("cf-orch async release failed (non-fatal): %s", exc)
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
# circuitforge_core/resources/compose.yml
|
||||
# One-command cf-orch deployment for Docker self-hosters:
|
||||
# docker compose -f path/to/compose.yml up cf-orch-coordinator
|
||||
|
||||
services:
|
||||
cf-orch-coordinator:
|
||||
image: python:3.12-slim
|
||||
command: >
|
||||
sh -c "pip install 'circuitforge-core[orch]' &&
|
||||
cf-orch start --host 0.0.0.0 --port 7700"
|
||||
ports:
|
||||
- "7700:7700"
|
||||
volumes:
|
||||
- /run/docker.sock:/var/run/docker.sock:ro
|
||||
- cf-orch-data:/data
|
||||
environment:
|
||||
- CFORCH_PROFILE=${CFORCH_PROFILE:-}
|
||||
restart: unless-stopped
|
||||
devices:
|
||||
- /dev/nvidia0:/dev/nvidia0
|
||||
- /dev/nvidiactl:/dev/nvidiactl
|
||||
runtime: nvidia
|
||||
|
||||
cf-orch-agent:
|
||||
image: python:3.12-slim
|
||||
command: >
|
||||
sh -c "pip install 'circuitforge-core[orch]' &&
|
||||
cf-orch agent --coordinator http://cf-orch-coordinator:7700
|
||||
--node-id ${CFORCH_NODE_ID:-local}
|
||||
--host 0.0.0.0 --port 7701"
|
||||
ports:
|
||||
- "7701:7701"
|
||||
depends_on:
|
||||
- cf-orch-coordinator
|
||||
environment:
|
||||
- CFORCH_NODE_ID=${CFORCH_NODE_ID:-local}
|
||||
restart: unless-stopped
|
||||
devices:
|
||||
- /dev/nvidia0:/dev/nvidia0
|
||||
- /dev/nvidiactl:/dev/nvidiactl
|
||||
runtime: nvidia
|
||||
|
||||
volumes:
|
||||
cf-orch-data:
|
||||
|
|
@ -1,209 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import httpx
|
||||
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.node_store import NodeStore
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo, ResidentAllocation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Seconds between coordinator → agent polling rounds.
_HEARTBEAT_INTERVAL_S = 10.0
# Per-request timeout (seconds) when polling an agent.
_AGENT_TIMEOUT_S = 5.0
|
||||
|
||||
|
||||
@dataclass
class AgentRecord:
    """Coordinator-side bookkeeping for one registered agent node."""

    node_id: str  # stable identifier the agent registered under
    agent_url: str  # base URL of the agent's HTTP API
    last_seen: float = field(default_factory=time.time)  # time of last successful poll
    gpus: list[GpuInfo] = field(default_factory=list)  # inventory from the last /gpu-info poll
    online: bool = False  # set True/False by poll_agent() depending on reachability
|
||||
|
||||
|
||||
class AgentSupervisor:
    """
    Tracks agent nodes for the coordinator: registration, heartbeat polling,
    GPU inventory, and the periodic TTL/idle sweep.

    All state lives in the in-memory ``_agents`` dict; an optional NodeStore
    persists node_id → agent_url pairs across coordinator restarts.
    """

    def __init__(
        self,
        lease_manager: LeaseManager,
        service_registry: ServiceRegistry | None = None,
        profile_registry: ProfileRegistry | None = None,
        node_store: NodeStore | None = None,
    ) -> None:
        # node_id → AgentRecord; the single source of truth for known agents.
        self._agents: dict[str, AgentRecord] = {}
        self._lease_manager = lease_manager
        # Loop flag for run_heartbeat_loop(); cleared by stop().
        self._running = False
        # Optional collaborators — sweeps are skipped when these are None.
        self._service_registry = service_registry
        self._profile_registry = profile_registry
        self._node_store = node_store
        # Counts heartbeat rounds; the idle sweep runs every third tick.
        self._heartbeat_tick = 0

    def restore_from_store(self) -> int:
        """
        Load previously-known nodes from NodeStore into the in-memory registry.

        All restored nodes start as offline=False. The heartbeat loop will poll
        them on its first tick and promote any that respond to online=True.

        Returns the number of nodes restored.
        """
        if self._node_store is None:
            return 0
        restored = 0
        for node_id, agent_url in self._node_store.all():
            # Never clobber an agent that already registered this session.
            if node_id not in self._agents:
                self._agents[node_id] = AgentRecord(
                    node_id=node_id, agent_url=agent_url, online=False
                )
                restored += 1
        if restored:
            logger.info("NodeStore: restored %d known node(s) from previous session", restored)
        return restored

    def register(self, node_id: str, agent_url: str) -> None:
        """Add a new agent, or update the URL of a known one; persist to NodeStore."""
        if node_id not in self._agents:
            self._agents[node_id] = AgentRecord(node_id=node_id, agent_url=agent_url)
            logger.info("Registered agent node: %s @ %s", node_id, agent_url)
        else:
            # Re-registration with a new URL (e.g. agent restarted on another port).
            if self._agents[node_id].agent_url != agent_url:
                self._agents[node_id].agent_url = agent_url
                logger.info("Updated agent URL for %s → %s", node_id, agent_url)
        if self._node_store is not None:
            self._node_store.upsert(node_id, agent_url)

    def get_node_info(self, node_id: str) -> NodeInfo | None:
        """Return a NodeInfo snapshot for one node, or None when unknown."""
        record = self._agents.get(node_id)
        if record is None:
            return None
        return NodeInfo(
            node_id=record.node_id,
            agent_url=record.agent_url,
            gpus=record.gpus,
            last_heartbeat=record.last_seen,
        )

    def all_nodes(self) -> list[NodeInfo]:
        """Return NodeInfo snapshots for every known node (online or not)."""
        return [
            NodeInfo(
                node_id=r.node_id,
                agent_url=r.agent_url,
                gpus=r.gpus,
                last_heartbeat=r.last_seen,
            )
            for r in self._agents.values()
        ]

    def online_agents(self) -> "dict[str, AgentRecord]":
        """Return only currently-online agents, keyed by node_id."""
        return {nid: rec for nid, rec in self._agents.items() if rec.online}

    async def poll_agent(self, node_id: str) -> bool:
        """
        Poll one agent for GPU and resident info.

        On success: refreshes the record's GPU list, last_seen, and online flag,
        registers each GPU with the lease manager, and syncs the node's
        residents. On any failure the node is marked offline. Returns whether
        the poll succeeded.
        """
        record = self._agents.get(node_id)
        if record is None:
            return False
        try:
            async with httpx.AsyncClient(timeout=_AGENT_TIMEOUT_S) as client:
                gpu_resp = await client.get(f"{record.agent_url}/gpu-info")
                gpu_resp.raise_for_status()

                # Resident-info is best-effort — older agents may not have the endpoint.
                try:
                    res_resp = await client.get(f"{record.agent_url}/resident-info")
                    resident_data = res_resp.json() if res_resp.is_success else {}
                except Exception:
                    resident_data = {}

            data = gpu_resp.json()
            gpus = [
                GpuInfo(
                    gpu_id=g["gpu_id"],
                    name=g["name"],
                    vram_total_mb=g["vram_total_mb"],
                    vram_used_mb=g["vram_used_mb"],
                    vram_free_mb=g["vram_free_mb"],
                )
                for g in data.get("gpus", [])
            ]
            record.gpus = gpus
            record.last_seen = time.time()
            record.online = True
            # Keep the lease manager's GPU capacity table in sync.
            for gpu in gpus:
                self._lease_manager.register_gpu(node_id, gpu.gpu_id, gpu.vram_total_mb)

            residents = [
                (r["service"], r.get("model_name"))
                for r in resident_data.get("residents", [])
            ]
            self._lease_manager.set_residents_for_node(node_id, residents)

            return True
        except Exception as exc:
            # Any HTTP/parse error demotes the node to offline until the next poll.
            logger.warning("Agent %s unreachable: %s", node_id, exc)
            record.online = False
            return False

    async def poll_all(self) -> None:
        """Poll every known agent concurrently."""
        await asyncio.gather(*[self.poll_agent(nid) for nid in self._agents])

    def _build_idle_stop_config(self) -> dict[str, int]:
        """
        Collect per-service idle-stop timeouts from all public profiles.

        When several profiles define a timeout for the same service, the
        smallest positive value wins.
        """
        if self._profile_registry is None:
            return {}
        config: dict[str, int] = {}
        for profile in self._profile_registry.list_public():
            for svc_name, svc in profile.services.items():
                if svc.idle_stop_after_s > 0:
                    existing = config.get(svc_name, 0)
                    config[svc_name] = min(existing, svc.idle_stop_after_s) if existing > 0 else svc.idle_stop_after_s
        return config

    async def _http_post(self, url: str) -> bool:
        """POST to *url*; return success as a bool, never raising."""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url)
                return resp.is_success
        except Exception as exc:
            logger.warning("HTTP POST %s failed: %s", url, exc)
            return False

    async def _run_idle_sweep(self) -> None:
        """
        Expire overdue allocations, then stop service instances that have been
        idle past their profile-configured timeout.
        """
        if self._service_registry is None:
            return
        expired = self._service_registry.sweep_expired_allocations()
        if expired:
            logger.info("TTL sweep: expired %d allocation(s): %s", len(expired), expired)
        idle_stop_config = self._build_idle_stop_config()
        if not idle_stop_config:
            return
        timed_out = self._service_registry.idle_past_timeout(idle_stop_config)
        for instance in timed_out:
            node_info = self.get_node_info(instance.node_id)
            if node_info is None:
                # Node vanished from the registry; nothing we can stop.
                continue
            stop_url = f"{node_info.agent_url}/services/{instance.service}/stop"
            logger.info(
                "Idle sweep: stopping %s on %s gpu%s (idle timeout)",
                instance.service, instance.node_id, instance.gpu_id,
            )
            success = await self._http_post(stop_url)
            # Only record the stop when the agent acknowledged it.
            if success:
                self._service_registry.mark_stopped(
                    instance.service, instance.node_id, instance.gpu_id
                )

    async def run_heartbeat_loop(self) -> None:
        """Poll all agents every _HEARTBEAT_INTERVAL_S; idle-sweep every 3rd round."""
        self._running = True
        while self._running:
            await self.poll_all()
            self._heartbeat_tick += 1
            if self._heartbeat_tick % 3 == 0:
                await self._run_idle_sweep()
            await asyncio.sleep(_HEARTBEAT_INTERVAL_S)

    def stop(self) -> None:
        """Ask run_heartbeat_loop() to exit after its current iteration."""
        self._running = False
|
||||
|
|
@ -1,509 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
import urllib.request
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import HTMLResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.node_selector import select_node
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.profiles.schema import ProcessSpec
|
||||
|
||||
# Read once at import time; a missing dashboard.html fails fast at startup.
_DASHBOARD_HTML = (Path(__file__).parent / "dashboard.html").read_text()
|
||||
|
||||
|
||||
def _get_health_path(profile_registry: ProfileRegistry, service: str) -> str:
    """Look up the health-check path for *service*.

    Scans public profiles in registry order and returns the ``health_path`` of
    the first one that defines the service as a managed process. Falls back to
    "/health" when no profile does.
    """
    specs = (
        profile.services.get(service)
        for profile in profile_registry.list_public()
    )
    for spec in specs:
        if spec and isinstance(spec.managed, ProcessSpec):
            return spec.managed.health_path
    return "/health"
|
||||
|
||||
# Tuning for the health-probe loop that watches instances in "starting" state.
_PROBE_INTERVAL_S = 5.0  # how often to poll starting instances
_PROBE_TIMEOUT_S = 300.0  # give up and mark stopped after this many seconds
|
||||
|
||||
|
||||
async def _run_instance_probe_loop(service_registry: ServiceRegistry) -> None:
|
||||
"""
|
||||
Background loop: transition 'starting' instances to 'running' once their
|
||||
/health endpoint responds, or to 'stopped' after PROBE_TIMEOUT_S.
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
start_times: dict[str, float] = {} # instance key → time first seen as starting
|
||||
|
||||
while True:
|
||||
await asyncio.sleep(_PROBE_INTERVAL_S)
|
||||
now = time.time()
|
||||
for inst in service_registry.all_instances():
|
||||
if inst.state != "starting":
|
||||
start_times.pop(f"{inst.service}:{inst.node_id}:{inst.gpu_id}", None)
|
||||
continue
|
||||
key = f"{inst.service}:{inst.node_id}:{inst.gpu_id}"
|
||||
start_times.setdefault(key, now)
|
||||
|
||||
healthy = False
|
||||
if inst.url:
|
||||
try:
|
||||
with urllib.request.urlopen(
|
||||
inst.url.rstrip("/") + inst.health_path, timeout=2.0
|
||||
) as resp:
|
||||
healthy = resp.status == 200
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if healthy:
|
||||
service_registry.upsert_instance(
|
||||
service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
|
||||
state="running", model=inst.model, url=inst.url,
|
||||
)
|
||||
start_times.pop(key, None)
|
||||
logger.info("Instance %s/%s gpu=%s transitioned to running", inst.service, inst.node_id, inst.gpu_id)
|
||||
elif now - start_times[key] > _PROBE_TIMEOUT_S:
|
||||
service_registry.upsert_instance(
|
||||
service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
|
||||
state="stopped", model=inst.model, url=inst.url,
|
||||
)
|
||||
start_times.pop(key, None)
|
||||
logger.warning("Instance %s/%s gpu=%s timed out in starting state — marked stopped", inst.service, inst.node_id, inst.gpu_id)
|
||||
|
||||
|
||||
class LeaseRequest(BaseModel):
    """Body for POST /api/leases — request a VRAM lease on a specific GPU."""

    node_id: str  # target node; must already be registered
    gpu_id: int  # GPU index on that node
    mb: int  # VRAM megabytes requested
    service: str  # service that will hold the lease
    priority: int = 2  # eviction priority (semantics defined by the EvictionEngine — confirm ordering there)
    ttl_s: float = 0.0  # lease lifetime in seconds; presumably 0 means no expiry — verify in LeaseManager
|
||||
|
||||
|
||||
class NodeRegisterRequest(BaseModel):
    """Body for POST /api/nodes — agent self-registration."""

    node_id: str  # stable identifier for the node
    agent_url: str  # e.g. "http://10.1.10.71:7701"
|
||||
|
||||
|
||||
class ServiceEnsureRequest(BaseModel):
    """Body for POST /api/services/{service}/ensure — start a service on a given node."""

    node_id: str  # target node; must already be registered
    gpu_id: int = 0  # GPU index on the node
    params: dict[str, str] = {}  # extra start params forwarded to the agent; "model" is set per candidate
    ttl_s: float = 3600.0  # requested allocation lifetime in seconds
    # Ordered list of model names to try; falls back down the list if VRAM is tight.
    # The "model" key in params is used if this list is empty.
    model_candidates: list[str] = []
|
||||
|
||||
|
||||
class ServiceAllocateRequest(BaseModel):
    """Body for POST /api/services/{service}/allocate — coordinator picks the node."""

    model_candidates: list[str] = []  # ordered preference list; must be non-empty
    gpu_id: int | None = None  # pin to a specific GPU; None lets the coordinator choose
    params: dict[str, str] = {}  # extra start params forwarded to the agent
    ttl_s: float = 3600.0  # allocation lifetime in seconds
    caller: str = ""  # free-form caller identifier recorded with the allocation
|
||||
|
||||
|
||||
def create_coordinator_app(
    lease_manager: LeaseManager,
    profile_registry: ProfileRegistry,
    agent_supervisor: AgentSupervisor,
    service_registry: ServiceRegistry,
) -> FastAPI:
    """Build the cf-orch coordinator FastAPI app.

    Wires the lease manager, profile registry, agent supervisor, and service
    registry into HTTP routes, and runs the heartbeat + instance-probe
    background tasks for the lifetime of the app.
    """
    # Eviction decisions are delegated here; the engine shares our lease manager.
    eviction_engine = EvictionEngine(lease_manager=lease_manager)

    @asynccontextmanager
    async def _lifespan(app: FastAPI):  # type: ignore[type-arg]
        import asyncio

        # Background tasks started on app startup, cancelled on shutdown.
        heartbeat_task = asyncio.create_task(agent_supervisor.run_heartbeat_loop())
        probe_task = asyncio.create_task(_run_instance_probe_loop(service_registry))
        yield
        # Clear the supervisor's loop flag first, then cancel both tasks.
        agent_supervisor.stop()
        heartbeat_task.cancel()
        probe_task.cancel()

    app = FastAPI(title="cf-orch-coordinator", lifespan=_lifespan)

    # Optional Heimdall auth — enabled when HEIMDALL_URL env var is set.
    # Self-hosted coordinators skip this entirely; the CF-hosted public endpoint
    # (orch.circuitforge.tech) sets HEIMDALL_URL to gate paid+ access.
    from circuitforge_core.resources.coordinator.auth import HeimdallAuthMiddleware
    _auth = HeimdallAuthMiddleware.from_env()
    if _auth is not None:
        app.middleware("http")(_auth)

    @app.get("/", response_class=HTMLResponse, include_in_schema=False)
    def dashboard() -> HTMLResponse:
        """Serve the static status dashboard read at module import."""
        return HTMLResponse(content=_DASHBOARD_HTML)

    @app.get("/api/health")
    def health() -> dict[str, Any]:
        """Liveness probe; always unauthenticated."""
        return {"status": "ok"}

    @app.get("/api/nodes")
    def get_nodes() -> dict[str, Any]:
        """List all known nodes (online or not) with their GPU inventory."""
        nodes = agent_supervisor.all_nodes()
        return {
            "nodes": [
                {
                    "node_id": n.node_id,
                    "agent_url": n.agent_url,
                    "last_heartbeat": n.last_heartbeat,
                    "gpus": [
                        {
                            "gpu_id": g.gpu_id,
                            "name": g.name,
                            "vram_total_mb": g.vram_total_mb,
                            "vram_used_mb": g.vram_used_mb,
                            "vram_free_mb": g.vram_free_mb,
                        }
                        for g in n.gpus
                    ],
                }
                for n in nodes
            ]
        }

    @app.post("/api/nodes")
    async def register_node(req: NodeRegisterRequest) -> dict[str, Any]:
        """Agents call this to self-register. Coordinator immediately polls for GPU info."""
        agent_supervisor.register(req.node_id, req.agent_url)
        await agent_supervisor.poll_agent(req.node_id)
        return {"registered": True, "node_id": req.node_id}

    @app.get("/api/profiles")
    def get_profiles() -> dict[str, Any]:
        """List the public GPU profiles the coordinator knows about."""
        return {
            "profiles": [
                {"name": p.name, "vram_total_mb": p.vram_total_mb}
                for p in profile_registry.list_public()
            ]
        }

    @app.get("/api/resident")
    def get_residents() -> dict[str, Any]:
        """List services currently resident (loaded) on agent GPUs."""
        return {
            "residents": [
                {
                    "service": r.service,
                    "node_id": r.node_id,
                    "model_name": r.model_name,
                    "first_seen": r.first_seen,
                }
                for r in lease_manager.all_residents()
            ]
        }

    @app.get("/api/leases")
    def get_leases() -> dict[str, Any]:
        """List all active VRAM leases."""
        return {
            "leases": [
                {
                    "lease_id": lease.lease_id,
                    "node_id": lease.node_id,
                    "gpu_id": lease.gpu_id,
                    "mb_granted": lease.mb_granted,
                    "holder_service": lease.holder_service,
                    "priority": lease.priority,
                    "expires_at": lease.expires_at,
                }
                for lease in lease_manager.all_leases()
            ]
        }

    @app.post("/api/leases")
    async def request_lease(req: LeaseRequest) -> dict[str, Any]:
        """Request a VRAM lease; the eviction engine may evict lower-priority holders."""
        node_info = agent_supervisor.get_node_info(req.node_id)
        if node_info is None:
            raise HTTPException(
                status_code=422,
                detail=f"Unknown node_id {req.node_id!r} — node not registered",
            )
        agent_url = node_info.agent_url

        lease = await eviction_engine.request_lease(
            node_id=req.node_id,
            gpu_id=req.gpu_id,
            mb=req.mb,
            service=req.service,
            priority=req.priority,
            agent_url=agent_url,
            ttl_s=req.ttl_s,
        )
        # None means even eviction could not free enough VRAM.
        if lease is None:
            raise HTTPException(
                status_code=503,
                detail="Insufficient VRAM — no eviction candidates available",
            )
        return {
            "lease": {
                "lease_id": lease.lease_id,
                "node_id": lease.node_id,
                "gpu_id": lease.gpu_id,
                "mb_granted": lease.mb_granted,
                "holder_service": lease.holder_service,
                "priority": lease.priority,
                "expires_at": lease.expires_at,
            }
        }

    @app.delete("/api/leases/{lease_id}")
    async def release_lease(lease_id: str) -> dict[str, Any]:
        """Release a VRAM lease by id."""
        released = await lease_manager.release(lease_id)
        if not released:
            raise HTTPException(status_code=404, detail=f"Lease {lease_id!r} not found")
        return {"released": True, "lease_id": lease_id}

    @app.post("/api/services/{service}/ensure")
    async def ensure_service(service: str, req: ServiceEnsureRequest) -> dict[str, Any]:
        """
        Ensure a managed service is running on the given node.

        If model_candidates is provided, tries each model in order, skipping any
        that exceed the live free VRAM on the target GPU. Falls back down the list
        until one succeeds. The selected model is returned in the response.
        """
        import httpx

        node_info = agent_supervisor.get_node_info(req.node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Unknown node_id {req.node_id!r}")

        # Resolve candidate list — fall back to params["model"] if not specified.
        candidates: list[str] = req.model_candidates or (
            [req.params["model"]] if "model" in req.params else []
        )
        if not candidates:
            raise HTTPException(422, detail="No model specified: set params.model or model_candidates")

        # Live free VRAM on the target GPU (used for pre-flight filtering).
        gpu = next((g for g in node_info.gpus if g.gpu_id == req.gpu_id), None)
        free_mb = gpu.vram_free_mb if gpu else 0

        # Profile max_mb for the service gives us the VRAM ceiling for this slot;
        # the first profile that defines the service wins.
        service_max_mb = 0
        for p in profile_registry.list_public():
            svc = p.services.get(service)
            if svc:
                service_max_mb = svc.max_mb
                break

        # Pre-flight VRAM gate: require free VRAM >= the service's profile ceiling
        # so the model can load without competing for VRAM with other processes.
        # (Filtering is per-slot, not per-candidate: one check for the service.)
        if service_max_mb > 0 and free_mb < service_max_mb:
            raise HTTPException(
                503,
                detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need {service_max_mb}MB",
            )

        last_error: str = ""
        async with httpx.AsyncClient(timeout=120.0) as client:
            # Try candidates in order; remember the last agent error for the 503.
            for model in candidates:
                params_with_model = {**req.params, "model": model}
                try:
                    start_resp = await client.post(
                        f"{node_info.agent_url}/services/{service}/start",
                        json={"gpu_id": req.gpu_id, "params": params_with_model},
                    )
                    if start_resp.is_success:
                        data = start_resp.json()
                        return {
                            "service": service,
                            "node_id": req.node_id,
                            "gpu_id": req.gpu_id,
                            "model": model,
                            "url": data.get("url"),
                            "running": data.get("running", False),
                        }
                    last_error = start_resp.text
                except httpx.HTTPError as exc:
                    # Transport-level failure aborts the whole attempt (502),
                    # unlike an agent-side error which falls through to the next model.
                    raise HTTPException(502, detail=f"Agent unreachable: {exc}")

        raise HTTPException(
            503,
            detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
        )

    @app.post("/api/services/{service}/allocate")
    async def allocate_service(service: str, req: ServiceAllocateRequest) -> dict[str, Any]:
        """
        Allocate a managed service — coordinator picks the best node automatically.
        Returns a URL + allocation_id. (Allocation not tracked server-side until Phase 2.)
        """
        import httpx

        if not req.model_candidates:
            raise HTTPException(422, detail="model_candidates must be non-empty")

        # Validate service is known in at least one profile, regardless of gpu_id
        if not any(service in p.services for p in profile_registry.list_public()):
            raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile")

        residents = lease_manager.resident_keys()

        if req.gpu_id is None:
            # Automatic placement across online nodes.
            online = agent_supervisor.online_agents()
            placement = select_node(online, service, profile_registry, residents)
            if placement is None:
                raise HTTPException(
                    503,
                    detail=f"No online node has capacity for service {service!r}",
                )
            node_id, gpu_id = placement
        else:
            # Caller pinned a GPU — find the first online node exposing it.
            online = agent_supervisor.online_agents()
            node_id = next(
                (nid for nid, rec in online.items()
                 if any(g.gpu_id == req.gpu_id for g in rec.gpus)),
                None,
            )
            if node_id is None:
                raise HTTPException(422, detail=f"No online node has gpu_id={req.gpu_id}")
            gpu_id = req.gpu_id

        node_info = agent_supervisor.get_node_info(node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Node {node_id!r} not found")

        # "Warm" = the service is already resident on this node.
        warm = f"{node_id}:{service}" in residents

        async with httpx.AsyncClient(timeout=120.0) as client:
            last_error = ""
            for model in req.model_candidates:
                try:
                    resp = await client.post(
                        f"{node_info.agent_url}/services/{service}/start",
                        json={"gpu_id": gpu_id, "params": {**req.params, "model": model}},
                    )
                    if resp.is_success:
                        data = resp.json()
                        svc_url = data.get("url", "")
                        alloc = service_registry.allocate(
                            service=service,
                            node_id=node_id,
                            gpu_id=gpu_id,
                            model=model,
                            caller=req.caller,
                            url=svc_url,
                            ttl_s=req.ttl_s,
                        )
                        # Seed the instance state for first-time starts.
                        # adopted=True means the agent found it already running.
                        adopted = data.get("adopted", False)
                        instance_state = "running" if (warm or adopted) else "starting"
                        health_path = _get_health_path(profile_registry, service)
                        service_registry.upsert_instance(
                            service=service,
                            node_id=node_id,
                            gpu_id=gpu_id,
                            state=instance_state,
                            model=model,
                            url=svc_url,
                            health_path=health_path,
                        )
                        return {
                            "allocation_id": alloc.allocation_id,
                            "service": service,
                            "node_id": node_id,
                            "gpu_id": gpu_id,
                            "model": model,
                            "url": data.get("url"),
                            "started": not warm,
                            "warm": warm,
                        }
                    last_error = resp.text
                except httpx.HTTPError as exc:
                    raise HTTPException(502, detail=f"Agent unreachable: {exc}")

        raise HTTPException(
            503,
            detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
        )

    @app.delete("/api/services/{service}/allocations/{allocation_id}")
    async def release_allocation(service: str, allocation_id: str) -> dict[str, Any]:
        """Release one service allocation; 404 unless it exists under *service*."""
        existing = service_registry.get_allocation(allocation_id)
        if existing is None or existing.service != service:
            raise HTTPException(404, detail=f"Allocation {allocation_id!r} not found for service {service!r}")
        released = service_registry.release(allocation_id)
        if not released:
            raise HTTPException(404, detail=f"Allocation {allocation_id!r} not found")
        return {"released": True, "allocation_id": allocation_id}

    @app.get("/api/services/{service}/status")
    def get_service_status(service: str) -> dict[str, Any]:
        """Report all instances and allocations for one service."""
        instances = [i for i in service_registry.all_instances() if i.service == service]
        allocations = [a for a in service_registry.all_allocations() if a.service == service]
        return {
            "service": service,
            "instances": [
                {
                    "node_id": i.node_id,
                    "gpu_id": i.gpu_id,
                    "state": i.state,
                    "model": i.model,
                    "url": i.url,
                    "idle_since": i.idle_since,
                }
                for i in instances
            ],
            "allocations": [
                {
                    "allocation_id": a.allocation_id,
                    "node_id": a.node_id,
                    "gpu_id": a.gpu_id,
                    "model": a.model,
                    "caller": a.caller,
                    "url": a.url,
                    "expires_at": a.expires_at,
                }
                for a in allocations
            ],
        }

    @app.get("/api/services")
    def list_services() -> dict[str, Any]:
        """List every known service instance across all nodes."""
        instances = service_registry.all_instances()
        return {
            "services": [
                {
                    "service": i.service,
                    "node_id": i.node_id,
                    "gpu_id": i.gpu_id,
                    "state": i.state,
                    "model": i.model,
                    "url": i.url,
                }
                for i in instances
            ]
        }

    @app.delete("/api/services/{service}")
    async def stop_service(service: str, node_id: str) -> dict[str, Any]:
        """Stop a managed service on the given node."""
        node_info = agent_supervisor.get_node_info(node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Unknown node_id {node_id!r}")

        import httpx
        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                resp = await client.post(f"{node_info.agent_url}/services/{service}/stop")
                resp.raise_for_status()
                return {"service": service, "node_id": node_id, "stopped": resp.json().get("stopped", False)}
            except httpx.HTTPError as exc:
                raise HTTPException(502, detail=f"Agent unreachable: {exc}")

    return app
|
||||
|
|
@ -1,197 +0,0 @@
|
|||
"""
|
||||
cf-orch coordinator auth middleware.
|
||||
|
||||
When HEIMDALL_URL is set, all /api/* requests (except /api/health) must carry:
|
||||
Authorization: Bearer <CF license key>
|
||||
|
||||
The key is validated against Heimdall and the result cached for
|
||||
CACHE_TTL_S seconds (default 300 / 5 min). This keeps Heimdall out of the
|
||||
per-allocation hot path while keeping revocation latency bounded.
|
||||
|
||||
When HEIMDALL_URL is not set, auth is disabled — self-hosted deployments work
|
||||
with no configuration change.
|
||||
|
||||
Environment variables
|
||||
---------------------
|
||||
HEIMDALL_URL Heimdall base URL, e.g. https://license.circuitforge.tech
|
||||
When absent, auth is skipped entirely.
|
||||
HEIMDALL_MIN_TIER Minimum tier required (default: "paid").
|
||||
Accepted values: free, paid, premium, ultra.
|
||||
CF_ORCH_AUTH_SECRET Shared secret sent to Heimdall so it can distinguish
|
||||
coordinator service calls from end-user requests.
|
||||
Must match the COORDINATOR_SECRET env var on Heimdall.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from threading import Lock
|
||||
|
||||
import httpx
|
||||
from fastapi import Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Unauthenticated paths — health check must always be accessible for monitoring.
|
||||
_EXEMPT_PATHS: frozenset[str] = frozenset({"/api/health", "/", "/openapi.json", "/docs", "/redoc"})
|
||||
|
||||
_TIER_ORDER: dict[str, int] = {"free": 0, "paid": 1, "premium": 2, "ultra": 3}
|
||||
|
||||
CACHE_TTL_S: float = 300.0 # 5 minutes — matches Kiwi cloud session TTL
|
||||
|
||||
|
||||
@dataclass
class _CacheEntry:
    """Cached Heimdall validation result for a single license key."""

    # Whether Heimdall accepted the key at validation time.
    valid: bool
    # Tier string returned by Heimdall (e.g. "paid"); empty when validation failed.
    tier: str
    # User id returned by Heimdall for this key; empty when validation failed.
    user_id: str
    # time.monotonic() deadline after which this entry is considered stale.
    expires_at: float
|
||||
|
||||
|
||||
class _ValidationCache:
    """Thread-safe TTL cache for Heimdall validation results."""

    def __init__(self, ttl_s: float = CACHE_TTL_S) -> None:
        self._ttl = ttl_s
        self._store: dict[str, _CacheEntry] = {}
        self._lock = Lock()

    def get(self, key: str) -> _CacheEntry | None:
        """Return the live entry for *key*, or None when absent or expired."""
        with self._lock:
            entry = self._store.get(key)
            if entry is not None and time.monotonic() <= entry.expires_at:
                return entry
            return None

    def set(self, key: str, valid: bool, tier: str, user_id: str) -> None:
        """Store a validation result stamped with a fresh expiry deadline."""
        entry = _CacheEntry(
            valid=valid,
            tier=tier,
            user_id=user_id,
            expires_at=time.monotonic() + self._ttl,
        )
        with self._lock:
            self._store[key] = entry

    def evict(self, key: str) -> None:
        """Drop *key* from the cache; a no-op when it is not cached."""
        with self._lock:
            self._store.pop(key, None)

    def prune(self) -> int:
        """Remove expired entries. Returns count removed."""
        now = time.monotonic()
        with self._lock:
            stale = [key for key, entry in self._store.items() if now > entry.expires_at]
            for key in stale:
                del self._store[key]
            return len(stale)
|
||||
|
||||
|
||||
class HeimdallAuthMiddleware:
    """
    ASGI middleware that validates CF license keys against Heimdall.

    Attach to a FastAPI app via app.middleware("http"):

        middleware = HeimdallAuthMiddleware.from_env()
        if middleware:
            app.middleware("http")(middleware)
    """

    def __init__(
        self,
        heimdall_url: str,
        min_tier: str = "paid",
        auth_secret: str = "",
        cache_ttl_s: float = CACHE_TTL_S,
    ) -> None:
        """
        Args:
            heimdall_url: Heimdall base URL; trailing slashes are stripped.
            min_tier: Minimum tier name required; unknown names rank as "paid".
            auth_secret: Shared coordinator secret forwarded to Heimdall.
            cache_ttl_s: Lifetime of cached validation results.
        """
        self._heimdall = heimdall_url.rstrip("/")
        # Unknown tier names fall back to rank 1 ("paid") rather than erroring.
        self._min_tier_rank = _TIER_ORDER.get(min_tier, 1)
        self._min_tier = min_tier
        self._auth_secret = auth_secret
        self._cache = _ValidationCache(ttl_s=cache_ttl_s)
        logger.info(
            "[cf-orch auth] Heimdall auth enabled — url=%s min_tier=%s ttl=%ss",
            self._heimdall, min_tier, cache_ttl_s,
        )

    @classmethod
    def from_env(cls) -> "HeimdallAuthMiddleware | None":
        """Return a configured middleware instance, or None if HEIMDALL_URL is not set."""
        url = os.environ.get("HEIMDALL_URL", "")
        if not url:
            logger.info("[cf-orch auth] HEIMDALL_URL not set — auth disabled (self-hosted mode)")
            return None
        return cls(
            heimdall_url=url,
            min_tier=os.environ.get("HEIMDALL_MIN_TIER", "paid"),
            auth_secret=os.environ.get("CF_ORCH_AUTH_SECRET", ""),
        )

    def _validate_against_heimdall(self, license_key: str) -> tuple[bool, str, str]:
        """
        Call Heimdall's /licenses/verify endpoint.

        Returns (valid, tier, user_id).
        On any network or parse error, returns (False, "", "") — fail closed.

        Blocking: performs a synchronous HTTP call with a 5 s timeout; callers
        on the event loop must run this in a worker thread (see __call__).
        """
        try:
            headers: dict[str, str] = {"Content-Type": "application/json"}
            if self._auth_secret:
                # Lets Heimdall distinguish coordinator calls from end users.
                headers["X-Coordinator-Secret"] = self._auth_secret
            resp = httpx.post(
                f"{self._heimdall}/licenses/verify",
                json={"key": license_key, "min_tier": self._min_tier},
                headers=headers,
                timeout=5.0,
            )
            if resp.status_code == 200:
                data = resp.json()
                return data.get("valid", False), data.get("tier", ""), data.get("user_id", "")
            # 401/403 from Heimdall = key invalid/insufficient tier
            logger.debug("[cf-orch auth] Heimdall returned %s for key ...%s", resp.status_code, license_key[-6:])
            return False, "", ""
        except Exception as exc:
            logger.warning("[cf-orch auth] Heimdall unreachable — failing closed: %s", exc)
            return False, "", ""

    def _tier_denial(self, valid: bool, tier: str) -> tuple[bool, str]:
        """Map a (valid, tier) pair to the (authorized, reason_if_denied) contract."""
        if not valid:
            return False, "license key invalid or expired"
        if _TIER_ORDER.get(tier, -1) < self._min_tier_rank:
            return False, f"feature requires {self._min_tier} tier (have: {tier})"
        return True, ""

    def _check_key(self, license_key: str) -> tuple[bool, str]:
        """
        Validate key (cache-first). Returns (authorized, reason_if_denied).

        Negative results are cached too, so a bad key does not hammer Heimdall.
        Blocking on cache miss — see _validate_against_heimdall.
        """
        cached = self._cache.get(license_key)
        if cached is not None:
            return self._tier_denial(cached.valid, cached.tier)

        valid, tier, user_id = self._validate_against_heimdall(license_key)
        self._cache.set(license_key, valid=valid, tier=tier, user_id=user_id)
        return self._tier_denial(valid, tier)

    async def __call__(self, request: Request, call_next):  # type: ignore[no-untyped-def]
        """Enforce Bearer license auth on all non-exempt paths."""
        import asyncio

        if request.url.path in _EXEMPT_PATHS:
            return await call_next(request)

        auth_header = request.headers.get("Authorization", "")
        if not auth_header.startswith("Bearer "):
            return JSONResponse(
                status_code=401,
                content={"detail": "Authorization: Bearer <license_key> required"},
            )

        license_key = auth_header.removeprefix("Bearer ").strip()
        # BUGFIX: _check_key does a synchronous HTTP call (up to 5 s) on cache
        # miss; running it directly here stalled the entire event loop. Offload
        # to a worker thread so other requests keep flowing.
        authorized, reason = await asyncio.to_thread(self._check_key, license_key)
        if not authorized:
            return JSONResponse(status_code=403, content={"detail": reason})

        return await call_next(request)
|
||||
|
|
@ -1,473 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>cf-orch · dashboard</title>
|
||||
<style>
|
||||
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
||||
|
||||
:root {
|
||||
--bg: #0d1117;
|
||||
--bg2: #161b22;
|
||||
--bg3: #1c2129;
|
||||
--border: #30363d;
|
||||
--border-dim: #21262d;
|
||||
--text: #e6edf3;
|
||||
--muted: #8b949e;
|
||||
--dim: #4d5763;
|
||||
--indigo: #818cf8;
|
||||
--cyan: #22d3ee;
|
||||
--green: #4ade80;
|
||||
--amber: #fbbf24;
|
||||
--red: #f85149;
|
||||
--orange: #fb923c;
|
||||
--radius: 6px;
|
||||
--radius-sm: 3px;
|
||||
--font: 'JetBrains Mono', 'Fira Code', ui-monospace, monospace;
|
||||
}
|
||||
|
||||
body { background: var(--bg); color: var(--text); font-family: var(--font); font-size: 13px; line-height: 1.5; padding: 1rem; }
|
||||
|
||||
/* header */
|
||||
header { display: flex; align-items: center; gap: 1rem; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid var(--border); }
|
||||
.logo { color: var(--indigo); font-size: 1.1em; font-weight: 700; }
|
||||
#refresh-badge { margin-left: auto; font-size: 0.75em; color: var(--dim); }
|
||||
#refresh-badge span { color: var(--green); }
|
||||
|
||||
/* section labels */
|
||||
.section-label { font-size: 0.72em; font-weight: 600; text-transform: uppercase; letter-spacing: 0.07em; color: var(--dim); margin-bottom: 0.5rem; }
|
||||
|
||||
/* health strip */
|
||||
#health-strip { display: flex; flex-wrap: wrap; gap: 0.4rem; margin-bottom: 1rem; padding: 0.6rem 0.75rem; background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); min-height: 36px; }
|
||||
.pill { display: inline-flex; align-items: center; gap: 0.3rem; padding: 2px 10px; border-radius: 99px; font-size: 0.8em; font-weight: 600; }
|
||||
.pill.ok { background: rgba(74,222,128,.12); color: var(--green); }
|
||||
.pill.err { background: rgba(248,81,73,.12); color: var(--red); }
|
||||
.pill.off { background: rgba(139,148,158,.1); color: var(--dim); }
|
||||
|
||||
/* GPU grid */
|
||||
#gpu-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 0.6rem; margin-bottom: 1rem; }
|
||||
.gpu-card { background: var(--bg3); border: 1px solid var(--border); border-radius: var(--radius); padding: 0.7rem 0.8rem; }
|
||||
.gpu-card.offline { border-color: #7c2d12; opacity: 0.7; }
|
||||
.gpu-node { font-size: 0.75em; font-weight: 700; color: var(--indigo); margin-bottom: 1px; }
|
||||
.gpu-offline .gpu-node { color: var(--orange); }
|
||||
.gpu-name { font-size: 0.78em; color: var(--text); margin-bottom: 0.4rem; }
|
||||
.vram-track { position: relative; background: var(--bg); border-radius: var(--radius-sm); height: 6px; margin-bottom: 0.3rem; overflow: hidden; }
|
||||
.vram-leased { position: absolute; left: 0; top: 0; height: 100%; background: var(--cyan); transition: width 0.4s; }
|
||||
.vram-resident { position: absolute; top: 0; height: 100%; background: var(--amber); transition: left 0.4s, width 0.4s; }
|
||||
.vram-label { font-size: 0.72em; color: var(--muted); margin-bottom: 0.25rem; }
|
||||
.gpu-status { font-size: 0.72em; }
|
||||
.gpu-status.idle { color: var(--green); }
|
||||
.gpu-status.busy { color: var(--amber); }
|
||||
.gpu-status.full { color: var(--red); }
|
||||
.gpu-status.offline { color: var(--orange); }
|
||||
.spark-track { height: 24px; background: var(--bg); border-radius: var(--radius-sm); margin-top: 0.4rem; overflow: hidden; }
|
||||
|
||||
/* shared table base */
|
||||
.cf-table { width: 100%; border-collapse: collapse; background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); overflow: hidden; margin-bottom: 1rem; }
|
||||
.cf-table th { background: var(--bg3); color: var(--dim); font-size: 0.72em; font-weight: 600; text-transform: uppercase; letter-spacing: 0.05em; padding: 0.4rem 0.6rem; text-align: left; border-bottom: 1px solid var(--border); }
|
||||
.cf-table td { padding: 0.35rem 0.6rem; border-bottom: 1px solid var(--border-dim); font-size: 0.8em; vertical-align: middle; }
|
||||
.cf-table tr:last-child td { border-bottom: none; }
|
||||
.td-service { color: var(--indigo); font-weight: 600; }
|
||||
.td-node { color: var(--muted); }
|
||||
.td-mb { color: var(--text); }
|
||||
.td-priority { color: var(--amber); }
|
||||
.td-model { color: var(--cyan); font-size: 0.75em; }
|
||||
.td-warm { color: var(--amber); }
|
||||
.td-none { color: var(--dim); font-style: italic; }
|
||||
.ttl-wrap { display: flex; align-items: center; gap: 0.5rem; }
|
||||
.ttl-label { color: var(--cyan); font-variant-numeric: tabular-nums; white-space: nowrap; }
|
||||
.ttl-track { flex: 1; background: var(--bg); border-radius: var(--radius-sm); height: 4px; }
|
||||
.ttl-fill { height: 100%; border-radius: var(--radius-sm); background: var(--cyan); transition: width 0.4s; }
|
||||
|
||||
/* service state classes */
|
||||
.state-running { color: #2ecc40; }
|
||||
.state-idle { color: #ff851b; }
|
||||
.state-stopped { color: #aaa; }
|
||||
.state-starting { color: #0074d9; }
|
||||
.state-unknown { color: #ff4136; }
|
||||
|
||||
/* error */
|
||||
#error-banner { display: none; background: rgba(248,81,73,.1); border: 1px solid var(--red); border-radius: var(--radius); color: var(--red); padding: 0.5rem 0.75rem; font-size: 0.82em; margin-bottom: 1rem; }
|
||||
|
||||
/* footer */
|
||||
footer { border-top: 1px solid var(--border); padding-top: 0.5rem; color: var(--dim); font-size: 0.72em; display: flex; gap: 1.5rem; }
|
||||
footer a { color: var(--indigo); text-decoration: none; }
|
||||
footer a:hover { text-decoration: underline; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<header>
|
||||
<span class="logo">cf-orch</span>
|
||||
<span id="cluster-label" style="color:var(--muted)">coordinator</span>
|
||||
<div id="refresh-badge">auto-refresh <span id="countdown">5</span>s</div>
|
||||
</header>
|
||||
|
||||
<div id="error-banner"></div>
|
||||
|
||||
<div class="section-label">Services</div>
|
||||
<div id="health-strip"></div>
|
||||
|
||||
<div class="section-label">GPU Nodes</div>
|
||||
<div id="gpu-grid"></div>
|
||||
|
||||
<div id="services-section">
|
||||
<div class="section-label">Service Instances</div>
|
||||
<table class="cf-table" id="services-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Service</th><th>Node</th><th>GPU</th><th>State</th><th>Model</th><th>URL</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="services-body"></tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="section-label">Active Leases</div>
|
||||
<table class="cf-table" id="leases-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Service</th><th>Node / GPU</th><th>VRAM</th><th>Priority</th><th>TTL / Expires</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="leases-body"></tbody>
|
||||
</table>
|
||||
|
||||
<div class="section-label">Warm Models</div>
|
||||
<table class="cf-table" id="resident-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Service</th><th>Node</th><th>Model</th><th>Warm Since</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="resident-body"></tbody>
|
||||
</table>
|
||||
|
||||
<footer>
|
||||
<span>cf-orch · circuitforge-core</span>
|
||||
<a href="/api/nodes" target="_blank">/api/nodes</a>
|
||||
<a href="/api/leases" target="_blank">/api/leases</a>
|
||||
<a href="/api/resident" target="_blank">/api/resident</a>
|
||||
<a href="/api/services" target="_blank">/api/services</a>
|
||||
<a href="/api/health" target="_blank">/api/health</a>
|
||||
</footer>
|
||||
|
||||
<script>
|
||||
"use strict";
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────
|
||||
|
||||
/** Create an element; opts may carry cls (space-separated), text, style, attr. */
function el(tag, opts) {
  const node = document.createElement(tag);
  const o = opts || {};
  if (o.cls) {
    for (const c of o.cls.split(' ')) {
      if (c) node.classList.add(c);
    }
  }
  if (o.text != null) node.textContent = o.text;
  if (o.style) Object.assign(node.style, o.style);
  if (o.attr) {
    for (const [name, value] of Object.entries(o.attr)) {
      node.setAttribute(name, value);
    }
  }
  return node;
}
|
||||
|
||||
/** Append each truthy child to parent; falsy entries are skipped. Returns parent. */
function append(parent, ...children) {
  for (const child of children) {
    if (child) parent.appendChild(child);
  }
  return parent;
}
|
||||
|
||||
/** Remove every existing child of parent, then append the given children. */
function setChildren(parent, ...children) {
  for (const stale of Array.from(parent.childNodes)) {
    parent.removeChild(stale);
  }
  append(parent, ...children);
}
|
||||
|
||||
/** Build a sparkline SVG element (no innerHTML). */
function buildSparkline(history, totalMb) {
  const ns = 'http://www.w3.org/2000/svg';
  const svg = document.createElementNS(ns, 'svg');
  svg.setAttribute('width', '100%');
  svg.setAttribute('height', '16');
  svg.setAttribute('viewBox', '0 0 100 16');

  // Fewer than two samples: render a flat baseline instead of a polyline.
  if (!history || history.length < 2) {
    const line = document.createElementNS(ns, 'line');
    line.setAttribute('x1', '0'); line.setAttribute('y1', '14');
    line.setAttribute('x2', '100'); line.setAttribute('y2', '14');
    line.setAttribute('stroke', '#30363d'); line.setAttribute('stroke-width', '1');
    svg.appendChild(line);
    return svg;
  }

  // Scale x across the 100-unit viewBox; y is mapped into a 12-unit band
  // with y=14 as the zero baseline. Math.max(totalMb, 1) avoids divide-by-zero.
  const max = Math.max(totalMb, 1);
  const pts = history.map((v, i) => {
    const x = (i / (history.length - 1)) * 100;
    const y = 14 - ((v / max) * 12);
    return x.toFixed(1) + ',' + y.toFixed(1);
  }).join(' ');

  const poly = document.createElementNS(ns, 'polyline');
  poly.setAttribute('points', pts);
  poly.setAttribute('fill', 'none');
  poly.setAttribute('stroke', '#818cf8');
  poly.setAttribute('stroke-width', '1.5');
  poly.setAttribute('stroke-linejoin', 'round');
  svg.appendChild(poly);
  return svg;
}
|
||||
|
||||
/** VRAM fill colour for a utilisation fraction: red ≥ 0.9, amber ≥ 0.7, else cyan. */
function vramColor(pct) {
  return pct >= 0.9 ? '#f85149'
       : pct >= 0.7 ? '#fbbf24'
       : '#22d3ee';
}
|
||||
|
||||
// ── sparkline history ────────────────────────────────────────────
|
||||
// keyed "nodeId:gpuId" → array of vram_used_mb, max 20 samples
|
||||
const sparkHistory = {};
|
||||
|
||||
// ── countdown ────────────────────────────────────────────────────
|
||||
let countdown = 5;
|
||||
setInterval(() => {
|
||||
countdown = countdown <= 1 ? 5 : countdown - 1;
|
||||
document.getElementById('countdown').textContent = countdown;
|
||||
}, 1000);
|
||||
|
||||
// ── state class helper ───────────────────────────────────────────
/** CSS class for a service lifecycle state; anything unrecognised maps to state-unknown. */
function stateClass(state) {
  const map = { running: 'state-running', idle: 'state-idle', stopped: 'state-stopped', starting: 'state-starting' };
  // BUGFIX: a bare map[state] lookup also hits keys inherited from
  // Object.prototype ('constructor', 'toString', ...), returning a function
  // instead of a class name for such states. Guard with hasOwnProperty.
  return Object.prototype.hasOwnProperty.call(map, state) ? map[state] : 'state-unknown';
}
|
||||
|
||||
// ── render: services table ───────────────────────────────────────
/** Rebuild the service-instances table body from /api/services data. */
function renderServices(services) {
  const tbody = document.getElementById('services-body');
  // Missing or empty list → single full-width placeholder row.
  if (!services || services.length === 0) {
    const tr = document.createElement('tr');
    const td = el('td', { cls: 'td-none', text: 'No service instances registered.' });
    td.setAttribute('colspan', '6');
    tr.appendChild(td);
    setChildren(tbody, tr);
    return;
  }

  const rows = services.map(svc => {
    const tr = document.createElement('tr');
    // One <td> per column, in table-header order; '—' (U+2014) marks absent values.
    const fields = [
      { text: svc.service, cls: 'td-service' },
      { text: svc.node_id, cls: 'td-node' },
      { text: String(svc.gpu_id), cls: 'td-mb' },
      { text: svc.state, cls: stateClass(svc.state) },
      { text: svc.model || '\u2014', cls: 'td-model' },
      { text: svc.url || '\u2014', cls: 'td-node' },
    ];
    fields.forEach(f => tr.appendChild(el('td', { cls: f.cls, text: f.text })));
    return tr;
  });

  setChildren(tbody, ...rows);
}
|
||||
|
||||
// ── render: health strip ─────────────────────────────────────────
/** Show a single coordinator pill: green ● when reachable, red ✕ otherwise. */
function renderHealth(ok) {
  const strip = document.getElementById('health-strip');
  const pill = el('span', { cls: 'pill ' + (ok ? 'ok' : 'err'), text: (ok ? '● ' : '✕ ') + 'coordinator' });
  setChildren(strip, pill);
}
|
||||
|
||||
// ── render: GPU grid ─────────────────────────────────────────────
// leasedByGpu: "nodeId:gpuId" → total MB currently leased (from active leases)
/** Rebuild the GPU card grid: stacked VRAM bar, breakdown label, status, sparkline. */
// NOTE(review): CSS defines .gpu-card.offline / .gpu-offline styling but no
// card is ever given those classes here — confirm whether offline rendering
// was meant to be wired up.
function renderNodes(nodes, leasedByGpu) {
  const grid = document.getElementById('gpu-grid');
  if (!nodes || nodes.length === 0) {
    setChildren(grid, el('div', { text: 'No nodes registered.', style: { color: 'var(--dim)', fontSize: '0.8em', padding: '0.5rem' } }));
    return;
  }

  const cards = [];
  for (const node of nodes) {
    for (const gpu of node.gpus) {
      const key = node.node_id + ':' + gpu.gpu_id;
      const total = gpu.vram_total_mb || 1; // guard divide-by-zero below
      const used = gpu.vram_used_mb;
      const leased = leasedByGpu[key] || 0;
      // Resident = nvidia-smi used minus actively leased; clamped to [0, used].
      const resident = Math.max(0, Math.min(used - leased, used));
      const pct = used / total;

      // Rolling per-GPU usage history feeds the sparkline (max 20 samples).
      if (!sparkHistory[key]) sparkHistory[key] = [];
      sparkHistory[key].push(used);
      if (sparkHistory[key].length > 20) sparkHistory[key].shift();

      const statusCls = pct >= 0.9 ? 'full' : pct >= 0.1 ? 'busy' : 'idle';
      const statusText = pct >= 0.9 ? 'saturated' : pct >= 0.1 ? Math.round(pct * 100) + '% used' : 'idle';

      const card = el('div', { cls: 'gpu-card' });
      const nodeLabel = el('div', { cls: 'gpu-node', text: node.node_id.toUpperCase() + ' · GPU ' + gpu.gpu_id });
      const nameLine = el('div', { cls: 'gpu-name', text: gpu.name || 'Unknown GPU' });

      // Stacked bar: cyan (leased) → amber (resident) → dark bg (free).
      const leasedPct = (leased / total * 100).toFixed(1);
      const residentPct = (resident / total * 100).toFixed(1);
      const track = el('div', { cls: 'vram-track' });
      const fillLeased = el('div', { cls: 'vram-leased', style: { width: leasedPct + '%' } });
      const fillResident = el('div', { cls: 'vram-resident', style: { left: leasedPct + '%', width: residentPct + '%' } });
      append(track, fillLeased, fillResident);

      // Breakdown label when something is allocated.
      let labelText = (used / 1024).toFixed(1) + ' / ' + (total / 1024).toFixed(1) + ' GB';
      if (leased > 0 || resident > 0) {
        const parts = [];
        if (leased > 0) parts.push((leased / 1024).toFixed(1) + 'G leased');
        if (resident > 0) parts.push((resident / 1024).toFixed(1) + 'G resident');
        labelText += ' (' + parts.join(' · ') + ')';
      }

      const vramLbl = el('div', { cls: 'vram-label', text: labelText });
      const statusEl = el('div', { cls: 'gpu-status ' + statusCls, text: statusText });
      const sparkTrack = el('div', { cls: 'spark-track' });
      sparkTrack.appendChild(buildSparkline(sparkHistory[key], total));

      append(card, nodeLabel, nameLine, track, vramLbl, statusEl, sparkTrack);
      cards.push(card);
    }
  }

  setChildren(grid, ...cards);
}
|
||||
|
||||
// ── render: warm models table ────────────────────────────────────
/** Rebuild the warm-models table, showing how long each model has been warm. */
function renderResidents(residents) {
  const tbody = document.getElementById('resident-body');
  if (!residents || residents.length === 0) {
    const tr = document.createElement('tr');
    const td = el('td', { cls: 'td-none', text: 'No warm models detected.' });
    td.setAttribute('colspan', '4');
    tr.appendChild(td);
    setChildren(tbody, tr);
    return;
  }

  const now = Date.now() / 1000;
  const rows = residents.map(r => {
    // Age since first_seen (epoch seconds; missing first_seen reads as 0s),
    // formatted as Ns / Nm SSs / Nh MMm depending on magnitude.
    const warmSecs = now - (r.first_seen || now);
    const warmText = warmSecs < 60
      ? Math.floor(warmSecs) + 's'
      : warmSecs < 3600
        ? Math.floor(warmSecs / 60) + 'm ' + String(Math.floor(warmSecs % 60)).padStart(2, '0') + 's'
        : Math.floor(warmSecs / 3600) + 'h ' + String(Math.floor((warmSecs % 3600) / 60)).padStart(2, '0') + 'm';

    const tr = document.createElement('tr');
    append(tr,
      el('td', { cls: 'td-service', text: r.service }),
      el('td', { cls: 'td-node', text: r.node_id }),
      el('td', { cls: 'td-model', text: r.model_name || '—' }),
      el('td', { cls: 'td-warm', text: warmText }),
    );
    return tr;
  });

  setChildren(tbody, ...rows);
}
|
||||
|
||||
// ── render: leases table ─────────────────────────────────────────
/** Rebuild the active-leases table, including a TTL countdown bar per lease. */
function renderLeases(leases) {
  const tbody = document.getElementById('leases-body');
  if (!leases || leases.length === 0) {
    const tr = document.createElement('tr');
    const td = el('td', { cls: 'td-none', text: 'No active leases.' });
    td.setAttribute('colspan', '5');
    tr.appendChild(td);
    setChildren(tbody, tr);
    return;
  }

  const now = Date.now() / 1000;
  const rows = leases.map(lease => {
    // Human-readable grant size: one-decimal GB from 1024 MB upwards.
    const mbGb = lease.mb_granted >= 1024
      ? (lease.mb_granted / 1024).toFixed(1) + ' GB'
      : lease.mb_granted + ' MB';

    const tr = document.createElement('tr');

    const tdService = el('td', { cls: 'td-service', text: lease.holder_service });
    const tdNode = el('td', { cls: 'td-node', text: lease.node_id + ' / GPU ' + lease.gpu_id });
    const tdMb = el('td', { cls: 'td-mb', text: mbGb });
    const tdPriority = el('td', { cls: 'td-priority', text: 'p' + lease.priority });

    const tdTtl = document.createElement('td');
    if (!lease.expires_at) {
      // Falsy expires_at = no TTL; render the infinity glyph, no bar.
      tdTtl.appendChild(el('span', { cls: 'ttl-label', text: '∞' }));
    } else {
      // expires_at compared against Date.now()/1000, so it is epoch seconds.
      const remaining = Math.max(0, lease.expires_at - now);
      // Bar reads full at 300 s remaining — presumably matches the usual
      // 5-minute TTL; TODO confirm against coordinator defaults.
      const pct = Math.min(100, (remaining / 300) * 100);
      const mins = Math.floor(remaining / 60);
      const secs = Math.floor(remaining % 60);
      const label = remaining > 60
        ? mins + 'm ' + String(secs).padStart(2, '0') + 's'
        : Math.floor(remaining) + 's';

      const wrap = el('div', { cls: 'ttl-wrap' });
      const lbl = el('span', { cls: 'ttl-label', text: label });
      const track = el('div', { cls: 'ttl-track' });
      const fill = el('div', { cls: 'ttl-fill', style: { width: pct.toFixed(1) + '%' } });
      track.appendChild(fill);
      append(wrap, lbl, track);
      tdTtl.appendChild(wrap);
    }

    append(tr, tdService, tdNode, tdMb, tdPriority, tdTtl);
    return tr;
  });

  setChildren(tbody, ...rows);
}
|
||||
|
||||
// ── error banner ─────────────────────────────────────────────────
/** Show the red error banner with the given message. */
// NOTE(review): local 'el' shadows the el() helper — harmless here since the
// helper is not used in this function, but rename if it ever is.
function showError(msg) {
  const el = document.getElementById('error-banner');
  el.textContent = msg; // textContent — safe
  el.style.display = 'block';
}
/** Hide the error banner. */
function clearError() { document.getElementById('error-banner').style.display = 'none'; }
|
||||
|
||||
// ── poll ─────────────────────────────────────────────────────────
/** Fetch all dashboard endpoints in parallel and re-render every section.
 *  nodes + leases are required (failure shows the error banner); resident
 *  and services degrade to empty lists when their endpoints fail. */
async function poll() {
  try {
    const [nodesRes, leasesRes, residentRes, healthRes, servicesRes] = await Promise.all([
      fetch('/api/nodes'),
      fetch('/api/leases'),
      fetch('/api/resident'),
      fetch('/api/health'),
      fetch('/api/services'),
    ]);
    // BUGFIX: report the status of whichever required fetch actually failed —
    // the message previously always showed nodesRes.status, even when only
    // /api/leases had errored.
    const failed = [nodesRes, leasesRes].find(r => !r.ok);
    if (failed) throw new Error('API error: ' + failed.status);
    const [nodesData, leasesData, residentData, servicesData] = await Promise.all([
      nodesRes.json(), leasesRes.json(),
      residentRes.ok ? residentRes.json() : Promise.resolve({ residents: [] }),
      servicesRes.ok ? servicesRes.json() : Promise.resolve({ services: [] }),
    ]);

    // Build per-GPU leased-MB index for the stacked bar.
    const leasedByGpu = {};
    for (const lease of (leasesData.leases || [])) {
      const key = lease.node_id + ':' + lease.gpu_id;
      leasedByGpu[key] = (leasedByGpu[key] || 0) + lease.mb_granted;
    }

    clearError();
    renderHealth(healthRes.ok);
    renderNodes(nodesData.nodes || [], leasedByGpu);
    renderServices(servicesData.services || []);
    renderLeases(leasesData.leases || []);
    renderResidents(residentData.residents || []);
  } catch (err) {
    showError('Failed to reach coordinator: ' + err.message);
    renderHealth(false);
  }
}
|
||||
|
||||
poll();
|
||||
setInterval(poll, 5000);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.models import VRAMLease
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DEFAULT_EVICTION_TIMEOUT_S = 10.0
|
||||
|
||||
|
||||
class EvictionEngine:
    """Grants VRAM leases, evicting lower-priority holders when a GPU is full."""

    def __init__(
        self,
        lease_manager: LeaseManager,
        eviction_timeout_s: float = _DEFAULT_EVICTION_TIMEOUT_S,
    ) -> None:
        self.lease_manager = lease_manager
        self._timeout = eviction_timeout_s

    async def request_lease(
        self,
        node_id: str,
        gpu_id: int,
        mb: int,
        service: str,
        priority: int,
        agent_url: str,
        ttl_s: float = 0.0,
    ) -> VRAMLease | None:
        """
        Try to lease *mb* MB on node/GPU for *service*, evicting if necessary.

        Returns the granted lease, or None when no candidates exist or the
        freed VRAM did not become grantable within the timeout.
        """
        # Fast path: enough free VRAM already.
        grant = await self.lease_manager.try_grant(
            node_id, gpu_id, mb, service, priority, ttl_s
        )
        if grant is not None:
            return grant

        victims = self.lease_manager.get_eviction_candidates(
            node_id=node_id, gpu_id=gpu_id,
            needed_mb=mb, requester_priority=priority,
        )
        if not victims:
            logger.info(
                "No eviction candidates for %s on %s:GPU%d (%dMB needed)",
                service, node_id, gpu_id, mb,
            )
            return None

        # NOTE(review): candidates may collectively free less than `mb`
        # (get_eviction_candidates returns what it has); we evict anyway and
        # rely on the poll below — confirm this partial eviction is intended.
        reclaimable = sum(victim.mb_granted for victim in victims)
        logger.info(
            "Evicting %d lease(s) to free %dMB for %s",
            len(victims), reclaimable, service,
        )
        for victim in victims:
            await self._evict_lease(victim, agent_url)

        # Poll until the freed VRAM becomes grantable or the deadline passes.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + self._timeout
        while loop.time() < deadline:
            grant = await self.lease_manager.try_grant(
                node_id, gpu_id, mb, service, priority, ttl_s
            )
            if grant is not None:
                return grant
            await asyncio.sleep(0.1)

        logger.warning("Eviction timed out for %s after %.1fs", service, self._timeout)
        return None

    async def _evict_lease(self, lease: VRAMLease, agent_url: str) -> None:
        """Release lease accounting. Process-level eviction deferred to Plan B."""
        await self.lease_manager.release(lease.lease_id)

    async def _call_agent_evict(self, agent_url: str, lease: VRAMLease) -> bool:
        """POST /evict to the agent. Stub for v1 — real process lookup in Plan B."""
        return True
|
||||
|
|
@ -1,130 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from collections import defaultdict
|
||||
|
||||
from circuitforge_core.resources.models import ResidentAllocation, VRAMLease
|
||||
|
||||
|
||||
class LeaseManager:
|
||||
    def __init__(self) -> None:
        """Initialise empty lease, GPU-capacity, and resident-tracking state."""
        # lease_id → lease: single source of truth for active grants.
        self._leases: dict[str, VRAMLease] = {}
        # (node_id, gpu_id) → total VRAM MB, filled by register_gpu().
        self._gpu_total: dict[tuple[str, int], int] = {}
        # (node_id, gpu_id) → MB currently granted; missing keys read as 0.
        self._gpu_used: dict[tuple[str, int], int] = defaultdict(int)
        # Serialises grant/release so capacity accounting stays consistent.
        self._lock = asyncio.Lock()
        # Resident allocations — keyed "node_id:service", updated by heartbeat.
        # No lock needed: only the single heartbeat task writes this dict.
        self._residents: dict[str, ResidentAllocation] = {}
|
||||
|
||||
def register_gpu(self, node_id: str, gpu_id: int, total_mb: int) -> None:
|
||||
self._gpu_total[(node_id, gpu_id)] = total_mb
|
||||
|
||||
def gpu_total_mb(self, node_id: str, gpu_id: int) -> int:
|
||||
return self._gpu_total.get((node_id, gpu_id), 0)
|
||||
|
||||
def used_mb(self, node_id: str, gpu_id: int) -> int:
|
||||
return self._gpu_used[(node_id, gpu_id)]
|
||||
|
||||
    async def try_grant(
        self,
        node_id: str,
        gpu_id: int,
        mb: int,
        service: str,
        priority: int,
        ttl_s: float = 0.0,
    ) -> VRAMLease | None:
        """Atomically grant *mb* MB on a GPU, or return None when it won't fit.

        The capacity check and the accounting update happen under the lock so
        two concurrent grants cannot both count the same free VRAM.
        Unregistered GPUs have total 0, so any positive request is refused.
        """
        async with self._lock:
            total = self._gpu_total.get((node_id, gpu_id), 0)
            used = self._gpu_used[(node_id, gpu_id)]
            if total - used < mb:
                return None
            lease = VRAMLease.create(
                gpu_id=gpu_id, node_id=node_id, mb=mb,
                service=service, priority=priority, ttl_s=ttl_s,
            )
            self._leases[lease.lease_id] = lease
            self._gpu_used[(node_id, gpu_id)] += mb
            return lease
|
||||
|
||||
async def release(self, lease_id: str) -> bool:
|
||||
async with self._lock:
|
||||
lease = self._leases.pop(lease_id, None)
|
||||
if lease is None:
|
||||
return False
|
||||
self._gpu_used[(lease.node_id, lease.gpu_id)] -= lease.mb_granted
|
||||
return True
|
||||
|
||||
def get_eviction_candidates(
|
||||
self,
|
||||
node_id: str,
|
||||
gpu_id: int,
|
||||
needed_mb: int,
|
||||
requester_priority: int,
|
||||
) -> list[VRAMLease]:
|
||||
candidates = [
|
||||
lease for lease in self._leases.values()
|
||||
if lease.node_id == node_id
|
||||
and lease.gpu_id == gpu_id
|
||||
and lease.priority > requester_priority
|
||||
]
|
||||
candidates.sort(key=lambda lease: lease.priority, reverse=True)
|
||||
selected: list[VRAMLease] = []
|
||||
freed = 0
|
||||
for candidate in candidates:
|
||||
selected.append(candidate)
|
||||
freed += candidate.mb_granted
|
||||
if freed >= needed_mb:
|
||||
break
|
||||
return selected
|
||||
|
||||
def list_leases(
|
||||
self, node_id: str | None = None, gpu_id: int | None = None
|
||||
) -> list[VRAMLease]:
|
||||
return [
|
||||
lease for lease in self._leases.values()
|
||||
if (node_id is None or lease.node_id == node_id)
|
||||
and (gpu_id is None or lease.gpu_id == gpu_id)
|
||||
]
|
||||
|
||||
def all_leases(self) -> list[VRAMLease]:
|
||||
return list(self._leases.values())
|
||||
|
||||
# ── resident tracking ────────────────────────────────────────────
|
||||
|
||||
def set_residents_for_node(
|
||||
self,
|
||||
node_id: str,
|
||||
residents: list[tuple[str, str | None]], # (service, model_name)
|
||||
) -> None:
|
||||
"""
|
||||
Replace the resident snapshot for a node.
|
||||
|
||||
Preserves first_seen for entries whose service+model_name are unchanged,
|
||||
so the dashboard can show how long a model has been warm.
|
||||
"""
|
||||
new_keys = {f"{node_id}:{service}" for service, _ in residents}
|
||||
|
||||
# Remove stale entries (service no longer running on this node).
|
||||
for key in list(self._residents):
|
||||
if key.startswith(f"{node_id}:") and key not in new_keys:
|
||||
del self._residents[key]
|
||||
|
||||
# Upsert: preserve first_seen when model is unchanged, reset otherwise.
|
||||
for service, model_name in residents:
|
||||
key = f"{node_id}:{service}"
|
||||
existing = self._residents.get(key)
|
||||
if existing is not None and existing.model_name == model_name:
|
||||
continue # same model still loaded — keep original first_seen
|
||||
self._residents[key] = ResidentAllocation(
|
||||
service=service,
|
||||
node_id=node_id,
|
||||
model_name=model_name,
|
||||
)
|
||||
|
||||
def all_residents(self) -> list[ResidentAllocation]:
|
||||
return list(self._residents.values())
|
||||
|
||||
def resident_keys(self) -> set[str]:
|
||||
"""Return set of 'node_id:service' strings for currently-warm services."""
|
||||
return set(self._residents.keys())
|
||||
|
|
@ -1,74 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
|
||||
_WARM_BONUS_MB = 1000
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _Scored:
|
||||
node_id: str
|
||||
gpu_id: int
|
||||
vram_free_mb: int
|
||||
effective_free_mb: int
|
||||
can_fit: bool
|
||||
warm: bool
|
||||
|
||||
|
||||
def select_node(
|
||||
agents: "dict[str, AgentRecord]",
|
||||
service: str,
|
||||
profile_registry: "ProfileRegistry",
|
||||
resident_keys: set[str],
|
||||
) -> tuple[str, int] | None:
|
||||
"""
|
||||
Pick the best (node_id, gpu_id) for the requested service.
|
||||
Warm nodes (service already running) get priority, then sorted by free VRAM.
|
||||
Returns None if no suitable node exists.
|
||||
"""
|
||||
service_max_mb = _find_service_max_mb(service, profile_registry)
|
||||
if service_max_mb is None:
|
||||
return None # service not in any profile
|
||||
|
||||
candidates: list[_Scored] = []
|
||||
for node_id, record in agents.items():
|
||||
if not record.online:
|
||||
continue
|
||||
for gpu in record.gpus:
|
||||
warm = f"{node_id}:{service}" in resident_keys
|
||||
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
|
||||
can_fit = gpu.vram_free_mb >= service_max_mb
|
||||
candidates.append(_Scored(
|
||||
node_id=node_id,
|
||||
gpu_id=gpu.gpu_id,
|
||||
vram_free_mb=gpu.vram_free_mb,
|
||||
effective_free_mb=effective,
|
||||
can_fit=can_fit,
|
||||
warm=warm,
|
||||
))
|
||||
if not candidates:
|
||||
return None
|
||||
# Prefer: (1) warm nodes (model already resident — no cold start)
|
||||
# (2) cold nodes that can fit the service (free >= half of max_mb)
|
||||
# Fallback: best-effort node when nothing fits and nothing is warm
|
||||
# (coordinator will attempt to start the service anyway; it may evict or fail)
|
||||
# Note: resident_keys are per-node, not per-GPU. On multi-GPU nodes, the warm
|
||||
# bonus applies to all GPUs on the node. This is a known coarseness —
|
||||
# per-GPU resident tracking requires a resident_key format change.
|
||||
preferred = [c for c in candidates if c.warm or c.can_fit]
|
||||
pool = preferred if preferred else candidates
|
||||
best = max(pool, key=lambda c: (c.warm, c.effective_free_mb))
|
||||
return best.node_id, best.gpu_id
|
||||
|
||||
|
||||
def _find_service_max_mb(service: str, profile_registry: "ProfileRegistry") -> int | None:
|
||||
for profile in profile_registry.list_public():
|
||||
svc = profile.services.get(service)
|
||||
if svc is not None:
|
||||
return svc.max_mb
|
||||
return None
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
"""
|
||||
circuitforge_core.resources.coordinator.node_store — SQLite persistence for known agent nodes.
|
||||
|
||||
Gives the coordinator restart-safe memory of which nodes have ever registered.
|
||||
On startup the coordinator reloads all known nodes and immediately probes them;
|
||||
nodes that respond come back online within one heartbeat cycle (~10 s) without
|
||||
any manual intervention on the agent hosts.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)

# Default on-disk location (XDG-style per-user data directory).
_DEFAULT_DB_PATH = Path.home() / ".local" / "share" / "circuitforge" / "cf-orch-nodes.db"
_STALE_AGE_DAYS = 30  # nodes unseen for this long are pruned automatically


class NodeStore:
    """
    Thin SQLite wrapper persisting known agent nodes across coordinator restarts.

    Thread-safe for single-writer use (the coordinator runs in one asyncio
    thread); reads go through the same connection.
    """

    def __init__(self, db_path: Path = _DEFAULT_DB_PATH) -> None:
        self.db_path = db_path
        db_path.parent.mkdir(parents=True, exist_ok=True)
        # check_same_thread=False: single logical writer, but the event
        # loop may touch the connection from helper threads.
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._conn.row_factory = sqlite3.Row
        self._migrate()
        logger.debug("NodeStore initialised at %s", db_path)

    def _migrate(self) -> None:
        """Create the schema if missing (idempotent)."""
        self._conn.executescript("""
            CREATE TABLE IF NOT EXISTS known_nodes (
                node_id TEXT PRIMARY KEY,
                agent_url TEXT NOT NULL,
                last_seen REAL NOT NULL
            );
        """)
        self._conn.commit()

    def upsert(self, node_id: str, agent_url: str) -> None:
        """Record or update a node. Called on every successful registration."""
        params = (node_id, agent_url, time.time())
        self._conn.execute(
            """
            INSERT INTO known_nodes (node_id, agent_url, last_seen)
            VALUES (?, ?, ?)
            ON CONFLICT(node_id) DO UPDATE SET
                agent_url = excluded.agent_url,
                last_seen = excluded.last_seen
            """,
            params,
        )
        self._conn.commit()

    def all(self) -> list[tuple[str, str]]:
        """Return all known (node_id, agent_url) pairs, most recent first."""
        cursor = self._conn.execute(
            "SELECT node_id, agent_url FROM known_nodes ORDER BY last_seen DESC"
        )
        return [(row["node_id"], row["agent_url"]) for row in cursor.fetchall()]

    def remove(self, node_id: str) -> None:
        """Forget one node permanently."""
        self._conn.execute("DELETE FROM known_nodes WHERE node_id = ?", (node_id,))
        self._conn.commit()

    def prune_stale(self, max_age_days: int = _STALE_AGE_DAYS) -> int:
        """Delete nodes not seen within max_age_days. Returns count removed."""
        cutoff = time.time() - max_age_days * 86400
        deleted = self._conn.execute(
            "DELETE FROM known_nodes WHERE last_seen < ?", (cutoff,)
        ).rowcount
        self._conn.commit()
        if deleted:
            logger.info("NodeStore: pruned %d stale node(s) (>%d days old)", deleted, max_age_days)
        return deleted

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()
|
||||
|
|
@ -1,65 +0,0 @@
|
|||
# circuitforge_core/resources/coordinator/profile_registry.py
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from circuitforge_core.resources.models import GpuInfo
|
||||
from circuitforge_core.resources.profiles.schema import GpuProfile, load_profile
|
||||
|
||||
_PUBLIC_DIR = Path(__file__).parent.parent / "profiles" / "public"

# VRAM thresholds for public profile selection (MB)
_PROFILE_THRESHOLDS = [
    (22000, "single-gpu-24gb"),
    (14000, "single-gpu-16gb"),
    (8000, "single-gpu-8gb"),
    (5500, "single-gpu-6gb"),
    (3500, "single-gpu-4gb"),
    (0, "single-gpu-2gb"),
]

_log = logging.getLogger(__name__)


class ProfileRegistry:
    """Registry of GPU profiles loaded from bundled and user-supplied YAML dirs."""

    def __init__(self, extra_dirs: list[Path] | None = None) -> None:
        self._profiles: dict[str, GpuProfile] = {}
        self._load_dir(_PUBLIC_DIR)
        for extra in extra_dirs or []:
            if extra.exists():
                self._load_dir(extra)

    def _load_dir(self, directory: Path) -> None:
        """Best-effort bulk load: a malformed profile is skipped, not fatal."""
        for candidate in directory.glob("*.yaml"):
            try:
                loaded = load_profile(candidate)
                self._profiles[loaded.name] = loaded
            except Exception as exc:
                _log.warning("Skipping %s: %s", candidate, exc)

    def load(self, path: Path) -> GpuProfile:
        """Load one profile file, register it under its declared name, return it."""
        loaded = load_profile(path)
        self._profiles[loaded.name] = loaded
        return loaded

    def list_public(self) -> list[GpuProfile]:
        # CPU profiles (cpu-*) are intentionally excluded — this endpoint
        # is used to match GPU hardware. CPU inference nodes self-select
        # their profile via the CLI and are not listed for lease matching.
        return [
            profile
            for profile in self._profiles.values()
            if profile.name.startswith("single-gpu-")
        ]

    def get(self, name: str) -> GpuProfile | None:
        """Look up a profile by name, or None."""
        return self._profiles.get(name)

    def auto_detect(self, gpus: list[GpuInfo]) -> GpuProfile:
        """Pick the best-matching public profile for the primary GPU's VRAM."""
        primary_vram = gpus[0].vram_total_mb if gpus else 0
        for floor_mb, profile_name in _PROFILE_THRESHOLDS:
            if primary_vram < floor_mb:
                continue
            match = self._profiles.get(profile_name)
            if match:
                return match
        # Smallest profile is the last resort for any hardware.
        return self._profiles["single-gpu-2gb"]
|
||||
|
|
@ -1,173 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
|
||||
@dataclass
class ServiceAllocation:
    """One caller's claim on a running service instance."""
    allocation_id: str
    service: str
    node_id: str
    gpu_id: int
    model: str | None
    caller: str
    url: str
    created_at: float
    expires_at: float  # 0 = no expiry


@dataclass
class ServiceInstance:
    """State of one running container, keyed by (service, node_id, gpu_id)."""
    service: str
    node_id: str
    gpu_id: int
    state: Literal["starting", "running", "idle", "stopped"]
    model: str | None
    url: str | None
    idle_since: float | None = None
    health_path: str = "/health"


class ServiceRegistry:
    """
    In-memory registry of service allocations and instance state.

    Allocations: per-caller request — many per service instance.
    Instances: per (service, node_id, gpu_id) — one per running container.
    """

    def __init__(self) -> None:
        self._allocations: dict[str, ServiceAllocation] = {}
        self._instances: dict[str, ServiceInstance] = {}  # key: "service:node_id:gpu_id"

    # ── allocation API ────────────────────────────────────────────────

    def allocate(
        self,
        service: str,
        node_id: str,
        gpu_id: int,
        model: str | None,
        url: str,
        caller: str,
        ttl_s: float,
    ) -> ServiceAllocation:
        """Record a caller's claim; revives an idle/stopped instance to 'running'."""
        now = time.time()
        record = ServiceAllocation(
            allocation_id=str(uuid.uuid4()),
            service=service,
            node_id=node_id,
            gpu_id=gpu_id,
            model=model,
            caller=caller,
            url=url,
            created_at=now,
            expires_at=now + ttl_s if ttl_s > 0 else 0.0,
        )
        self._allocations[record.allocation_id] = record

        # A fresh claim wakes a dormant instance back to 'running'.
        key = f"{service}:{node_id}:{gpu_id}"
        instance = self._instances.get(key)
        if instance is not None and instance.state in ("idle", "stopped"):
            self._instances[key] = dataclasses.replace(
                instance, state="running", idle_since=None
            )
        return record

    def release(self, allocation_id: str) -> bool:
        """Drop a claim; the instance goes 'idle' once its last claim is gone.

        Returns False for an unknown allocation_id.
        """
        record = self._allocations.pop(allocation_id, None)
        if record is None:
            return False
        key = f"{record.service}:{record.node_id}:{record.gpu_id}"
        still_claimed = self.active_allocations(record.service, record.node_id, record.gpu_id)
        if still_claimed == 0 and key in self._instances:
            self._instances[key] = dataclasses.replace(
                self._instances[key], state="idle", idle_since=time.time()
            )
        return True

    def active_allocations(self, service: str, node_id: str, gpu_id: int) -> int:
        """Count live claims against one (service, node, gpu) instance."""
        target = (service, node_id, gpu_id)
        return sum(
            (a.service, a.node_id, a.gpu_id) == target
            for a in self._allocations.values()
        )

    # ── instance API ─────────────────────────────────────────────────

    def upsert_instance(
        self,
        service: str,
        node_id: str,
        gpu_id: int,
        state: Literal["starting", "running", "idle", "stopped"],
        model: str | None,
        url: str | None,
        health_path: str = "/health",
    ) -> ServiceInstance:
        """Create or replace an instance record, tracking when it went idle."""
        key = f"{service}:{node_id}:{gpu_id}"
        previous = self._instances.get(key)
        if state != "idle":
            went_idle: float | None = None
        elif previous is not None and previous.state == "idle":
            went_idle = previous.idle_since  # already idle — keep the original mark
        else:
            went_idle = time.time()  # transitioning into idle now
        instance = ServiceInstance(
            service=service, node_id=node_id, gpu_id=gpu_id,
            state=state, model=model, url=url, idle_since=went_idle,
            health_path=health_path,
        )
        self._instances[key] = instance
        return instance

    def get_allocation(self, allocation_id: str) -> ServiceAllocation | None:
        """Look up one claim by id, or None."""
        return self._allocations.get(allocation_id)

    def sweep_expired_allocations(self) -> list[str]:
        """
        Remove all allocations whose TTL has elapsed and transition the
        corresponding instance to 'idle' if no active allocations remain.
        Returns the list of expired allocation_ids.
        """
        now = time.time()
        stale = [
            alloc_id
            for alloc_id, record in self._allocations.items()
            if 0 < record.expires_at < now
        ]
        for alloc_id in stale:
            self.release(alloc_id)
        return stale

    def all_allocations(self) -> list[ServiceAllocation]:
        """Snapshot of every live claim."""
        return list(self._allocations.values())

    def all_instances(self) -> list[ServiceInstance]:
        """Snapshot of every known instance record."""
        return list(self._instances.values())

    def mark_stopped(self, service: str, node_id: str, gpu_id: int) -> None:
        """Transition an instance to 'stopped' state and clear idle_since."""
        key = f"{service}:{node_id}:{gpu_id}"
        instance = self._instances.get(key)
        if instance is not None:
            self._instances[key] = dataclasses.replace(
                instance, state="stopped", idle_since=None
            )

    def idle_past_timeout(self, idle_stop_config: dict[str, int]) -> list[ServiceInstance]:
        """
        Return instances in 'idle' state whose idle time exceeds their configured timeout.
        idle_stop_config: {service_name: seconds} — 0 means never stop automatically.
        """
        now = time.time()
        overdue: list[ServiceInstance] = []
        for instance in self._instances.values():
            if instance.state != "idle" or instance.idle_since is None:
                continue
            limit = idle_stop_config.get(instance.service, 0)
            if limit > 0 and now - instance.idle_since >= limit:
                overdue.append(instance)
        return overdue
|
||||
|
|
@ -1,250 +0,0 @@
|
|||
"""
|
||||
cf-docuvision — managed document understanding service.
|
||||
|
||||
Wraps ByteDance/Dolphin-v2 (Qwen2.5-VL backbone) behind a simple HTTP API.
|
||||
Managed by cf-orch; started/stopped as a ProcessSpec service.
|
||||
|
||||
API
|
||||
---
|
||||
GET /health → {"status": "ok", "model": "<path>"}
|
||||
POST /extract → ExtractResponse
|
||||
|
||||
Usage (standalone)::
|
||||
|
||||
python -m circuitforge_core.resources.docuvision.app \\
|
||||
--model /Library/Assets/LLM/docuvision/models/dolphin-v2 \\
|
||||
--port 8003 --gpu-id 0
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Module-level state — populated by _load_model() on first /extract call
|
||||
_model: Any = None
|
||||
_processor: Any = None
|
||||
_model_path: str = ""
|
||||
_device: str = "cpu"
|
||||
|
||||
|
||||
# ── lazy loader ───────────────────────────────────────────────────────────────
|
||||
|
||||
def _load_model() -> None:
    """Lazy-load Dolphin-v2. Called once on first /extract request."""
    global _model, _processor, _device

    # Idempotent: later calls are no-ops once the model is resident.
    if _model is not None:
        return

    # Deferred imports keep server startup cheap — /health can respond
    # before the heavy ML stack is ever imported.
    import torch
    from transformers import AutoProcessor, AutoModelForCausalLM

    logger.info("Loading Dolphin-v2 from %s ...", _model_path)
    _device = "cuda" if torch.cuda.is_available() else "cpu"

    # trust_remote_code=True is required for Dolphin's custom model classes;
    # only use with a vetted local model directory.
    _processor = AutoProcessor.from_pretrained(
        _model_path,
        trust_remote_code=True,
    )
    # fp16 on GPU halves VRAM use; CPU inference requires fp32.
    _model = AutoModelForCausalLM.from_pretrained(
        _model_path,
        trust_remote_code=True,
        torch_dtype=torch.float16 if _device == "cuda" else torch.float32,
        device_map=_device,
    )
    _model.eval()
    logger.info("Dolphin-v2 loaded on %s", _device)
|
||||
|
||||
|
||||
# ── FastAPI app ───────────────────────────────────────────────────────────────
|
||||
|
||||
@asynccontextmanager
async def _lifespan(app: FastAPI):
    # No startup/shutdown work: the model is lazy-loaded on the first
    # /extract request (see _load_model), not at server start.
    yield


app = FastAPI(title="cf-docuvision", lifespan=_lifespan)
|
||||
|
||||
|
||||
# ── request / response models ─────────────────────────────────────────────────
|
||||
|
||||
class ExtractRequest(BaseModel):
    """
    Either image_b64 (base64-encoded bytes) or image_path (absolute path) must
    be provided. hint guides the extraction mode:
    - "auto" - Dolphin-v2 detects layout and element types automatically
    - "table" - optimise for tabular data (receipts, invoices, forms)
    - "text" - optimise for dense prose (contracts, letters)
    - "form" - optimise for form field extraction
    """
    image_b64: str | None = None   # base64-encoded image bytes; checked first by _image_from_request
    image_path: str | None = None  # path readable by the server process; 404 if missing
    hint: str = "auto"             # unknown hints fall back to "auto" in extract()
|
||||
|
||||
|
||||
class ElementOut(BaseModel):
    """One layout element detected in the document."""
    type: str  # heading | paragraph | list | table | figure | formula | code
    text: str
    bbox: list[float] | None = None  # [x0, y0, x1, y1] normalised 0-1 if available


class TableOut(BaseModel):
    """One extracted table rendered as HTML markup."""
    html: str
    bbox: list[float] | None = None


class ExtractResponse(BaseModel):
    """Full extraction result for a single image."""
    elements: list[ElementOut]   # all elements in model output order (tables included)
    raw_text: str                # newline-joined text of all elements
    tables: list[TableOut]       # table elements additionally rendered as HTML
    metadata: dict[str, Any]     # hint, image width/height, model path, device
|
||||
|
||||
|
||||
# ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# Maps ExtractRequest.hint values to Dolphin-v2 task prompts. Unknown hints
# fall back to the "auto" entry at the call site (see extract()).
_HINT_PROMPTS: dict[str, str] = {
    "auto": "Parse this document. Extract all elements with their types and text content.",
    "table": "Extract all tables from this document as structured HTML. Also extract any line-item text.",
    "text": "Extract all text from this document preserving paragraph and heading structure.",
    "form": "Extract all form fields from this document. Return field labels and their values.",
}
|
||||
|
||||
|
||||
def _image_from_request(req: ExtractRequest):
    """Return a PIL Image from either image_b64 or image_path.

    image_b64 wins when both are supplied; raises 404 for a missing path
    and 422 when neither field is present.
    """
    from PIL import Image

    if req.image_b64:
        decoded = base64.b64decode(req.image_b64)
        return Image.open(io.BytesIO(decoded)).convert("RGB")

    if req.image_path:
        from pathlib import Path

        source = Path(req.image_path)
        if not source.exists():
            raise HTTPException(status_code=404, detail=f"image_path not found: {req.image_path}")
        return Image.open(source).convert("RGB")

    raise HTTPException(status_code=422, detail="Either image_b64 or image_path must be provided")
|
||||
|
||||
|
||||
def _parse_dolphin_output(raw: str) -> tuple[list[ElementOut], list[TableOut], str]:
    """
    Parse Dolphin-v2's structured output into elements and tables.

    Dolphin-v2 returns a JSON array of element dicts with keys:
        type, text, [html], [bbox]

    Falls back gracefully if the model returns plain text instead.
    Returns (elements, tables, raw_text) where raw_text is the
    newline-joined text of all elements.
    """
    elements: list[ElementOut] = []
    tables: list[TableOut] = []

    # Try JSON parse first
    try:
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            for item in parsed:
                # Robustness fix: the array may contain bare strings or other
                # non-dict entries; previously `item.get(...)` raised
                # AttributeError and crashed the endpoint. Treat such
                # entries as plain paragraphs instead.
                if not isinstance(item, dict):
                    elements.append(ElementOut(type="paragraph", text=str(item)))
                    continue
                etype = item.get("type", "paragraph")
                text = item.get("text", "")
                bbox = item.get("bbox")
                if etype == "table":
                    # Tables appear both in `tables` (as HTML) and in the
                    # element stream (as text) so reading order is preserved.
                    tables.append(TableOut(html=item.get("html", text), bbox=bbox))
                elements.append(ElementOut(type=etype, text=text, bbox=bbox))
            raw_text = "\n".join(e.text for e in elements)
            return elements, tables, raw_text
    except (json.JSONDecodeError, TypeError):
        pass

    # Plain-text fallback: treat entire output as a single paragraph
    elements = [ElementOut(type="paragraph", text=raw.strip())]
    return elements, tables, raw.strip()
|
||||
|
||||
|
||||
# ── routes ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@app.get("/health")
|
||||
async def health() -> dict[str, str]:
|
||||
return {"status": "ok", "model": _model_path}
|
||||
|
||||
|
||||
@app.post("/extract", response_model=ExtractResponse)
|
||||
async def extract(req: ExtractRequest) -> ExtractResponse:
|
||||
_load_model()
|
||||
|
||||
image = _image_from_request(req)
|
||||
prompt = _HINT_PROMPTS.get(req.hint, _HINT_PROMPTS["auto"])
|
||||
|
||||
import torch
|
||||
|
||||
inputs = _processor(
|
||||
text=prompt,
|
||||
images=image,
|
||||
return_tensors="pt",
|
||||
).to(_device)
|
||||
|
||||
with torch.no_grad():
|
||||
output_ids = _model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=2048,
|
||||
do_sample=False,
|
||||
)
|
||||
|
||||
# Decode only the newly generated tokens
|
||||
input_len = inputs["input_ids"].shape[1]
|
||||
raw_output = _processor.decode(
|
||||
output_ids[0][input_len:],
|
||||
skip_special_tokens=True,
|
||||
)
|
||||
|
||||
elements, tables, raw_text = _parse_dolphin_output(raw_output)
|
||||
|
||||
w, h = image.size
|
||||
|
||||
return ExtractResponse(
|
||||
elements=elements,
|
||||
raw_text=raw_text,
|
||||
tables=tables,
|
||||
metadata={
|
||||
"hint": req.hint,
|
||||
"width": w,
|
||||
"height": h,
|
||||
"model": _model_path,
|
||||
"device": _device,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# ── CLI entry point ───────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """CLI entry point for running cf-docuvision standalone."""
    parser = argparse.ArgumentParser(description="cf-docuvision service")
    parser.add_argument("--model", required=True, help="Path to Dolphin-v2 model directory")
    parser.add_argument("--port", type=int, default=8003)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0)
    args = parser.parse_args()

    global _model_path
    _model_path = args.model

    import os
    # setdefault: an externally supplied CUDA_VISIBLE_DEVICES wins over --gpu-id.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
"""Generic OpenAI-compatible inference server for HuggingFace causal LMs."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
_model: Any = None
|
||||
_tokenizer: Any = None
|
||||
_model_id: str = ""
|
||||
_device: str = "cpu"
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    # No startup/shutdown hooks: the model is loaded synchronously in
    # main() before uvicorn starts serving.
    yield


app = FastAPI(lifespan=lifespan)
|
||||
|
||||
|
||||
class Message(BaseModel):
    """One chat turn in OpenAI wire format."""
    role: str     # e.g. "system" / "user" / "assistant" — passed through to the chat template
    content: str


class ChatRequest(BaseModel):
    """Subset of the OpenAI /v1/chat/completions request schema."""
    model: str | None = None        # accepted but never read — this server hosts a single model
    messages: list[Message]
    max_tokens: int | None = 512
    temperature: float | None = 0.7  # 0 selects greedy decoding downstream
    stream: bool | None = False      # streaming requests are rejected with 501
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
def health() -> dict[str, str]:
|
||||
return {"status": "ok", "model": _model_id}
|
||||
|
||||
|
||||
@app.get("/v1/models")
|
||||
def list_models() -> dict[str, Any]:
|
||||
return {
|
||||
"object": "list",
|
||||
"data": [{"id": _model_id, "object": "model", "owned_by": "cf-orch"}],
|
||||
}
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
|
||||
def chat_completions(req: ChatRequest) -> dict[str, Any]:
|
||||
if _model is None:
|
||||
raise HTTPException(503, detail="Model not loaded")
|
||||
if req.stream:
|
||||
raise HTTPException(501, detail="Streaming not supported")
|
||||
|
||||
conversation = [{"role": m.role, "content": m.content} for m in req.messages]
|
||||
try:
|
||||
encoded = _tokenizer.apply_chat_template(
|
||||
conversation,
|
||||
return_tensors="pt",
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
# transformers 5.x returns BatchEncoding; 4.x returned a bare tensor
|
||||
input_ids = (encoded.input_ids if hasattr(encoded, "input_ids") else encoded).to(_device)
|
||||
except Exception as exc:
|
||||
raise HTTPException(500, detail=f"Tokenisation failed: {exc}")
|
||||
|
||||
max_new = req.max_tokens or 512
|
||||
temp = req.temperature if req.temperature is not None else 0.7
|
||||
gen_kwargs: dict[str, Any] = {
|
||||
"max_new_tokens": max_new,
|
||||
"do_sample": temp > 0,
|
||||
"pad_token_id": _tokenizer.eos_token_id,
|
||||
}
|
||||
if temp > 0:
|
||||
gen_kwargs["temperature"] = temp
|
||||
|
||||
with torch.inference_mode():
|
||||
output_ids = _model.generate(input_ids, **gen_kwargs)
|
||||
|
||||
new_tokens = output_ids[0][input_ids.shape[-1]:]
|
||||
reply = _tokenizer.decode(new_tokens, skip_special_tokens=True)
|
||||
|
||||
return {
|
||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
||||
"object": "chat.completion",
|
||||
"created": int(time.time()),
|
||||
"model": _model_id,
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {"role": "assistant", "content": reply},
|
||||
"finish_reason": "stop",
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": input_ids.shape[-1],
|
||||
"completion_tokens": len(new_tokens),
|
||||
"total_tokens": input_ids.shape[-1] + len(new_tokens),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _load_model(model_path: str, gpu_id: int) -> None:
    """Load tokenizer + model into module globals before serving.

    Called once from main(). Pins the model to cuda:<gpu_id> when CUDA is
    available, otherwise falls back to CPU with fp32 weights.
    """
    global _model, _tokenizer, _model_id, _device
    _device = f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu"
    _model_id = model_path
    _tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # fp16 halves memory on GPU; CPU kernels need fp32.
    _model = AutoModelForCausalLM.from_pretrained(
        model_path,
        dtype=torch.float16 if "cuda" in _device else torch.float32,
        device_map={"": _device},
        trust_remote_code=True,
    )
    _model.eval()
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: load the model, then serve the OpenAI-compatible API."""
    parser = argparse.ArgumentParser(description="cf-orch generic LLM inference server")
    parser.add_argument("--model", required=True)
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0)
    args = parser.parse_args()
    # Load synchronously before binding the port, so a passing /health
    # implies the model is ready.
    _load_model(args.model, args.gpu_id)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,66 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VRAMLease:
    """One granted, immutable slice of a GPU's VRAM held by a service."""
    lease_id: str
    gpu_id: int
    node_id: str
    mb_granted: int
    holder_service: str
    priority: int
    expires_at: float  # unix timestamp; 0.0 = no expiry

    @classmethod
    def create(
        cls,
        gpu_id: int,
        node_id: str,
        mb: int,
        service: str,
        priority: int,
        ttl_s: float = 0.0,
    ) -> VRAMLease:
        """Build a lease with a fresh UUID; ttl_s <= 0 means no expiry."""
        deadline = 0.0
        if ttl_s > 0.0:
            deadline = time.time() + ttl_s
        return cls(
            lease_id=str(uuid.uuid4()),
            gpu_id=gpu_id,
            node_id=node_id,
            mb_granted=mb,
            holder_service=service,
            priority=priority,
            expires_at=deadline,
        )

    def is_expired(self) -> bool:
        """True once a finite expiry deadline has passed."""
        if self.expires_at <= 0.0:
            return False
        return time.time() > self.expires_at
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class GpuInfo:
    """Point-in-time VRAM snapshot for one physical GPU on a node."""
    gpu_id: int        # GPU index local to its node
    name: str          # device name string
    vram_total_mb: int
    vram_used_mb: int
    # Reported as its own field rather than derived — presumably
    # total == used + free, but the producer is not visible here; confirm.
    vram_free_mb: int
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ResidentAllocation:
    """A model that is loaded and warm in VRAM but not actively serving a request."""
    service: str
    node_id: str
    model_name: Optional[str]  # None if service is running but model probe failed
    # When this (service, model) pair was first observed warm; the lease
    # manager preserves it across heartbeats while the model is unchanged.
    first_seen: float = field(default_factory=time.time)
|
||||
|
||||
|
||||
@dataclass
class NodeInfo:
    """Mutable record of one registered agent node and its GPUs."""
    node_id: str
    agent_url: str   # base URL used to reach the node's agent
    gpus: list[GpuInfo]
    last_heartbeat: float = field(default_factory=time.time)
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
schema_version: 1
|
||||
name: cpu-16gb
|
||||
eviction_timeout_s: 30.0
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 0
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-stt:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
backend: moonshine
|
||||
cf-tts:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
cf-embed:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
always_on: true
|
||||
cf-classify:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
always_on: true
|
||||
model_size_hints:
|
||||
llm_max_params: 3b-q4
|
||||
image_gen_max: none
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
schema_version: 1
|
||||
name: cpu-32gb
|
||||
eviction_timeout_s: 30.0
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 0
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-stt:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
backend: faster-whisper
|
||||
cf-tts:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
cf-embed:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
always_on: true
|
||||
cf-classify:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
always_on: true
|
||||
model_size_hints:
|
||||
llm_max_params: 7b-q4
|
||||
image_gen_max: none
|
||||
|
|
@ -1,73 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-16gb
|
||||
vram_total_mb: 16384
|
||||
eviction_timeout_s: 10.0
|
||||
services:
|
||||
vllm:
|
||||
max_mb: 9000
|
||||
priority: 1
|
||||
idle_stop_after_s: 600
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||
port: 8000
|
||||
host_port: 8000
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
ollama:
|
||||
max_mb: 12288
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 3072
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
cf-docuvision:
|
||||
max_mb: 6144
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 3
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
||||
port: 8003
|
||||
host_port: 8003
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
cf-stt:
|
||||
max_mb: 1200
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 3
|
||||
backend: parakeet-tdt
|
||||
cf-tts:
|
||||
max_mb: 1024
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 3
|
||||
cf-embed:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 6
|
||||
always_on: true
|
||||
cf-classify:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 6
|
||||
always_on: true
|
||||
comfyui:
|
||||
max_mb: 14336
|
||||
priority: 4
|
||||
model_size_hints:
|
||||
llm_max_params: 34b
|
||||
image_gen_max: flux-dev-fp8
|
||||
|
|
@ -1,73 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-24gb
|
||||
vram_total_mb: 24576
|
||||
eviction_timeout_s: 10.0
|
||||
services:
|
||||
vllm:
|
||||
max_mb: 9000
|
||||
priority: 1
|
||||
idle_stop_after_s: 600
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||
port: 8000
|
||||
host_port: 8000
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
ollama:
|
||||
max_mb: 18432
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 4096
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 6
|
||||
cf-docuvision:
|
||||
max_mb: 8192
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
||||
port: 8003
|
||||
host_port: 8003
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
cf-stt:
|
||||
max_mb: 1200
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
backend: parakeet-tdt
|
||||
cf-tts:
|
||||
max_mb: 1024
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
cf-embed:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 8
|
||||
always_on: true
|
||||
cf-classify:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 8
|
||||
always_on: true
|
||||
comfyui:
|
||||
max_mb: 20480
|
||||
priority: 4
|
||||
model_size_hints:
|
||||
llm_max_params: 70b
|
||||
image_gen_max: flux-dev-fp16
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-2gb
|
||||
vram_total_mb: 2048
|
||||
eviction_timeout_s: 15.0
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 1536
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
cf-stt:
|
||||
max_mb: 200
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
backend: moonshine
|
||||
model_size_hints:
|
||||
llm_max_params: 3b
|
||||
image_gen_max: none
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-4gb
|
||||
vram_total_mb: 4096
|
||||
eviction_timeout_s: 15.0
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 3072
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 1024
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
cf-stt:
|
||||
max_mb: 600
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
backend: faster-whisper
|
||||
cf-tts:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
comfyui:
|
||||
max_mb: 3584
|
||||
priority: 4
|
||||
model_size_hints:
|
||||
llm_max_params: 3b
|
||||
image_gen_max: sd15-fp8
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-6gb
|
||||
vram_total_mb: 6144
|
||||
eviction_timeout_s: 10.0
|
||||
services:
|
||||
vllm:
|
||||
max_mb: 5500
|
||||
priority: 1
|
||||
idle_stop_after_s: 600
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||
port: 8000
|
||||
host_port: 8000
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
ollama:
|
||||
max_mb: 3584
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 1536
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
cf-docuvision:
|
||||
max_mb: 3072
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
||||
port: 8003
|
||||
host_port: 8003
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
cf-stt:
|
||||
max_mb: 600
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
backend: faster-whisper
|
||||
cf-tts:
|
||||
max_mb: 768
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
comfyui:
|
||||
max_mb: 5120
|
||||
priority: 4
|
||||
model_size_hints:
|
||||
llm_max_params: 7b
|
||||
image_gen_max: sd15
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-8gb
|
||||
vram_total_mb: 8192
|
||||
eviction_timeout_s: 10.0
|
||||
services:
|
||||
vllm:
|
||||
max_mb: 6500
|
||||
priority: 1
|
||||
idle_stop_after_s: 600
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||
port: 8000
|
||||
host_port: 8000
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
ollama:
|
||||
max_mb: 4096
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 2048
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 3
|
||||
cf-docuvision:
|
||||
max_mb: 4096
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
||||
port: 8003
|
||||
host_port: 8003
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
cf-stt:
|
||||
max_mb: 1200
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
backend: parakeet-tdt
|
||||
cf-tts:
|
||||
max_mb: 1024
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
comfyui:
|
||||
max_mb: 6144
|
||||
priority: 4
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/opt/miniconda3/envs/comfyui/bin/python"
|
||||
args_template: "/opt/ComfyUI/main.py --listen 0.0.0.0 --port {port} --cuda-device {gpu_id}"
|
||||
cwd: "/opt/ComfyUI"
|
||||
port: 8188
|
||||
host_port: 8188
|
||||
model_size_hints:
|
||||
llm_max_params: 8b
|
||||
image_gen_max: sdxl-fp8
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
# circuitforge_core/resources/profiles/schema.py
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
SUPPORTED_SCHEMA_VERSION = 1
|
||||
|
||||
|
||||
class DockerSpec(BaseModel):
    """Spec for a Docker-managed service."""

    image: str                  # container image reference
    port: int                   # port the service listens on inside the container
    host_port: int              # port published on the host
    command_template: str = ""  # optional container command; may contain placeholders
    volumes: list[str] = Field(default_factory=list)  # volume mount specs
    env: dict[str, str] = Field(default_factory=dict)  # extra environment variables
    runtime: str = "nvidia"     # container runtime (GPU passthrough by default)
    ipc: str = "host"           # IPC namespace mode

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
class ProcessSpec(BaseModel):
    """Spec for a process-managed service (non-Docker, e.g. conda env)."""

    exec_path: str           # path to the executable to launch
    args_template: str = ""  # argument string; may contain placeholders like {port}, {gpu_id}
    cwd: str = ""            # working directory ("" = inherit)
    env: dict[str, str] = Field(default_factory=dict)  # extra environment variables
    port: int = 0            # service port (0 = none)
    host_port: int = 0       # host-visible port (0 = none)
    # adopt=True: if the service is already listening on host_port, claim it rather
    # than spawning a new process (useful for system daemons like Ollama).
    adopt: bool = False
    # Override the health probe path; defaults to /health (Ollama uses /api/tags).
    health_path: str = "/health"

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
class ServiceProfile(BaseModel):
    """Per-service resource limits plus (optionally) how the service is launched."""

    max_mb: int                 # VRAM budget in MB (profiles use 0 for CPU-only entries)
    priority: int               # scheduling/eviction priority
    shared: bool = False        # True if one instance serves multiple callers
    max_concurrent: int = 1     # concurrent requests allowed on a shared instance
    always_on: bool = False     # service is kept running permanently
    idle_stop_after_s: int = 0  # stop after this many idle seconds (0 = never)
    backend: str | None = None  # backend implementation hint (e.g. STT engine name)
    consumers: list[str] = Field(default_factory=list)  # downstream service names
    managed: DockerSpec | ProcessSpec | None = None  # launch spec; None = externally managed

    model_config = {"frozen": True}

    @model_validator(mode="before")
    @classmethod
    def _parse_managed(cls, values: Any) -> Any:
        """Coerce a raw ``managed`` mapping into DockerSpec/ProcessSpec via its ``type`` key.

        Non-dict inputs and already-parsed specs pass through untouched.
        Raises ValueError for an unknown ``type``. Returns a shallow copy of
        ``values`` so the caller's raw mapping is never mutated as a side
        effect of validation (the original implementation wrote into it).
        """
        if not isinstance(values, dict):
            return values
        raw = values.get("managed")
        if raw is None:
            return values
        if not isinstance(raw, dict):
            return values
        spec_type = raw.get("type")
        # The "type" discriminator is consumed here; the rest are model fields.
        managed_fields = {k: v for k, v in raw.items() if k != "type"}
        if spec_type == "docker":
            spec: DockerSpec | ProcessSpec = DockerSpec(**managed_fields)
        elif spec_type == "process":
            spec = ProcessSpec(**managed_fields)
        else:
            raise ValueError(f"Unknown managed service type: {spec_type!r}")
        # Copy-on-write: leave the input dict untouched for the caller.
        return {**values, "managed": spec}
|
||||
|
||||
|
||||
class GpuNodeEntry(BaseModel):
    """One GPU within a node's hardware description."""

    id: int                  # GPU index on the node
    vram_mb: int             # VRAM capacity in MB
    role: str                # role label — semantics defined by the profile; TODO confirm allowed values
    card: str = "unknown"    # card model name
    always_on: bool = False  # presumably marks a GPU hosting always-on services — verify against coordinator
    services: list[str] = Field(default_factory=list)  # service names assigned to this GPU

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
class NodeProfile(BaseModel):
    """Static description of one node in a multi-node profile."""

    gpus: list[GpuNodeEntry]      # GPUs present on the node
    agent_url: str | None = None  # base URL of the node's agent, if known
    nas_mount: str | None = None  # shared storage mount point, if any

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
class GpuProfile(BaseModel):
    """Top-level hardware/service profile loaded from a YAML file."""

    schema_version: int  # must equal SUPPORTED_SCHEMA_VERSION (checked in load_profile)
    name: str            # profile name, e.g. "single-gpu-8gb"
    vram_total_mb: int | None = None  # total VRAM in MB; None for CPU-only profiles
    eviction_timeout_s: float = 10.0  # seconds related to eviction — presumably a grace period; confirm with eviction engine
    services: dict[str, ServiceProfile] = Field(default_factory=dict)  # per-service limits keyed by service name
    model_size_hints: dict[str, str] = Field(default_factory=dict)  # e.g. llm_max_params, image_gen_max
    nodes: dict[str, NodeProfile] = Field(default_factory=dict)  # multi-node topology (empty for single-node)

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
def load_profile(path: Path) -> GpuProfile:
    """Parse and validate a profile YAML file into a GpuProfile.

    Raises ValueError when the file is not a YAML mapping or declares a
    schema_version other than SUPPORTED_SCHEMA_VERSION.
    """
    parsed: dict[str, Any] = yaml.safe_load(path.read_text())
    if not isinstance(parsed, dict):
        raise ValueError(f"Profile file {path} must be a YAML mapping, got {type(parsed).__name__}")
    found = parsed.get("schema_version")
    if found != SUPPORTED_SCHEMA_VERSION:
        msg = (
            f"Unsupported schema_version {found!r} in {path}. "
            f"Expected {SUPPORTED_SCHEMA_VERSION}."
        )
        raise ValueError(msg)
    return GpuProfile.model_validate(parsed)
|
||||
|
|
@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
|||
|
||||
[project]
|
||||
name = "circuitforge-core"
|
||||
version = "0.7.0"
|
||||
description = "Shared scaffold for CircuitForge products"
|
||||
version = "0.8.0"
|
||||
description = "Shared scaffold for CircuitForge products (MIT)"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"pyyaml>=6.0",
|
||||
|
|
@ -14,32 +14,17 @@ dependencies = [
|
|||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
orch = [
|
||||
"fastapi>=0.110",
|
||||
"uvicorn[standard]>=0.29",
|
||||
"httpx>=0.27",
|
||||
"pydantic>=2.0",
|
||||
"typer[all]>=0.12",
|
||||
"psutil>=5.9",
|
||||
]
|
||||
tasks = [
|
||||
"httpx>=0.27",
|
||||
]
|
||||
manage = [
|
||||
"platformdirs>=4.0",
|
||||
"typer[all]>=0.12",
|
||||
]
|
||||
dev = [
|
||||
"circuitforge-core[orch]",
|
||||
"circuitforge-core[tasks]",
|
||||
"circuitforge-core[manage]",
|
||||
"pytest>=8.0",
|
||||
"pytest-asyncio>=0.23",
|
||||
"httpx>=0.27",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
cf-orch = "circuitforge_core.resources.cli:app"
|
||||
cf-manage = "circuitforge_core.manage.cli:app"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
|
|
|
|||
|
|
@ -1,68 +0,0 @@
|
|||
from __future__ import annotations

import pytest
from unittest.mock import MagicMock
from fastapi.testclient import TestClient

from circuitforge_core.resources.agent.app import create_agent_app
from circuitforge_core.resources.models import GpuInfo
from circuitforge_core.resources.agent.eviction_executor import EvictionResult

# Canned single-GPU snapshot returned by the mocked monitor in every test.
MOCK_GPUS = [
    GpuInfo(
        gpu_id=0,
        name="RTX 4000",
        vram_total_mb=8192,
        vram_used_mb=1024,
        vram_free_mb=7168,
    ),
]


@pytest.fixture
def agent_client():
    """Agent app with mocked monitor/executor; returns (client, monitor, executor)."""
    mock_monitor = MagicMock()
    mock_monitor.poll.return_value = MOCK_GPUS
    mock_executor = MagicMock()
    app = create_agent_app(
        node_id="heimdall",
        monitor=mock_monitor,
        executor=mock_executor,
    )
    return TestClient(app), mock_monitor, mock_executor


def test_health_returns_ok(agent_client):
    """GET /health reports ok plus the configured node_id."""
    client, _, _ = agent_client
    resp = client.get("/health")
    assert resp.status_code == 200
    assert resp.json()["status"] == "ok"
    assert resp.json()["node_id"] == "heimdall"


def test_gpu_info_returns_gpu_list(agent_client):
    """GET /gpu-info serialises the monitor's GpuInfo snapshot."""
    client, _, _ = agent_client
    resp = client.get("/gpu-info")
    assert resp.status_code == 200
    data = resp.json()
    assert len(data["gpus"]) == 1
    assert data["gpus"][0]["gpu_id"] == 0
    assert data["gpus"][0]["name"] == "RTX 4000"
    assert data["gpus"][0]["vram_free_mb"] == 7168


def test_evict_calls_executor(agent_client):
    """POST /evict forwards pid and grace period to the eviction executor."""
    client, _, mock_executor = agent_client
    mock_executor.evict_pid.return_value = EvictionResult(
        success=True, method="sigterm", message="done"
    )
    resp = client.post("/evict", json={"pid": 1234, "grace_period_s": 5.0})
    assert resp.status_code == 200
    assert resp.json()["success"] is True
    mock_executor.evict_pid.assert_called_once_with(pid=1234, grace_period_s=5.0)


def test_evict_requires_pid(agent_client):
    """POST /evict without a pid is rejected with a 422 validation error."""
    client, _, _ = agent_client
    resp = client.post("/evict", json={"grace_period_s": 5.0})
    assert resp.status_code == 422
|
||||
|
|
@ -1,93 +0,0 @@
|
|||
# Tests for AgentSupervisor idle-stop behaviour: building the per-service
# idle-timeout config from profiles, and the sweep that posts /stop to agents.
import asyncio
import time
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry, ServiceInstance


def test_build_idle_stop_config_empty_without_registry():
    """Without a profile registry there are no idle-stop timeouts."""
    lm = LeaseManager()
    supervisor = AgentSupervisor(lease_manager=lm)
    assert supervisor._build_idle_stop_config() == {}


def test_build_idle_stop_config_from_profiles():
    """idle_stop_after_s values from profile services end up in the config map."""
    lm = LeaseManager()
    mock_svc = MagicMock()
    mock_svc.idle_stop_after_s = 600
    mock_profile = MagicMock()
    mock_profile.services = {"vllm": mock_svc}
    mock_profile_registry = MagicMock()
    mock_profile_registry.list_public.return_value = [mock_profile]

    supervisor = AgentSupervisor(lease_manager=lm, profile_registry=mock_profile_registry)
    config = supervisor._build_idle_stop_config()
    assert config == {"vllm": 600}


@pytest.mark.asyncio
async def test_run_idle_sweep_posts_stop():
    """A sweep posts /services/<name>/stop for an instance idle past its timeout."""
    lm = LeaseManager()
    service_registry = ServiceRegistry()

    # Upsert instance as running, then allocate + release to transition it to idle
    service_registry.upsert_instance(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        state="running",
        model="test-model",
        url="http://heimdall:8000",
    )
    alloc = service_registry.allocate(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        model="test-model",
        url="http://heimdall:8000",
        caller="test",
        ttl_s=300.0,
    )
    service_registry.release(alloc.allocation_id)

    # Backdate idle_since so it exceeds the timeout
    import dataclasses
    key = "vllm:heimdall:0"
    inst = service_registry._instances[key]
    service_registry._instances[key] = dataclasses.replace(inst, idle_since=time.time() - 700)

    # Profile registry advertises a 600 s idle-stop for vllm.
    mock_profile_registry = MagicMock()
    mock_svc = MagicMock()
    mock_svc.idle_stop_after_s = 600
    mock_profile = MagicMock()
    mock_profile.services = {"vllm": mock_svc}
    mock_profile_registry.list_public.return_value = [mock_profile]

    supervisor = AgentSupervisor(
        lease_manager=lm,
        service_registry=service_registry,
        profile_registry=mock_profile_registry,
    )
    supervisor.register("heimdall", "http://heimdall:7701")

    # Capture outgoing POSTs instead of performing HTTP.
    posted_urls = []

    async def fake_http_post(url: str) -> bool:
        posted_urls.append(url)
        return True

    supervisor._http_post = fake_http_post
    await supervisor._run_idle_sweep()

    assert len(posted_urls) == 1
    assert posted_urls[0] == "http://heimdall:7701/services/vllm/stop"


@pytest.mark.asyncio
async def test_run_idle_sweep_skips_without_registry():
    """Sweep is a no-op when no service registry is configured."""
    lm = LeaseManager()
    supervisor = AgentSupervisor(lease_manager=lm)
    # Should return immediately without error
    await supervisor._run_idle_sweep()
|
||||
|
|
@ -1,151 +0,0 @@
|
|||
# tests/test_resources/test_agent_watchdog.py
"""
Tests for AgentSupervisor watchdog behaviour:
- restore_from_store() reloads known nodes from NodeStore on startup
- register() persists to NodeStore
- restored nodes start offline and come online after a successful poll
- NodeStore=None path is a no-op (backwards compatibility)
"""
from __future__ import annotations

from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.node_store import NodeStore


# ── fixtures ──────────────────────────────────────────────────────────────────

@pytest.fixture
def store(tmp_path: Path) -> NodeStore:
    """Fresh NodeStore backed by a database file in a temp directory."""
    return NodeStore(db_path=tmp_path / "nodes.db")


@pytest.fixture
def supervisor(store: NodeStore) -> AgentSupervisor:
    """Supervisor wired to the temp NodeStore."""
    return AgentSupervisor(lease_manager=LeaseManager(), node_store=store)


@pytest.fixture
def supervisor_no_store() -> AgentSupervisor:
    """Supervisor with persistence disabled (node_store=None)."""
    return AgentSupervisor(lease_manager=LeaseManager(), node_store=None)


# ── register() persists ───────────────────────────────────────────────────────

def test_register_persists_to_store(supervisor: AgentSupervisor, store: NodeStore) -> None:
    """register() writes the (node_id, url) row through to the store."""
    supervisor.register("heimdall", "http://127.0.0.1:7701")
    rows = store.all()
    assert len(rows) == 1
    assert rows[0] == ("heimdall", "http://127.0.0.1:7701")


def test_register_updates_url_in_store(supervisor: AgentSupervisor, store: NodeStore) -> None:
    """Re-registering the same node replaces its URL rather than adding a row."""
    supervisor.register("navi", "http://10.1.10.10:7701")
    supervisor.register("navi", "http://10.1.10.10:9999")
    rows = store.all()
    assert len(rows) == 1
    assert rows[0][1] == "http://10.1.10.10:9999"


def test_register_without_store_does_not_crash(supervisor_no_store: AgentSupervisor) -> None:
    """With node_store=None, register() still tracks the node in memory."""
    supervisor_no_store.register("heimdall", "http://127.0.0.1:7701")
    assert supervisor_no_store.get_node_info("heimdall") is not None


# ── restore_from_store() ──────────────────────────────────────────────────────

def test_restore_loads_known_nodes(tmp_path: Path) -> None:
    """Nodes written by a previous supervisor session are restored into a fresh one."""
    db = tmp_path / "nodes.db"

    # Session 1: register two nodes
    s1 = NodeStore(db_path=db)
    sup1 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s1)
    sup1.register("navi", "http://10.1.10.10:7701")
    sup1.register("strahl", "http://10.1.10.20:7701")

    # Session 2: fresh supervisor, same DB
    s2 = NodeStore(db_path=db)
    sup2 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s2)
    restored = sup2.restore_from_store()

    assert restored == 2
    assert sup2.get_node_info("navi") is not None
    assert sup2.get_node_info("strahl") is not None


def test_restore_marks_nodes_offline(tmp_path: Path) -> None:
    """Restored nodes start offline — they haven't been polled yet."""
    db = tmp_path / "nodes.db"

    s1 = NodeStore(db_path=db)
    AgentSupervisor(lease_manager=LeaseManager(), node_store=s1).register(
        "navi", "http://10.1.10.10:7701"
    )

    s2 = NodeStore(db_path=db)
    sup2 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s2)
    sup2.restore_from_store()

    assert sup2.online_agents() == {}


def test_restore_returns_zero_without_store() -> None:
    """restore_from_store() is a counted no-op when persistence is disabled."""
    sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=None)
    assert sup.restore_from_store() == 0


def test_restore_skips_already_registered(tmp_path: Path) -> None:
    """Nodes manually registered before restore_from_store() are not duplicated."""
    db = tmp_path / "nodes.db"
    store = NodeStore(db_path=db)
    store.upsert("heimdall", "http://127.0.0.1:7701")

    sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
    sup.register("heimdall", "http://127.0.0.1:7701")  # already in memory
    restored = sup.restore_from_store()

    assert restored == 0  # already present, not double-counted


# ── restored node comes online after poll ─────────────────────────────────────

@pytest.mark.asyncio
async def test_restored_node_comes_online_after_poll(tmp_path: Path) -> None:
    """After restore, a successful poll_agent() brings the node online."""
    db = tmp_path / "nodes.db"
    store = NodeStore(db_path=db)
    store.upsert("navi", "http://10.1.10.10:7701")

    sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
    sup.restore_from_store()

    # Stub poll_agent to succeed
    gpu_payload = {"gpus": [{"gpu_id": 0, "name": "RTX 4000",
                             "vram_total_mb": 8192, "vram_used_mb": 512, "vram_free_mb": 7680}]}
    resident_payload = {"residents": []}

    # First GET returns GPU info, second returns the resident-model list.
    mock_resp_gpu = MagicMock()
    mock_resp_gpu.raise_for_status = MagicMock()
    mock_resp_gpu.json.return_value = gpu_payload

    mock_resp_res = MagicMock()
    mock_resp_res.is_success = True
    mock_resp_res.json.return_value = resident_payload

    mock_client = AsyncMock()
    mock_client.get = AsyncMock(side_effect=[mock_resp_gpu, mock_resp_res])
    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
    mock_client.__aexit__ = AsyncMock(return_value=False)

    with patch("circuitforge_core.resources.coordinator.agent_supervisor.httpx.AsyncClient",
               return_value=mock_client):
        result = await sup.poll_agent("navi")

    assert result is True
    assert "navi" in sup.online_agents()
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
from __future__ import annotations

from pathlib import Path
from unittest.mock import patch

from typer.testing import CliRunner

from circuitforge_core.resources.cli import app

# Shared Typer CLI runner for all tests in this module.
runner = CliRunner()


def test_cli_help():
    """--help exits cleanly and prints usage text."""
    result = runner.invoke(app, ["--help"])
    assert result.exit_code == 0
    assert "cf-orch" in result.output.lower() or "Usage" in result.output


def test_status_command_shows_no_coordinator_message():
    """status degrades gracefully when the coordinator is unreachable."""
    with patch("httpx.get", side_effect=ConnectionRefusedError("refused")):
        result = runner.invoke(app, ["status"])
        assert result.exit_code != 0 or "unreachable" in result.output.lower() \
            or "coordinator" in result.output.lower()


def test_install_service_creates_systemd_unit(tmp_path: Path):
    """install-service --dry-run mentions the systemd unit without writing it."""
    unit_path = tmp_path / "cf-orch.service"
    with patch(
        "circuitforge_core.resources.cli._SYSTEMD_UNIT_PATH", unit_path
    ):
        result = runner.invoke(app, ["install-service", "--dry-run"])
        assert result.exit_code == 0
        assert "cf-orch.service" in result.output or "systemd" in result.output.lower()
|
||||
|
|
@ -1,94 +0,0 @@
|
|||
import json
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import httpretty
|
||||
from circuitforge_core.resources.client import CFOrchClient, Allocation
|
||||
|
||||
_ALLOC_BODY = (
|
||||
'{"allocation_id":"abc123","service":"vllm","node_id":"heimdall",'
|
||||
'"gpu_id":0,"model":"Ouro-1.4B","url":"http://heimdall:8000","started":false,"warm":true}'
|
||||
)
|
||||
|
||||
|
||||
@httpretty.activate
|
||||
def test_sync_allocate_returns_allocation():
|
||||
httpretty.register_uri(
|
||||
httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
|
||||
body=_ALLOC_BODY, content_type="application/json",
|
||||
)
|
||||
httpretty.register_uri(
|
||||
httpretty.DELETE, "http://orch:7700/api/services/vllm/allocations/abc123",
|
||||
body='{"released":true}', content_type="application/json",
|
||||
)
|
||||
client = CFOrchClient("http://orch:7700")
|
||||
with client.allocate("vllm", model_candidates=["Ouro-1.4B"], caller="test") as alloc:
|
||||
assert isinstance(alloc, Allocation)
|
||||
assert alloc.url == "http://heimdall:8000"
|
||||
assert alloc.model == "Ouro-1.4B"
|
||||
assert alloc.allocation_id == "abc123"
|
||||
assert httpretty.last_request().method == "DELETE"
|
||||
|
||||
|
||||
@httpretty.activate
|
||||
def test_sync_allocate_ignores_404_on_release():
|
||||
httpretty.register_uri(
|
||||
httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
|
||||
body='{"allocation_id":"xyz","service":"vllm","node_id":"a","gpu_id":0,'
|
||||
'"model":"m","url":"http://a:8000","started":false,"warm":false}',
|
||||
content_type="application/json",
|
||||
)
|
||||
httpretty.register_uri(
|
||||
httpretty.DELETE, "http://orch:7700/api/services/vllm/allocations/xyz",
|
||||
status=404, body='{"detail":"not found"}', content_type="application/json",
|
||||
)
|
||||
client = CFOrchClient("http://orch:7700")
|
||||
with client.allocate("vllm", model_candidates=["m"]) as alloc:
|
||||
assert alloc.url == "http://a:8000"
|
||||
# No exception raised — 404 on release is silently ignored
|
||||
|
||||
|
||||
@httpretty.activate
def test_sync_allocate_raises_on_503():
    """A 503 from the allocate endpoint surfaces as RuntimeError."""
    httpretty.register_uri(
        httpretty.POST,
        "http://orch:7700/api/services/vllm/allocate",
        status=503,
        body='{"detail":"no capacity"}',
        content_type="application/json",
    )
    orch = CFOrchClient("http://orch:7700")
    with pytest.raises(RuntimeError, match="cf-orch allocation failed"):
        with orch.allocate("vllm", model_candidates=["m"]):
            pass
|
||||
|
||||
|
||||
async def test_async_allocate_works():
    """allocate_async round-trips POST/DELETE through a mocked httpx.AsyncClient."""
    # httpretty only patches stdlib sockets; httpx async uses anyio sockets so
    # we mock httpx.AsyncClient directly instead.
    allocation_payload = {
        "allocation_id": "a1",
        "service": "vllm",
        "node_id": "n",
        "gpu_id": 0,
        "model": "m",
        "url": "http://n:8000",
        "started": False,
        "warm": False,
    }

    def _response(payload, status_code=200):
        # Minimal stand-in for an httpx.Response.
        fake = MagicMock()
        fake.status_code = status_code
        fake.is_success = status_code < 400
        fake.json.return_value = payload
        return fake

    post_mock = AsyncMock(return_value=_response(allocation_payload))
    delete_mock = AsyncMock(return_value=_response({"released": True}))

    fake_http = MagicMock()
    fake_http.post = post_mock
    fake_http.delete = delete_mock
    fake_http.__aenter__ = AsyncMock(return_value=fake_http)
    fake_http.__aexit__ = AsyncMock(return_value=False)

    with patch("httpx.AsyncClient", return_value=fake_http):
        orch = CFOrchClient("http://orch:7700")
        async with orch.allocate_async("vllm", model_candidates=["m"]) as lease:
            assert (lease.allocation_id, lease.url) == ("a1", "http://n:8000")
    # Exiting the async context must have released the allocation exactly once.
    delete_mock.assert_called_once()
|
||||
|
|
@ -1,132 +0,0 @@
|
|||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from fastapi.testclient import TestClient
|
||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
||||
|
||||
|
||||
def _make_supervisor_mock(online: bool = True):
    """Supervisor mock exposing one 8 GB 'heimdall' agent (online unless told otherwise)."""
    agent = AgentRecord(node_id="heimdall", agent_url="http://heimdall:7701")
    agent.gpus = [GpuInfo(0, "RTX 4000", 8192, 0, 8192)]
    agent.online = online

    supervisor = MagicMock()
    supervisor.online_agents.return_value = {"heimdall": agent} if online else {}
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="heimdall",
        agent_url="http://heimdall:7701",
        gpus=agent.gpus,
        last_heartbeat=0.0,
    )
    return supervisor
|
||||
|
||||
|
||||
@pytest.fixture
def alloc_client():
    """(TestClient, supervisor mock, ServiceRegistry) wired into a coordinator app."""
    supervisor = _make_supervisor_mock()
    services = ServiceRegistry()
    app = create_coordinator_app(
        lease_manager=LeaseManager(),
        profile_registry=ProfileRegistry(),
        agent_supervisor=supervisor,
        service_registry=services,
    )
    return TestClient(app), supervisor, services
|
||||
|
||||
|
||||
def test_allocate_returns_allocation_id_and_url(alloc_client):
    """POST /allocate returns an allocation id plus the running instance URL."""
    client, _supervisor, _services = alloc_client
    with patch("httpx.AsyncClient") as http_cls:
        agent_reply = MagicMock()
        agent_reply.is_success = True
        agent_reply.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        http_cls.return_value.__aenter__.return_value.post = AsyncMock(return_value=agent_reply)

        resp = client.post(
            "/api/services/vllm/allocate",
            json={"model_candidates": ["Ouro-1.4B"], "ttl_s": 300.0, "caller": "test"},
        )

    assert resp.status_code == 200
    body = resp.json()
    assert "allocation_id" in body
    assert (body["service"], body["node_id"], body["url"]) == (
        "vllm", "heimdall", "http://heimdall:8000"
    )
|
||||
|
||||
|
||||
def test_allocate_returns_503_when_no_online_nodes(alloc_client):
    """With zero online agents the coordinator reports no capacity."""
    client, supervisor, _services = alloc_client
    supervisor.online_agents.return_value = {}
    resp = client.post(
        "/api/services/vllm/allocate", json={"model_candidates": ["Ouro-1.4B"]}
    )
    assert resp.status_code == 503
|
||||
|
||||
|
||||
def test_allocate_returns_422_for_empty_candidates(alloc_client):
    """An empty model_candidates list is a validation error."""
    client, _supervisor, _services = alloc_client
    resp = client.post("/api/services/vllm/allocate", json={"model_candidates": []})
    assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_allocate_returns_422_for_unknown_service(alloc_client):
    """Allocating against a service name the coordinator does not know → 422."""
    client, _supervisor, _services = alloc_client
    resp = client.post("/api/services/cf-made-up/allocate", json={"model_candidates": ["x"]})
    assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_allocate_records_in_registry(alloc_client):
    """A successful allocation shows up in GET /status for the service."""
    client, _supervisor, _services = alloc_client
    with patch("httpx.AsyncClient") as http_cls:
        agent_reply = MagicMock()
        agent_reply.is_success = True
        agent_reply.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        http_cls.return_value.__aenter__.return_value.post = AsyncMock(return_value=agent_reply)

        resp = client.post(
            "/api/services/vllm/allocate",
            json={"model_candidates": ["Ouro-1.4B"], "ttl_s": 300.0, "caller": "test"},
        )

    assert resp.status_code == 200
    new_id = resp.json()["allocation_id"]

    status_resp = client.get("/api/services/vllm/status")
    assert status_resp.status_code == 200
    status = status_resp.json()
    assert status["service"] == "vllm"
    assert new_id in [a["allocation_id"] for a in status["allocations"]]
|
||||
|
||||
|
||||
def test_release_allocation(alloc_client):
    """DELETE on an allocation releases it and removes it from /status."""
    client, _supervisor, _services = alloc_client
    with patch("httpx.AsyncClient") as http_cls:
        agent_reply = MagicMock()
        agent_reply.is_success = True
        agent_reply.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        http_cls.return_value.__aenter__.return_value.post = AsyncMock(return_value=agent_reply)

        resp = client.post(
            "/api/services/vllm/allocate",
            json={"model_candidates": ["Ouro-1.4B"], "ttl_s": 300.0, "caller": "test"},
        )

    assert resp.status_code == 200
    alloc_id = resp.json()["allocation_id"]

    released = client.delete(f"/api/services/vllm/allocations/{alloc_id}")
    assert released.status_code == 200
    assert released.json() == {"released": True, "allocation_id": alloc_id}

    remaining = client.get("/api/services/vllm/status").json()["allocations"]
    assert alloc_id not in [a["allocation_id"] for a in remaining]
|
||||
|
||||
|
||||
def test_release_allocation_not_found(alloc_client):
    """Releasing an unknown allocation id yields 404."""
    client, _supervisor, _services = alloc_client
    resp = client.delete("/api/services/vllm/allocations/bad-id")
    assert resp.status_code == 404
|
||||
|
|
@ -1,183 +0,0 @@
|
|||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
||||
from circuitforge_core.resources.profiles.schema import load_profile
|
||||
|
||||
|
||||
@pytest.fixture
def coordinator_client():
    """(TestClient, LeaseManager) for a coordinator app with one mocked node."""
    leases = LeaseManager()
    leases.register_gpu("heimdall", 0, 8192)

    heimdall = NodeInfo(
        node_id="heimdall",
        agent_url="http://localhost:7701",
        gpus=[
            GpuInfo(
                gpu_id=0,
                name="RTX 4000",
                vram_total_mb=8192,
                vram_used_mb=0,
                vram_free_mb=8192,
            )
        ],
        last_heartbeat=0.0,
    )
    supervisor = MagicMock()
    supervisor.all_nodes.return_value = [heimdall]
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="heimdall",
        agent_url="http://localhost:7701",
        gpus=[],
        last_heartbeat=0.0,
    )

    app = create_coordinator_app(
        lease_manager=leases,
        profile_registry=ProfileRegistry(),
        agent_supervisor=supervisor,
        service_registry=ServiceRegistry(),
    )
    return TestClient(app), leases
|
||||
|
||||
|
||||
def test_health_returns_ok(coordinator_client):
    """GET /api/health reports status 'ok'."""
    client, _leases = coordinator_client
    resp = client.get("/api/health")
    assert resp.status_code == 200
    assert resp.json()["status"] == "ok"
|
||||
|
||||
|
||||
def test_get_nodes_returns_list(coordinator_client):
    """GET /api/nodes lists the single registered node."""
    client, _leases = coordinator_client
    resp = client.get("/api/nodes")
    assert resp.status_code == 200
    assert [n["node_id"] for n in resp.json()["nodes"]] == ["heimdall"]
|
||||
|
||||
|
||||
def test_get_profiles_returns_public_profiles(coordinator_client):
    """The bundled public profile set includes single-gpu-8gb."""
    client, _leases = coordinator_client
    resp = client.get("/api/profiles")
    assert resp.status_code == 200
    assert any(p["name"] == "single-gpu-8gb" for p in resp.json()["profiles"])
|
||||
|
||||
|
||||
def test_post_lease_grants_lease(coordinator_client):
    """POST /api/leases grants the requested VRAM to the named service."""
    client, _leases = coordinator_client
    resp = client.post(
        "/api/leases",
        json={
            "node_id": "heimdall",
            "gpu_id": 0,
            "mb": 2048,
            "service": "peregrine",
            "priority": 1,
        },
    )
    assert resp.status_code == 200
    lease = resp.json()["lease"]
    assert lease["mb_granted"] == 2048
    assert lease["holder_service"] == "peregrine"
    assert "lease_id" in lease
|
||||
|
||||
|
||||
def test_delete_lease_releases_it(coordinator_client):
    """A granted lease can be released via DELETE /api/leases/{id}."""
    client, _leases = coordinator_client
    granted = client.post(
        "/api/leases",
        json={
            "node_id": "heimdall",
            "gpu_id": 0,
            "mb": 2048,
            "service": "peregrine",
            "priority": 1,
        },
    )
    lease_id = granted.json()["lease"]["lease_id"]

    resp = client.delete(f"/api/leases/{lease_id}")
    assert resp.status_code == 200
    assert resp.json()["released"] is True
|
||||
|
||||
|
||||
def test_delete_unknown_lease_returns_404(coordinator_client):
    """Releasing a lease id that was never granted yields 404."""
    client, _leases = coordinator_client
    resp = client.delete("/api/leases/nonexistent-id")
    assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_get_leases_returns_active_leases(coordinator_client):
    """GET /api/leases lists leases that are currently held."""
    client, _leases = coordinator_client
    client.post(
        "/api/leases",
        json={
            "node_id": "heimdall",
            "gpu_id": 0,
            "mb": 1024,
            "service": "kiwi",
            "priority": 2,
        },
    )
    resp = client.get("/api/leases")
    assert resp.status_code == 200
    assert len(resp.json()["leases"]) == 1
|
||||
|
||||
|
||||
def test_dashboard_serves_html(coordinator_client):
|
||||
"""GET / returns the dashboard HTML page."""
|
||||
client, _ = coordinator_client
|
||||
resp = client.get("/")
|
||||
assert resp.status_code == 200
|
||||
assert "text/html" in resp.headers["content-type"]
|
||||
# Verify key structural markers are present (without asserting exact markup)
|
||||
assert "cf-orch" in resp.text
|
||||
assert "/api/nodes" in resp.text
|
||||
assert "/api/leases" in resp.text
|
||||
|
||||
|
||||
def test_online_agents_excludes_offline():
    """online_agents() filters out agents whose online flag is False."""
    supervisor = AgentSupervisor(LeaseManager())
    supervisor.register("online_node", "http://a:7701")
    supervisor.register("offline_node", "http://b:7701")
    supervisor._agents["online_node"].online = True
    supervisor._agents["offline_node"].online = False

    visible = supervisor.online_agents()
    assert "online_node" in visible
    assert "offline_node" not in visible
|
||||
|
||||
|
||||
def test_resident_keys_returns_set_of_node_service():
    """resident_keys() yields 'node:service' strings for every resident."""
    manager = LeaseManager()
    manager.set_residents_for_node("heimdall", [("vllm", "Ouro-1.4B"), ("ollama", None)])
    assert manager.resident_keys() == {"heimdall:vllm", "heimdall:ollama"}
|
||||
|
||||
|
||||
def test_single_gpu_8gb_profile_has_idle_stop_after_s():
    """The shipped single-gpu-8gb profile configures vllm idle stop at 600 s."""
    profile = load_profile(
        Path("circuitforge_core/resources/profiles/public/single-gpu-8gb.yaml")
    )
    vllm_cfg = profile.services.get("vllm")
    assert vllm_cfg is not None
    assert hasattr(vllm_cfg, "idle_stop_after_s")
    assert vllm_cfg.idle_stop_after_s == 600
|
||||
|
||||
|
||||
def test_ensure_service_returns_503_when_vram_too_low():
    """VRAM pre-flight guard fires before any HTTP request when free VRAM < service max_mb."""
    # Threshold = full max_mb (not half); 100 MB free on any profile triggers 503.
    leases = LeaseManager()
    leases.register_gpu("low-vram-node", 0, 512)

    supervisor = MagicMock()
    supervisor.all_nodes.return_value = []
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="low-vram-node",
        agent_url="http://localhost:7701",
        gpus=[
            GpuInfo(
                gpu_id=0,
                name="GTX 1050",
                vram_total_mb=512,
                vram_used_mb=412,
                vram_free_mb=100,
            )
        ],
        last_heartbeat=0.0,
    )

    app = create_coordinator_app(
        lease_manager=leases,
        profile_registry=ProfileRegistry(),
        agent_supervisor=supervisor,
        service_registry=ServiceRegistry(),
    )
    client = TestClient(app)

    resp = client.post(
        "/api/services/vllm/ensure",
        json={"node_id": "low-vram-node", "gpu_id": 0, "params": {"model": "some-model"}},
    )

    assert resp.status_code == 503
    assert "Insufficient VRAM" in resp.json()["detail"]
    # Guard must fire before any agent HTTP call is attempted.
    supervisor.get_node_info.assert_called_once_with("low-vram-node")
|
||||
|
|
@ -1,148 +0,0 @@
|
|||
"""Tests for HeimdallAuthMiddleware — TTL cache and request gating."""
|
||||
import time
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
from fastapi import FastAPI
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from circuitforge_core.resources.coordinator.auth import (
|
||||
HeimdallAuthMiddleware,
|
||||
_ValidationCache,
|
||||
CACHE_TTL_S,
|
||||
)
|
||||
|
||||
|
||||
# ── Cache unit tests ──────────────────────────────────────────────────────────
|
||||
|
||||
def test_cache_miss_returns_none():
    """Looking up a key that was never set yields None."""
    assert _ValidationCache().get("nonexistent") is None
|
||||
|
||||
|
||||
def test_cache_stores_and_retrieves():
    """A stored entry comes back with its validity and tier intact."""
    cache = _ValidationCache()
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    hit = cache.get("key1")
    assert hit is not None
    assert (hit.valid, hit.tier) == (True, "paid")
|
||||
|
||||
def test_cache_entry_expires():
    """Entries older than the TTL are treated as misses."""
    cache = _ValidationCache(ttl_s=0.05)
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    time.sleep(0.1)  # outlive the 50 ms TTL
    assert cache.get("key1") is None
|
||||
|
||||
|
||||
def test_cache_evict_removes_key():
    """evict() drops the entry immediately."""
    cache = _ValidationCache()
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    cache.evict("key1")
    assert cache.get("key1") is None
|
||||
|
||||
|
||||
def test_cache_prune_removes_expired():
    """prune() reports how many expired entries it discarded."""
    cache = _ValidationCache(ttl_s=0.05)
    for key in ("k1", "k2"):
        cache.set(key, valid=True, tier="paid", user_id="")
    time.sleep(0.1)  # let both entries expire
    assert cache.prune() == 2
|
||||
|
||||
|
||||
# ── Middleware integration tests ──────────────────────────────────────────────
|
||||
|
||||
def _make_app_with_auth(middleware: HeimdallAuthMiddleware) -> TestClient:
    """Build a minimal FastAPI app guarded by *middleware*, wrapped in a TestClient."""
    application = FastAPI()
    application.middleware("http")(middleware)

    @application.get("/api/health")
    def health():
        return {"status": "ok"}

    @application.post("/api/services/vllm/allocate")
    def allocate():
        return {"allocation_id": "abc", "url": "http://gpu:8000"}

    # raise_server_exceptions=False so middleware errors surface as responses.
    return TestClient(application, raise_server_exceptions=False)
|
||||
|
||||
|
||||
def _patched_middleware(valid: bool, tier: str = "paid") -> HeimdallAuthMiddleware:
|
||||
"""Return a middleware whose Heimdall call is pre-mocked."""
|
||||
mw = HeimdallAuthMiddleware(
|
||||
heimdall_url="http://heimdall.test",
|
||||
min_tier="paid",
|
||||
)
|
||||
mw._validate_against_heimdall = MagicMock( # type: ignore[method-assign]
|
||||
return_value=(valid, tier, "user-1" if valid else "")
|
||||
)
|
||||
return mw
|
||||
|
||||
|
||||
def test_health_exempt_no_auth_required():
    """/api/health is reachable with no Authorization header at all."""
    client = _make_app_with_auth(_patched_middleware(valid=True))
    assert client.get("/api/health").status_code == 200
|
||||
|
||||
|
||||
def test_missing_auth_header_returns_401():
    """A guarded route with no Authorization header → 401."""
    client = _make_app_with_auth(_patched_middleware(valid=True))
    assert client.post("/api/services/vllm/allocate").status_code == 401
|
||||
|
||||
|
||||
def test_invalid_key_returns_403():
    """A key the (mocked) Heimdall validation rejects → 403."""
    client = _make_app_with_auth(_patched_middleware(valid=False))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer BAD-KEY"},
    )
    assert resp.status_code == 403
|
||||
|
||||
|
||||
def test_valid_paid_key_passes():
    """A valid paid-tier key reaches the route handler."""
    client = _make_app_with_auth(_patched_middleware(valid=True, tier="paid"))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer CFG-KIWI-GOOD-GOOD-GOOD"},
    )
    assert resp.status_code == 200
|
||||
|
||||
|
||||
def test_free_tier_key_rejected_when_min_is_paid():
    """A valid but free-tier key is rejected when min_tier='paid'."""
    client = _make_app_with_auth(_patched_middleware(valid=True, tier="free"))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer CFG-KIWI-FREE-FREE-FREE"},
    )
    assert resp.status_code == 403
    assert "paid" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_cache_prevents_second_heimdall_call():
    """The TTL cache answers the second request for the same key."""
    middleware = _patched_middleware(valid=True, tier="paid")
    client = _make_app_with_auth(middleware)
    headers = {"Authorization": "Bearer CFG-KIWI-CACHED-KEY-1"}
    for _ in range(2):
        client.post("/api/services/vllm/allocate", headers=headers)
    # Heimdall should only have been called once — second hit is from cache
    assert middleware._validate_against_heimdall.call_count == 1  # type: ignore[attr-defined]
|
||||
|
||||
|
||||
def test_from_env_returns_none_without_heimdall_url(monkeypatch):
    """Without HEIMDALL_URL in the environment, from_env() returns None."""
    monkeypatch.delenv("HEIMDALL_URL", raising=False)
    assert HeimdallAuthMiddleware.from_env() is None
|
||||
|
||||
|
||||
def test_from_env_returns_middleware_when_set(monkeypatch):
    """With HEIMDALL_URL set, from_env() builds a middleware pointed at it."""
    monkeypatch.setenv("HEIMDALL_URL", "http://heimdall.test")
    middleware = HeimdallAuthMiddleware.from_env()
    assert middleware is not None
    assert middleware._heimdall == "http://heimdall.test"
|
||||
|
|
@ -1,215 +0,0 @@
|
|||
# tests/test_resources/test_coordinator_probe.py
|
||||
"""
|
||||
Unit tests for _run_instance_probe_loop in coordinator/app.py.
|
||||
|
||||
Covers:
|
||||
- healthy path: /health → 200 → state transitions starting → running
|
||||
- timeout path: no healthy response within _PROBE_TIMEOUT_S → starting → stopped
|
||||
- cleanup path: non-starting instance cleans up its start_times entry
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from circuitforge_core.resources.coordinator.app import (
|
||||
_PROBE_TIMEOUT_S,
|
||||
_run_instance_probe_loop,
|
||||
)
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceInstance, ServiceRegistry
|
||||
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _inst(**kwargs) -> ServiceInstance:
    """ServiceInstance with sensible defaults, overridable via kwargs."""
    params = {
        "service": "vllm",
        "node_id": "node1",
        "gpu_id": 0,
        "state": "starting",
        "model": "qwen",
        "url": "http://localhost:8000",
        **kwargs,
    }
    return ServiceInstance(**params)
|
||||
|
||||
|
||||
def _registry(*instances: ServiceInstance) -> MagicMock:
    """ServiceRegistry mock whose all_instances() returns the given instances."""
    registry = MagicMock(spec=ServiceRegistry)
    registry.all_instances.return_value = list(instances)
    return registry
|
||||
|
||||
|
||||
def _health_resp(status: int = 200) -> MagicMock:
|
||||
"""Context-manager mock that simulates an HTTP response."""
|
||||
resp = MagicMock()
|
||||
resp.status = status
|
||||
resp.__enter__ = lambda s: resp
|
||||
resp.__exit__ = MagicMock(return_value=False)
|
||||
return resp
|
||||
|
||||
|
||||
async def _one_tick(coro_fn, registry, *, time_val: float = 1000.0, **url_patch):
|
||||
"""
|
||||
Run the probe loop for exactly one iteration then cancel it.
|
||||
|
||||
asyncio.sleep is patched to return immediately on the first call
|
||||
and raise CancelledError on the second (ending the loop cleanly).
|
||||
"""
|
||||
calls = 0
|
||||
|
||||
async def _fake_sleep(_delay):
|
||||
nonlocal calls
|
||||
calls += 1
|
||||
if calls > 1:
|
||||
raise asyncio.CancelledError()
|
||||
|
||||
patches = [
|
||||
patch("asyncio.sleep", new=_fake_sleep),
|
||||
patch("time.time", return_value=time_val),
|
||||
]
|
||||
if url_patch:
|
||||
patches.append(patch("urllib.request.urlopen", **url_patch))
|
||||
|
||||
ctx = [p.__enter__() for p in patches]
|
||||
try:
|
||||
await coro_fn(registry)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
finally:
|
||||
for p in reversed(patches):
|
||||
p.__exit__(None, None, None)
|
||||
|
||||
|
||||
# ── tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@pytest.mark.asyncio
async def test_probe_transitions_starting_to_running():
    """GET /health → 200 while in starting state → upsert_instance(state='running')."""
    registry = _registry(_inst(state="starting", url="http://localhost:8000"))

    ticks = 0

    async def fake_sleep(_delay):
        nonlocal ticks
        ticks += 1
        if ticks > 1:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen", return_value=_health_resp(200)):
        try:
            await _run_instance_probe_loop(registry)
        except asyncio.CancelledError:
            pass

    registry.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="running", model="qwen", url="http://localhost:8000",
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_probe_transitions_starting_to_stopped_on_timeout():
    """No healthy response + time past _PROBE_TIMEOUT_S → upsert_instance(state='stopped').

    Tick 1: seeds start_times[key] = 1000.0
    Tick 2: time has advanced past _PROBE_TIMEOUT_S → timeout fires → stopped
    Tick 3: CancelledError exits the loop
    """
    registry = _registry(_inst(state="starting", url="http://localhost:8000"))

    ticks = 0
    # Tick 1: t=1000 (seed); Tick 2: t=far_future (timeout fires)
    clock = [1000.0, 1000.0 + _PROBE_TIMEOUT_S + 1.0]

    async def fake_sleep(_delay):
        nonlocal ticks
        ticks += 1
        if ticks > 2:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", side_effect=clock * 10), \
            patch("urllib.request.urlopen", side_effect=OSError("connection refused")):
        try:
            await _run_instance_probe_loop(registry)
        except asyncio.CancelledError:
            pass

    registry.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="stopped", model="qwen", url="http://localhost:8000",
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_probe_cleans_up_start_times_for_non_starting():
    """
    An instance that is no longer in 'starting' state should not cause
    upsert_instance to be called, and its key should be removed from start_times.

    Verified indirectly over three ticks: tick 1 sees state='starting' (seeds
    the key and transitions to running); tick 2 sees the registry reporting
    state='running' (no further upsert); tick 3 cancels the loop.
    """
    starting = _inst(state="starting", url="http://localhost:8000")
    running = _inst(state="running", url="http://localhost:8000")

    tick = 0

    def current_instances():
        # Tick 1 serves the starting instance, later ticks the running one.
        return [starting] if tick <= 1 else [running]

    registry = MagicMock(spec=ServiceRegistry)
    registry.all_instances.side_effect = current_instances

    async def fake_sleep(_delay):
        nonlocal tick
        tick += 1
        if tick > 2:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen", return_value=_health_resp(200)):
        try:
            await _run_instance_probe_loop(registry)
        except asyncio.CancelledError:
            pass

    # upsert should have been called exactly once (the starting→running transition)
    assert registry.upsert_instance.call_count == 1
    registry.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="running", model="qwen", url="http://localhost:8000",
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_probe_no_url_does_not_attempt_health_check():
    """Instance with no URL stays in starting state (no health check, no timeout yet)."""
    registry = _registry(_inst(state="starting", url=None))

    ticks = 0

    async def fake_sleep(_delay):
        nonlocal ticks
        ticks += 1
        if ticks > 1:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen") as urlopen_mock:
        try:
            await _run_instance_probe_loop(registry)
        except asyncio.CancelledError:
            pass

    urlopen_mock.assert_not_called()
    registry.upsert_instance.assert_not_called()
|
||||
|
|
@ -1,215 +0,0 @@
|
|||
# tests/test_resources/test_docuvision.py
|
||||
"""
|
||||
Unit tests for cf-docuvision FastAPI service (circuitforge_core/resources/docuvision/app.py).
|
||||
|
||||
Covers:
|
||||
- GET /health → status + model path
|
||||
- POST /extract → image_b64, image_path, hint routing, metadata fields
|
||||
- _parse_dolphin_output → JSON list path, table detection, plain-text fallback
|
||||
- _image_from_request → missing both fields → 422; bad image_path → 404
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
from PIL import Image
|
||||
|
||||
import circuitforge_core.resources.docuvision.app as docuvision_module
|
||||
from circuitforge_core.resources.docuvision.app import (
|
||||
_parse_dolphin_output,
|
||||
app,
|
||||
)
|
||||
|
||||
|
||||
# ── fixtures ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _make_jpeg_b64(width: int = 10, height: int = 10) -> str:
    """Return a base64-encoded width×height white JPEG."""
    canvas = Image.new("RGB", (width, height), color=(255, 255, 255))
    encoded = io.BytesIO()
    canvas.save(encoded, format="JPEG")
    return base64.b64encode(encoded.getvalue()).decode()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_module_state():
    """Reset the docuvision module's cached model state around every test."""
    # Pre-test: known-clean state with a fake model path on CPU.
    docuvision_module._model = None
    docuvision_module._processor = None
    docuvision_module._model_path = "/fake/model"
    docuvision_module._device = "cpu"
    yield
    # Post-test: drop any model/processor a test injected.
    docuvision_module._model = None
    docuvision_module._processor = None
|
||||
|
||||
|
||||
@pytest.fixture
def mock_model():
    """
    Inject fake model + processor into the module so _load_model() is skipped.

    The processor returns a dict-like with 'input_ids'; the model generate()
    returns a tensor-like whose decode produces a JSON string.
    """
    token_ids = MagicMock()
    token_ids.shape = [1, 5]  # input_len = 5

    tensors = {"input_ids": token_ids}
    processor_output = MagicMock()
    processor_output.__getitem__ = lambda self, key: tensors[key]
    processor_output.to = lambda device: processor_output

    generated = MagicMock()
    generated.__getitem__ = lambda self, idx: MagicMock()  # output_ids[0]

    fake_model = MagicMock()
    fake_model.generate.return_value = generated

    fake_processor = MagicMock()
    fake_processor.return_value = processor_output
    fake_processor.decode.return_value = json.dumps([
        {"type": "heading", "text": "Invoice", "bbox": [0.0, 0.0, 1.0, 0.1]},
        {"type": "table", "text": "row1", "html": "<table><tr><td>row1</td></tr></table>",
         "bbox": [0.0, 0.1, 1.0, 0.5]},
    ])

    docuvision_module._model = fake_model
    docuvision_module._processor = fake_processor
    return fake_model, fake_processor
|
||||
|
||||
|
||||
@pytest.fixture
def client():
    """TestClient bound to the docuvision FastAPI app."""
    return TestClient(app)
|
||||
|
||||
|
||||
# ── health ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_health_returns_ok(client):
    """GET /health reports ok plus the configured model path."""
    resp = client.get("/health")
    assert resp.status_code == 200
    payload = resp.json()
    assert (payload["status"], payload["model"]) == ("ok", "/fake/model")
|
||||
|
||||
|
||||
# ── _parse_dolphin_output ────────────────────────────────────────────────────
|
||||
|
||||
def test_parse_json_list_elements():
    """A JSON list of typed elements parses into elements plus joined raw text."""
    raw = json.dumps([
        {"type": "heading", "text": "Title"},
        {"type": "paragraph", "text": "Body text"},
    ])
    elements, tables, raw_text = _parse_dolphin_output(raw)
    assert [(e.type, e.text) for e in elements] == [
        ("heading", "Title"),
        ("paragraph", "Body text"),
    ]
    assert raw_text == "Title\nBody text"
    assert tables == []
|
||||
|
||||
|
||||
def test_parse_json_table_extracted():
    """Table elements are surfaced both in elements and in the tables list."""
    raw = json.dumps([
        {"type": "table", "text": "row", "html": "<table><tr><td>A</td></tr></table>",
         "bbox": [0.0, 0.0, 1.0, 0.5]},
    ])
    elements, tables, _raw_text = _parse_dolphin_output(raw)
    assert len(tables) == 1
    assert tables[0].html == "<table><tr><td>A</td></tr></table>"
    assert tables[0].bbox == [0.0, 0.0, 1.0, 0.5]
    assert [e.type for e in elements] == ["table"]
|
||||
|
||||
|
||||
def test_parse_plain_text_fallback():
|
||||
raw = "This is not JSON at all."
|
||||
elements, tables, raw_text = _parse_dolphin_output(raw)
|
||||
assert len(elements) == 1
|
||||
assert elements[0].type == "paragraph"
|
||||
assert elements[0].text == raw
|
||||
assert tables == []
|
||||
assert raw_text == raw
|
||||
|
||||
|
||||
def test_parse_empty_string_fallback():
|
||||
elements, tables, raw_text = _parse_dolphin_output("")
|
||||
assert len(elements) == 1
|
||||
assert elements[0].type == "paragraph"
|
||||
assert elements[0].text == ""
|
||||
|
||||
|
||||
def test_parse_json_missing_type_defaults_to_paragraph():
|
||||
raw = json.dumps([{"text": "no type field"}])
|
||||
elements, tables, _ = _parse_dolphin_output(raw)
|
||||
assert elements[0].type == "paragraph"
|
||||
|
||||
|
||||
# ── POST /extract ─────────────────────────────────────────────────────────────
|
||||
|
||||
def test_extract_image_b64(client, mock_model):
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "auto"})
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert "elements" in data
|
||||
assert "raw_text" in data
|
||||
assert "tables" in data
|
||||
assert data["metadata"]["hint"] == "auto"
|
||||
assert data["metadata"]["model"] == "/fake/model"
|
||||
assert data["metadata"]["width"] == 10
|
||||
assert data["metadata"]["height"] == 10
|
||||
|
||||
|
||||
def test_extract_hint_table_routes_correct_prompt(client, mock_model):
|
||||
_, fake_processor = mock_model
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "table"})
|
||||
assert resp.status_code == 200
|
||||
# Verify processor was called with the table-specific prompt
|
||||
call_kwargs = fake_processor.call_args
|
||||
assert "table" in call_kwargs.kwargs.get("text", "") or \
|
||||
"table" in str(call_kwargs)
|
||||
|
||||
|
||||
def test_extract_hint_unknown_falls_back_to_auto(client, mock_model):
|
||||
"""An unrecognised hint silently falls back to the 'auto' prompt."""
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "nonsense"})
|
||||
assert resp.status_code == 200
|
||||
|
||||
|
||||
def test_extract_image_path(tmp_path, client, mock_model):
|
||||
img_file = tmp_path / "doc.png"
|
||||
Image.new("RGB", (8, 8), color=(0, 0, 0)).save(img_file)
|
||||
resp = client.post("/extract", json={"image_path": str(img_file)})
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["metadata"]["width"] == 8
|
||||
|
||||
|
||||
def test_extract_image_path_not_found(client, mock_model):
|
||||
resp = client.post("/extract", json={"image_path": "/nonexistent/path/img.png"})
|
||||
assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_extract_no_image_raises_422(client, mock_model):
|
||||
resp = client.post("/extract", json={"hint": "auto"})
|
||||
assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_extract_response_includes_tables(client, mock_model):
|
||||
"""Verify table objects surface in response when model returns table elements."""
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64()})
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert len(data["tables"]) == 1
|
||||
assert "<table>" in data["tables"][0]["html"]
|
||||
|
||||
|
||||
def test_extract_device_in_metadata(client, mock_model):
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64()})
|
||||
assert resp.status_code == 200
|
||||
assert "device" in resp.json()["metadata"]
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
import asyncio
import pytest
from unittest.mock import AsyncMock, patch
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager


@pytest.fixture
def lease_manager():
    """A LeaseManager with a single 8192 MB GPU registered on node 'heimdall'."""
    mgr = LeaseManager()
    mgr.register_gpu("heimdall", 0, 8192)
    return mgr


@pytest.fixture
def engine(lease_manager):
    """An EvictionEngine with a short eviction timeout so tests fail fast."""
    return EvictionEngine(lease_manager=lease_manager, eviction_timeout_s=0.1)


@pytest.mark.asyncio
async def test_request_lease_grants_when_vram_available(engine, lease_manager):
    """With enough free VRAM, request_lease grants immediately — no eviction."""
    lease = await engine.request_lease(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="peregrine", priority=1,
        agent_url="http://localhost:7701",
    )
    assert lease is not None
    assert lease.mb_granted == 4096


@pytest.mark.asyncio
async def test_request_lease_evicts_and_grants(engine, lease_manager):
    """A higher-priority request triggers eviction of a lower-priority lease
    and is granted once the evicted VRAM is released."""
    # Pre-fill with a low-priority lease
    big_lease = await lease_manager.try_grant(
        "heimdall", 0, 7000, "comfyui", priority=4
    )
    assert big_lease is not None

    # Mock the agent eviction call
    with patch(
        "circuitforge_core.resources.coordinator.eviction_engine.EvictionEngine._call_agent_evict",
        new_callable=AsyncMock,
    ) as mock_evict:
        mock_evict.return_value = True
        # Simulate the comfyui lease being released (as if the agent evicted it).
        # Use get_running_loop()/create_task: calling asyncio.get_event_loop()
        # inside a coroutine is deprecated since Python 3.10 and slated to
        # raise; ensure_future is the legacy spelling of create_task.
        loop = asyncio.get_running_loop()
        loop.call_later(
            0.05, lambda: loop.create_task(lease_manager.release(big_lease.lease_id))
        )
        lease = await engine.request_lease(
            node_id="heimdall", gpu_id=0, mb=4096,
            service="peregrine", priority=1,
            agent_url="http://localhost:7701",
        )
        assert lease is not None
        assert lease.holder_service == "peregrine"


@pytest.mark.asyncio
async def test_request_lease_returns_none_when_no_eviction_candidates(engine):
    """When VRAM is held at equal/higher urgency, request_lease yields None."""
    await engine.lease_manager.try_grant("heimdall", 0, 6000, "vllm", priority=1)
    # Requesting 4GB but no lower-priority leases exist
    lease = await engine.request_lease(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="kiwi", priority=2,
        agent_url="http://localhost:7701",
    )
    assert lease is None
|
||||
|
|
@ -1,43 +0,0 @@
|
|||
import signal
from unittest.mock import patch, call
import pytest
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor, EvictionResult


def test_evict_by_pid_sends_sigterm_then_sigkill():
    """If the process survives the grace period, SIGKILL follows SIGTERM."""
    executor = EvictionExecutor(grace_period_s=0.01)
    # pid_exists always True → grace period expires → SIGKILL fires
    with patch("os.kill") as mock_kill, \
            patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.return_value = True
        result = executor.evict_pid(pid=1234, grace_period_s=0.01)

    assert result.success is True
    calls = mock_kill.call_args_list
    # Both signals must have been delivered to the target pid, in some order.
    assert call(1234, signal.SIGTERM) in calls
    assert call(1234, signal.SIGKILL) in calls


def test_evict_pid_succeeds_on_sigterm_alone():
    """If the process exits during the grace period, SIGTERM alone suffices."""
    executor = EvictionExecutor(grace_period_s=0.1)
    with patch("os.kill"), \
            patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.side_effect = [True, False]  # gone after SIGTERM
        result = executor.evict_pid(pid=5678, grace_period_s=0.01)
    assert result.success is True
    assert result.method == "sigterm"


def test_evict_pid_not_found_returns_failure():
    """Evicting a pid that does not exist fails with a 'not found' message."""
    executor = EvictionExecutor()
    with patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.return_value = False
        result = executor.evict_pid(pid=9999)
    assert result.success is False
    assert "not found" in result.message.lower()


def test_eviction_result_is_immutable():
    """EvictionResult rejects attribute assignment — presumably a frozen
    dataclass or NamedTuple; the test accepts either exception type."""
    result = EvictionResult(success=True, method="sigterm", message="ok")
    with pytest.raises((AttributeError, TypeError)):
        result.success = False  # type: ignore
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
from unittest.mock import patch
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor


# Two GPUs worth of sample output; per the parsing assertions below the
# columns are: index, name, vram_total_mb, vram_used_mb, vram_free_mb.
SAMPLE_NVIDIA_SMI_OUTPUT = (
    "0, Quadro RTX 4000, 8192, 6843, 1349\n"
    "1, Quadro RTX 4000, 8192, 721, 7471\n"
)


def test_parse_returns_list_of_gpu_info():
    """poll() parses each CSV line into a GpuInfo with id/name/VRAM fields."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        gpus = monitor.poll()
        assert len(gpus) == 2
        assert gpus[0].gpu_id == 0
        assert gpus[0].name == "Quadro RTX 4000"
        assert gpus[0].vram_total_mb == 8192
        assert gpus[0].vram_used_mb == 6843
        assert gpus[0].vram_free_mb == 1349


def test_parse_second_gpu():
    """The second line of output maps to a second, independent GpuInfo."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        gpus = monitor.poll()
        assert gpus[1].gpu_id == 1
        assert gpus[1].vram_used_mb == 721
        assert gpus[1].vram_free_mb == 7471


def test_poll_returns_empty_list_when_nvidia_smi_unavailable():
    """A missing nvidia-smi binary (FileNotFoundError) yields [] rather than raising."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run", side_effect=FileNotFoundError):
        gpus = monitor.poll()
        assert gpus == []


def test_poll_returns_empty_list_on_nonzero_exit():
    """A non-zero exit code from nvidia-smi yields [] rather than raising."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 1
        mock_run.return_value.stdout = ""
        gpus = monitor.poll()
        assert gpus == []


def test_poll_skips_malformed_lines():
    """Lines with non-numeric fields are skipped; valid lines still parse."""
    monitor = GpuMonitor()
    malformed = "0, RTX 4000, 8192, not_a_number, 1024\n1, RTX 4000, 8192, 512, 7680\n"
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = malformed
        gpus = monitor.poll()
        # Only the well-formed second line survives.
        assert len(gpus) == 1
        assert gpus[0].gpu_id == 1
|
||||
|
|
@ -1,221 +0,0 @@
|
|||
"""Integration test: full lease → eviction → re-grant cycle.
|
||||
|
||||
Runs coordinator in-process (no subprocesses, no real nvidia-smi).
|
||||
Uses TestClient for HTTP, mocks AgentSupervisor to return fixed node state.
|
||||
"""
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def system():
|
||||
"""Create an in-process coordinator system with 8GB GPU and mock supervisor."""
|
||||
lease_manager = LeaseManager()
|
||||
lease_manager.register_gpu("local", 0, 8192)
|
||||
|
||||
mock_supervisor = MagicMock(spec=AgentSupervisor)
|
||||
mock_supervisor.all_nodes.return_value = [
|
||||
NodeInfo(
|
||||
node_id="local",
|
||||
agent_url="http://localhost:7701",
|
||||
gpus=[GpuInfo(
|
||||
gpu_id=0,
|
||||
name="RTX 4000",
|
||||
vram_total_mb=8192,
|
||||
vram_used_mb=0,
|
||||
vram_free_mb=8192,
|
||||
)],
|
||||
last_heartbeat=0.0,
|
||||
)
|
||||
]
|
||||
mock_supervisor.get_node_info.return_value = NodeInfo(
|
||||
node_id="local",
|
||||
agent_url="http://localhost:7701",
|
||||
gpus=[],
|
||||
last_heartbeat=0.0,
|
||||
)
|
||||
|
||||
profile_registry = ProfileRegistry()
|
||||
app = create_coordinator_app(
|
||||
lease_manager=lease_manager,
|
||||
profile_registry=profile_registry,
|
||||
agent_supervisor=mock_supervisor,
|
||||
service_registry=ServiceRegistry(),
|
||||
)
|
||||
client = TestClient(app)
|
||||
return client, lease_manager
|
||||
|
||||
|
||||
def test_full_lease_cycle(system):
|
||||
"""Test: grant, verify, release, verify gone."""
|
||||
client, _ = system
|
||||
|
||||
# Grant a lease
|
||||
resp = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 4096,
|
||||
"service": "peregrine",
|
||||
"priority": 1,
|
||||
})
|
||||
assert resp.status_code == 200
|
||||
lease_data = resp.json()["lease"]
|
||||
lease_id = lease_data["lease_id"]
|
||||
assert lease_data["mb_granted"] == 4096
|
||||
assert lease_data["holder_service"] == "peregrine"
|
||||
|
||||
# Verify it appears in active leases
|
||||
resp = client.get("/api/leases")
|
||||
assert resp.status_code == 200
|
||||
leases = resp.json()["leases"]
|
||||
assert any(l["lease_id"] == lease_id for l in leases)
|
||||
|
||||
# Release it
|
||||
resp = client.delete(f"/api/leases/{lease_id}")
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["released"] is True
|
||||
|
||||
# Verify it's gone
|
||||
resp = client.get("/api/leases")
|
||||
assert resp.status_code == 200
|
||||
leases = resp.json()["leases"]
|
||||
assert not any(l["lease_id"] == lease_id for l in leases)
|
||||
|
||||
|
||||
def test_vram_exhaustion_returns_503(system):
|
||||
"""Test: fill GPU, then request with no eviction candidates returns 503."""
|
||||
client, _ = system
|
||||
|
||||
# Fill GPU 0 with high-priority lease
|
||||
resp = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 8000,
|
||||
"service": "vllm",
|
||||
"priority": 1,
|
||||
})
|
||||
assert resp.status_code == 200
|
||||
|
||||
# Try to get more VRAM with same priority (no eviction candidates)
|
||||
resp = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 2000,
|
||||
"service": "kiwi",
|
||||
"priority": 1,
|
||||
})
|
||||
assert resp.status_code == 503
|
||||
assert "Insufficient VRAM" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_auto_detect_profile_for_8gb():
|
||||
"""Test: ProfileRegistry auto-detects single-gpu-8gb for 8GB GPU."""
|
||||
registry = ProfileRegistry()
|
||||
gpu = GpuInfo(
|
||||
gpu_id=0,
|
||||
name="RTX 4000",
|
||||
vram_total_mb=8192,
|
||||
vram_used_mb=0,
|
||||
vram_free_mb=8192,
|
||||
)
|
||||
profile = registry.auto_detect([gpu])
|
||||
assert profile.name == "single-gpu-8gb"
|
||||
# Verify profile has services configured
|
||||
assert hasattr(profile, "services")
|
||||
|
||||
|
||||
def test_node_endpoint_shows_nodes(system):
|
||||
"""Test: GET /api/nodes returns the mocked node."""
|
||||
client, _ = system
|
||||
resp = client.get("/api/nodes")
|
||||
assert resp.status_code == 200
|
||||
nodes = resp.json()["nodes"]
|
||||
assert len(nodes) == 1
|
||||
assert nodes[0]["node_id"] == "local"
|
||||
assert nodes[0]["agent_url"] == "http://localhost:7701"
|
||||
assert len(nodes[0]["gpus"]) == 1
|
||||
assert nodes[0]["gpus"][0]["name"] == "RTX 4000"
|
||||
|
||||
|
||||
def test_profiles_endpoint_returns_public_profiles(system):
|
||||
"""Test: GET /api/profiles returns standard public profiles."""
|
||||
client, _ = system
|
||||
resp = client.get("/api/profiles")
|
||||
assert resp.status_code == 200
|
||||
profiles = resp.json()["profiles"]
|
||||
names = [p["name"] for p in profiles]
|
||||
# Verify common public profiles are present
|
||||
assert "single-gpu-8gb" in names
|
||||
assert "single-gpu-6gb" in names
|
||||
assert "single-gpu-2gb" in names
|
||||
|
||||
|
||||
def test_multiple_leases_tracked_independently(system):
|
||||
"""Test: multiple active leases are tracked correctly."""
|
||||
client, _ = system
|
||||
|
||||
# Grant lease 1
|
||||
resp1 = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 2048,
|
||||
"service": "peregrine",
|
||||
"priority": 2,
|
||||
})
|
||||
assert resp1.status_code == 200
|
||||
lease1_id = resp1.json()["lease"]["lease_id"]
|
||||
|
||||
# Grant lease 2
|
||||
resp2 = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 2048,
|
||||
"service": "kiwi",
|
||||
"priority": 2,
|
||||
})
|
||||
assert resp2.status_code == 200
|
||||
lease2_id = resp2.json()["lease"]["lease_id"]
|
||||
|
||||
# Both should be in active leases
|
||||
resp = client.get("/api/leases")
|
||||
leases = resp.json()["leases"]
|
||||
lease_ids = [l["lease_id"] for l in leases]
|
||||
assert lease1_id in lease_ids
|
||||
assert lease2_id in lease_ids
|
||||
assert len(leases) == 2
|
||||
|
||||
# Release lease 1
|
||||
resp = client.delete(f"/api/leases/{lease1_id}")
|
||||
assert resp.status_code == 200
|
||||
|
||||
# Only lease 2 should remain
|
||||
resp = client.get("/api/leases")
|
||||
leases = resp.json()["leases"]
|
||||
lease_ids = [l["lease_id"] for l in leases]
|
||||
assert lease1_id not in lease_ids
|
||||
assert lease2_id in lease_ids
|
||||
assert len(leases) == 1
|
||||
|
||||
|
||||
def test_delete_nonexistent_lease_returns_404(system):
|
||||
"""Test: deleting a nonexistent lease returns 404."""
|
||||
client, _ = system
|
||||
resp = client.delete("/api/leases/nonexistent-lease-id")
|
||||
assert resp.status_code == 404
|
||||
assert "not found" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_health_endpoint_returns_ok(system):
|
||||
"""Test: GET /api/health returns status ok."""
|
||||
client, _ = system
|
||||
resp = client.get("/api/health")
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["status"] == "ok"
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
import pytest
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager


@pytest.fixture
def mgr():
    """A LeaseManager with one 8192 MB GPU registered on node 'heimdall'."""
    m = LeaseManager()
    m.register_gpu(node_id="heimdall", gpu_id=0, total_mb=8192)
    return m


@pytest.mark.asyncio
async def test_grant_succeeds_when_vram_available(mgr):
    """try_grant succeeds and records node/gpu/size when VRAM is free."""
    lease = await mgr.try_grant(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="peregrine", priority=1
    )
    assert lease is not None
    assert lease.mb_granted == 4096
    assert lease.node_id == "heimdall"
    assert lease.gpu_id == 0


@pytest.mark.asyncio
async def test_grant_fails_when_vram_insufficient(mgr):
    """try_grant returns None when the request would exceed total VRAM."""
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                        service="vllm", priority=1)
    # 7000 + 2000 > 8192, so the second grant must fail.
    lease = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=2000,
                                service="kiwi", priority=2)
    assert lease is None


@pytest.mark.asyncio
async def test_release_frees_vram(mgr):
    """Releasing a lease makes its VRAM grantable again."""
    lease = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                                service="vllm", priority=1)
    assert lease is not None
    released = await mgr.release(lease.lease_id)
    assert released is True
    # The same 7000 MB now fits again.
    lease2 = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                                 service="comfyui", priority=4)
    assert lease2 is not None


@pytest.mark.asyncio
async def test_release_unknown_lease_returns_false(mgr):
    """Releasing an id that was never granted reports False, not an error."""
    result = await mgr.release("nonexistent-id")
    assert result is False


@pytest.mark.asyncio
async def test_get_eviction_candidates_returns_lower_priority_leases(mgr):
    """Only leases with a larger priority number (less urgent) than the
    requester's are eviction candidates: the priority-4 comfyui lease
    qualifies against a priority-2 request, the priority-1 ollama lease
    does not."""
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=3000,
                        service="comfyui", priority=4)
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=2000,
                        service="ollama", priority=1)
    candidates = mgr.get_eviction_candidates(
        node_id="heimdall", gpu_id=0,
        needed_mb=3000, requester_priority=2
    )
    assert len(candidates) == 1
    assert candidates[0].holder_service == "comfyui"


@pytest.mark.asyncio
async def test_list_leases_for_gpu(mgr):
    """list_leases returns every active lease on the given node/GPU."""
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=1024,
                        service="peregrine", priority=1)
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=512,
                        service="kiwi", priority=2)
    leases = mgr.list_leases(node_id="heimdall", gpu_id=0)
    assert len(leases) == 2


def test_register_gpu_sets_total(mgr):
    """register_gpu records the GPU's total VRAM capacity."""
    assert mgr.gpu_total_mb("heimdall", 0) == 8192


@pytest.mark.asyncio
async def test_used_mb_tracks_grants():
    """used_mb reports the sum of all granted lease sizes."""
    mgr = LeaseManager()
    mgr.register_gpu("heimdall", 0, 8192)
    await mgr.try_grant("heimdall", 0, 3000, "a", 1)
    await mgr.try_grant("heimdall", 0, 2000, "b", 2)
    assert mgr.used_mb("heimdall", 0) == 5000
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
import time
import pytest
from circuitforge_core.resources.models import VRAMLease, GpuInfo, NodeInfo


def _make_lease(mb, service, priority, **extra):
    """Shorthand for VRAMLease.create on node 'heimdall', GPU 0."""
    return VRAMLease.create(gpu_id=0, node_id="heimdall", mb=mb,
                            service=service, priority=priority, **extra)


def test_vram_lease_create_assigns_unique_ids():
    """Two leases created with identical parameters still get distinct ids."""
    first = _make_lease(4096, "peregrine", 1)
    second = _make_lease(4096, "peregrine", 1)
    assert first.lease_id != second.lease_id


def test_vram_lease_create_with_ttl_sets_expiry():
    """Passing ttl_s yields expires_at bracketed by now + ttl at call time."""
    start = time.time()
    lease = _make_lease(2048, "kiwi", 2, ttl_s=60.0)
    end = time.time()
    assert start + 60.0 <= lease.expires_at <= end + 60.0


def test_vram_lease_create_no_ttl_has_zero_expiry():
    """Without a TTL, expires_at is the 0.0 sentinel."""
    lease = _make_lease(1024, "snipe", 2)
    assert lease.expires_at == 0.0


def test_vram_lease_is_immutable():
    """Assigning a lease field raises (frozen/read-only model; either
    AttributeError or TypeError is acceptable)."""
    lease = _make_lease(1024, "snipe", 2)
    with pytest.raises((AttributeError, TypeError)):
        lease.mb_granted = 999  # type: ignore


def test_gpu_info_fields():
    """GpuInfo exposes the VRAM accounting values it was constructed with."""
    info = GpuInfo(gpu_id=0, name="RTX 4000", vram_total_mb=8192,
                   vram_used_mb=2048, vram_free_mb=6144)
    assert info.vram_free_mb == 6144


def test_node_info_fields():
    """NodeInfo carries its node id and the list of attached GPUs."""
    gpu = GpuInfo(gpu_id=0, name="RTX 4000", vram_total_mb=8192,
                  vram_used_mb=0, vram_free_mb=8192)
    node = NodeInfo(node_id="heimdall", agent_url="http://localhost:7701",
                    gpus=[gpu], last_heartbeat=time.time())
    assert node.node_id == "heimdall"
    assert len(node.gpus) == 1
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
import pytest
from circuitforge_core.resources.coordinator.node_selector import select_node
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
from circuitforge_core.resources.models import GpuInfo
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry


def _make_agent(node_id: str, free_mb: int, online: bool = True) -> AgentRecord:
    """Build an AgentRecord with one 8192 MB GPU having `free_mb` free VRAM."""
    r = AgentRecord(node_id=node_id, agent_url=f"http://{node_id}:7701")
    r.gpus = [GpuInfo(gpu_id=0, name="RTX", vram_total_mb=8192,
                      vram_used_mb=8192 - free_mb, vram_free_mb=free_mb)]
    r.online = online
    return r


def test_selects_node_with_most_free_vram():
    """With no warm residency, the node with the most free VRAM wins."""
    agents = {
        "a": _make_agent("a", free_mb=2000),
        "b": _make_agent("b", free_mb=6000),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys=set())
    assert result == ("b", 0)


def test_prefers_warm_node_even_with_less_free_vram():
    """A node where the service is already resident ('warm') beats a colder
    node with more free VRAM."""
    agents = {
        "a": _make_agent("a", free_mb=2000),
        "b": _make_agent("b", free_mb=6000),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys={"a:vllm"})
    assert result == ("a", 0)


def test_excludes_offline_nodes():
    """Offline nodes are never selected, regardless of free VRAM."""
    agents = {
        "a": _make_agent("a", free_mb=8000, online=False),
        "b": _make_agent("b", free_mb=2000, online=True),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys=set())
    assert result == ("b", 0)


def test_returns_none_when_no_node_has_profile_for_service():
    """An unknown service name yields None (no node can host it)."""
    agents = {"a": _make_agent("a", free_mb=8000)}
    registry = ProfileRegistry()
    result = select_node(agents, "cf-nonexistent-service", registry, resident_keys=set())
    assert result is None


def test_returns_none_when_no_agents():
    """With an empty agent map there is nothing to select."""
    registry = ProfileRegistry()
    result = select_node({}, "vllm", registry, resident_keys=set())
    assert result is None


def test_prefers_node_that_fully_fits_service_over_one_that_does_not():
    """can_fit requires free_mb >= service max_mb (full ceiling, not half).

    9500 MB guarantees above all profile ceilings (max is 9000); 1000 MB is below all.
    """
    agents = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=9500),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys=set())
    # "b" is the only node in the preferred (can_fit) pool
    assert result == ("b", 0)


def test_falls_back_to_best_effort_when_no_node_fully_fits():
    """When nothing can_fit, select_node returns the best-VRAM node as fallback."""
    agents = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=2000),
    }
    registry = ProfileRegistry()
    # Neither has enough free VRAM; fallback picks highest effective_free_mb
    result = select_node(agents, "vllm", registry, resident_keys=set())
    assert result == ("b", 0)
|
||||
|
|
@ -1,87 +0,0 @@
|
|||
# tests/test_resources/test_node_store.py
|
||||
"""Unit tests for NodeStore — SQLite persistence layer for known agent nodes."""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from circuitforge_core.resources.coordinator.node_store import NodeStore
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def store(tmp_path: Path) -> NodeStore:
|
||||
return NodeStore(db_path=tmp_path / "test-nodes.db")
|
||||
|
||||
|
||||
def test_upsert_and_all(store: NodeStore) -> None:
|
||||
store.upsert("heimdall", "http://127.0.0.1:7701")
|
||||
rows = store.all()
|
||||
assert len(rows) == 1
|
||||
assert rows[0] == ("heimdall", "http://127.0.0.1:7701")
|
||||
|
||||
|
||||
def test_upsert_updates_url(store: NodeStore) -> None:
|
||||
store.upsert("navi", "http://10.1.10.10:7701")
|
||||
store.upsert("navi", "http://10.1.10.10:7702")
|
||||
rows = store.all()
|
||||
assert len(rows) == 1
|
||||
assert rows[0][1] == "http://10.1.10.10:7702"
|
||||
|
||||
|
||||
def test_multiple_nodes(store: NodeStore) -> None:
|
||||
store.upsert("heimdall", "http://127.0.0.1:7701")
|
||||
store.upsert("navi", "http://10.1.10.10:7701")
|
||||
store.upsert("strahl", "http://10.1.10.20:7701")
|
||||
assert len(store.all()) == 3
|
||||
|
||||
|
||||
def test_remove(store: NodeStore) -> None:
|
||||
store.upsert("heimdall", "http://127.0.0.1:7701")
|
||||
store.upsert("navi", "http://10.1.10.10:7701")
|
||||
store.remove("navi")
|
||||
ids = [r[0] for r in store.all()]
|
||||
assert "navi" not in ids
|
||||
assert "heimdall" in ids
|
||||
|
||||
|
||||
def test_prune_stale_removes_old_entries(store: NodeStore) -> None:
|
||||
# Insert a node with a last_seen in the distant past
|
||||
store._conn.execute(
|
||||
"INSERT INTO known_nodes (node_id, agent_url, last_seen) VALUES (?, ?, ?)",
|
||||
("ghost", "http://dead:7701", time.time() - 40 * 86400),
|
||||
)
|
||||
store._conn.commit()
|
||||
store.upsert("live", "http://live:7701")
|
||||
|
||||
removed = store.prune_stale(max_age_days=30)
|
||||
assert removed == 1
|
||||
ids = [r[0] for r in store.all()]
|
||||
assert "ghost" not in ids
|
||||
assert "live" in ids
|
||||
|
||||
|
||||
def test_prune_stale_keeps_recent(store: NodeStore) -> None:
|
||||
store.upsert("recent", "http://recent:7701")
|
||||
removed = store.prune_stale(max_age_days=30)
|
||||
assert removed == 0
|
||||
assert len(store.all()) == 1
|
||||
|
||||
|
||||
def test_all_empty(store: NodeStore) -> None:
|
||||
assert store.all() == []
|
||||
|
||||
|
||||
def test_db_persists_across_instances(tmp_path: Path) -> None:
|
||||
"""Data written by one NodeStore instance is visible to a new one on the same file."""
|
||||
db = tmp_path / "shared.db"
|
||||
s1 = NodeStore(db_path=db)
|
||||
s1.upsert("navi", "http://10.1.10.10:7701")
|
||||
s1.close()
|
||||
|
||||
s2 = NodeStore(db_path=db)
|
||||
rows = s2.all()
|
||||
assert len(rows) == 1
|
||||
assert rows[0][0] == "navi"
|
||||
s2.close()
|
||||
|
|
@ -1,176 +0,0 @@
|
|||
# tests/test_resources/test_ollama_adopt.py
|
||||
"""
|
||||
Tests for the Ollama adopt-if-running path:
|
||||
- ProcessSpec: adopt and health_path fields parsed from YAML
|
||||
- ServiceManager.start(): adopt path claims running service; falls through if not running
|
||||
- ServiceManager.is_running(): adopt path uses health probe, not proc table
|
||||
- ServiceInstance.health_path persists through upsert_instance
|
||||
- Probe loop uses inst.health_path instead of hardcoded /health
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.profiles.schema import GpuProfile, ProcessSpec, ServiceProfile, load_profile
|
||||
|
||||
|
||||
# ── ProcessSpec schema ────────────────────────────────────────────────────────
|
||||
|
||||
def test_process_spec_defaults():
|
||||
spec = ProcessSpec(exec_path="/usr/local/bin/ollama")
|
||||
assert spec.adopt is False
|
||||
assert spec.health_path == "/health"
|
||||
|
||||
|
||||
def test_process_spec_adopt_fields():
|
||||
spec = ProcessSpec(
|
||||
exec_path="/usr/local/bin/ollama",
|
||||
adopt=True,
|
||||
health_path="/api/tags",
|
||||
port=11434,
|
||||
host_port=11434,
|
||||
)
|
||||
assert spec.adopt is True
|
||||
assert spec.health_path == "/api/tags"
|
||||
|
||||
|
||||
def test_profile_yaml_parses_adopt(tmp_path: Path):
|
||||
yaml_text = """\
|
||||
schema_version: 1
|
||||
name: test
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 4096
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: /usr/local/bin/ollama
|
||||
args_template: serve
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
"""
|
||||
p = tmp_path / "profile.yaml"
|
||||
p.write_text(yaml_text)
|
||||
profile = load_profile(p)
|
||||
spec = profile.services["ollama"].managed
|
||||
assert isinstance(spec, ProcessSpec)
|
||||
assert spec.adopt is True
|
||||
assert spec.health_path == "/api/tags"
|
||||
assert spec.host_port == 11434
|
||||
|
||||
|
||||
# ── ServiceManager adopt path ─────────────────────────────────────────────────
|
||||
|
||||
def _make_manager_with_ollama(advertise_host: str = "127.0.0.1") -> ServiceManager:
    """Build a ServiceManager whose only service is an adoptable Ollama."""
    ollama_spec = ProcessSpec(
        exec_path="/usr/local/bin/ollama",
        args_template="serve",
        port=11434,
        host_port=11434,
        adopt=True,
        health_path="/api/tags",
    )
    profile = GpuProfile(
        schema_version=1,
        name="test",
        services={
            "ollama": ServiceProfile(max_mb=4096, priority=1, managed=ollama_spec),
        },
    )
    return ServiceManager(node_id="heimdall", profile=profile, advertise_host=advertise_host)
|
||||
|
||||
|
||||
def test_start_adopt_claims_running_service():
    """When Ollama is already healthy, start() returns its URL without spawning a process."""
    mgr = _make_manager_with_ollama()
    with patch.object(mgr, "_probe_health", return_value=True) as probe:
        result = mgr.start("ollama", gpu_id=0, params={})
        assert result == "http://127.0.0.1:11434"
        probe.assert_called_once_with(11434, "/api/tags")
        # Adoption must not create a subprocess entry.
        assert "ollama" not in mgr._procs


def test_start_adopt_spawns_when_not_running():
    """When the health probe fails, start() falls back to spawning Ollama."""
    mgr = _make_manager_with_ollama()
    fake_proc = MagicMock()
    fake_proc.poll.return_value = None  # process stays alive

    with patch.object(mgr, "_probe_health", return_value=False), \
            patch("subprocess.Popen", return_value=fake_proc) as popen:
        result = mgr.start("ollama", gpu_id=0, params={})

        assert result == "http://127.0.0.1:11434"
        popen.assert_called_once()
        assert "ollama" in mgr._procs
|
||||
|
||||
|
||||
def test_is_running_adopt_uses_health_probe():
    """is_running() for adopt=True services mirrors the health endpoint, not the proc table."""
    mgr = _make_manager_with_ollama()
    for healthy in (True, False):
        with patch.object(mgr, "_probe_health", return_value=healthy):
            assert mgr.is_running("ollama") is healthy
|
||||
|
||||
|
||||
def test_probe_health_returns_true_on_200():
    """An HTTP 200 from the health endpoint means the service is up."""
    mgr = _make_manager_with_ollama()
    response = MagicMock()
    response.status = 200
    # Make the mock usable as a context manager, like a real urlopen result.
    response.__enter__ = lambda s: response
    response.__exit__ = MagicMock(return_value=False)

    with patch("urllib.request.urlopen", return_value=response):
        assert mgr._probe_health(11434, "/api/tags") is True


def test_probe_health_returns_false_on_connection_error():
    """A refused connection is reported as unhealthy rather than raised."""
    mgr = _make_manager_with_ollama()
    with patch("urllib.request.urlopen", side_effect=OSError("refused")):
        assert mgr._probe_health(11434, "/api/tags") is False
|
||||
|
||||
|
||||
# ── ServiceRegistry health_path ───────────────────────────────────────────────
|
||||
|
||||
def test_upsert_instance_stores_health_path():
    """A health_path supplied to upsert_instance() persists on the instance."""
    registry = ServiceRegistry()
    instance = registry.upsert_instance(
        service="ollama",
        node_id="heimdall",
        gpu_id=0,
        state="running",
        model=None,
        url="http://127.0.0.1:11434",
        health_path="/api/tags",
    )
    assert instance.health_path == "/api/tags"


def test_upsert_instance_default_health_path():
    """When omitted, health_path falls back to /health."""
    registry = ServiceRegistry()
    instance = registry.upsert_instance(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        state="starting",
        model="qwen",
        url="http://127.0.0.1:8000",
    )
    assert instance.health_path == "/health"
|
||||
|
||||
|
||||
def test_all_gpu_profiles_have_ollama_managed_block():
    """Every public GPU profile that defines ollama must adopt it via /api/tags."""
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry

    for profile in ProfileRegistry().list_public():
        service = profile.services.get("ollama")
        if service is None:
            # Not every profile ships ollama; only validate those that do.
            continue
        managed = service.managed
        assert managed is not None, f"{profile.name}: ollama missing managed block"
        assert isinstance(managed, ProcessSpec)
        assert managed.adopt is True, f"{profile.name}: ollama adopt should be True"
        assert managed.health_path == "/api/tags", f"{profile.name}: wrong health_path"
|
||||
|
|
@ -1,101 +0,0 @@
|
|||
# tests/test_resources/test_profile_registry.py
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from circuitforge_core.resources.profiles.schema import (
|
||||
GpuProfile, ServiceProfile, load_profile
|
||||
)
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
|
||||
|
||||
def test_load_8gb_profile(tmp_path):
    """load_profile() parses a full 8 GB profile, including shared services."""
    profile_file = tmp_path / "test.yaml"
    profile_file.write_text("""
schema_version: 1
name: single-gpu-8gb
vram_total_mb: 8192
eviction_timeout_s: 10.0
services:
  vllm:
    max_mb: 5120
    priority: 1
  cf-vision:
    max_mb: 2048
    priority: 2
    shared: true
    max_concurrent: 3
""")

    profile = load_profile(profile_file)

    assert profile.name == "single-gpu-8gb"
    assert profile.schema_version == 1
    assert profile.vram_total_mb == 8192
    assert profile.eviction_timeout_s == 10.0

    assert "vllm" in profile.services
    vllm = profile.services["vllm"]
    assert vllm.max_mb == 5120
    assert vllm.priority == 1

    vision = profile.services["cf-vision"]
    assert vision.shared is True
    assert vision.max_concurrent == 3
|
||||
|
||||
|
||||
def test_load_profile_rejects_wrong_schema_version(tmp_path):
    """Unknown schema versions are rejected with a ValueError naming the field."""
    bad_file = tmp_path / "future.yaml"
    bad_file.write_text("schema_version: 99\nname: future\n")
    with pytest.raises(ValueError, match="schema_version"):
        load_profile(bad_file)


def test_service_profile_defaults():
    """ServiceProfile defaults: exclusive, single-slot, on-demand, no backend/consumers."""
    service = ServiceProfile(max_mb=1024, priority=2)
    assert service.shared is False
    assert service.max_concurrent == 1
    assert service.always_on is False
    assert service.backend is None
    assert service.consumers == []
|
||||
|
||||
|
||||
def test_profile_registry_loads_public_profiles():
    """The bundled public profiles include the 2/6/8 GB single-GPU tiers."""
    public_names = {p.name for p in ProfileRegistry().list_public()}
    assert "single-gpu-8gb" in public_names
    assert "single-gpu-6gb" in public_names
    assert "single-gpu-2gb" in public_names
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("vram_total_mb", "expected_profile"),
    [
        (8192, "single-gpu-8gb"),
        (6144, "single-gpu-6gb"),
        (2048, "single-gpu-2gb"),
    ],
)
def test_profile_registry_auto_detect_selects_tier(vram_total_mb, expected_profile):
    """auto_detect() maps a single GPU's VRAM size to the matching public profile.

    Consolidates three copy-pasted per-tier tests (8 GB / 6 GB / 2 GB) into one
    parametrized case so new tiers need only a new parameter row.
    """
    registry = ProfileRegistry()
    gpus = [MagicMock(vram_total_mb=vram_total_mb)]
    assert registry.auto_detect(gpus).name == expected_profile
|
||||
|
||||
|
||||
def test_profile_registry_load_from_path(tmp_path):
    """Registry.load() reads an arbitrary profile file, not just bundled ones."""
    custom_file = tmp_path / "custom.yaml"
    custom_file.write_text(
        "schema_version: 1\nname: custom\n"
        "vram_total_mb: 12288\neviction_timeout_s: 5.0\n"
    )

    loaded = ProfileRegistry().load(custom_file)

    assert loaded.name == "custom"
    assert loaded.vram_total_mb == 12288
|
||||
|
|
@ -1,194 +0,0 @@
|
|||
"""Tests for ServiceManager ProcessSpec support."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
||||
from circuitforge_core.resources.profiles.schema import (
|
||||
GpuProfile,
|
||||
ProcessSpec,
|
||||
ServiceProfile,
|
||||
)
|
||||
|
||||
|
||||
def _make_profile(args_template: str = "--port {port} --gpu-id {gpu_id}") -> GpuProfile:
    """Two-service test profile: a managed vllm process plus a spec-less service."""
    vllm_spec = ProcessSpec(
        exec_path="/usr/bin/python",
        args_template=args_template,
        port=8000,
        host_port=8000,
        cwd="/tmp",
    )
    return GpuProfile(
        schema_version=1,
        name="test",
        vram_total_mb=8192,
        services={
            "vllm": ServiceProfile(max_mb=5120, priority=1, managed=vllm_spec),
            "no_managed": ServiceProfile(max_mb=1024, priority=2),
        },
    )
|
||||
|
||||
|
||||
@pytest.fixture
def manager():
    """Fresh ServiceManager bound to loopback for each test."""
    return ServiceManager(node_id="test-node", profile=_make_profile(), advertise_host="127.0.0.1")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# is_running
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_is_running_returns_false_when_no_proc(manager):
    """Nothing tracked for the service -> not running."""
    assert manager.is_running("vllm") is False


def test_is_running_returns_false_when_proc_exited(manager):
    """A tracked process that has exited (poll() returns a code) is not running."""
    dead_proc = MagicMock()
    dead_proc.poll.return_value = 1  # non-None: process exited
    manager._procs["vllm"] = dead_proc
    assert manager.is_running("vllm") is False


def test_is_running_returns_false_when_port_not_listening(manager):
    """A live process whose port refuses connections is still not running."""
    live_proc = MagicMock()
    live_proc.poll.return_value = None  # still alive
    manager._procs["vllm"] = live_proc

    with patch("socket.create_connection", side_effect=OSError("refused")):
        assert manager.is_running("vllm") is False
|
||||
|
||||
|
||||
def test_is_running_returns_true_when_proc_alive_and_port_open(manager):
    """Running means: tracked process alive AND its port accepts connections."""
    live_proc = MagicMock()
    live_proc.poll.return_value = None  # still alive
    manager._procs["vllm"] = live_proc

    conn = MagicMock()
    conn.__enter__ = MagicMock(return_value=conn)
    conn.__exit__ = MagicMock(return_value=False)
    with patch("socket.create_connection", return_value=conn):
        assert manager.is_running("vllm") is True


def test_is_running_unknown_service_returns_false(manager):
    """Unknown service names are simply reported as not running."""
    assert manager.is_running("nonexistent") is False


def test_is_running_no_managed_spec_returns_false(manager):
    """Services without a managed ProcessSpec are never reported as running."""
    assert manager.is_running("no_managed") is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# start
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_start_launches_process_and_returns_url(manager):
    """start() renders the args template and spawns the executable."""
    with patch("subprocess.Popen") as popen, \
            patch.object(manager, "is_running", return_value=False):
        popen.return_value = MagicMock()
        url = manager.start("vllm", gpu_id=0, params={"model": "mymodel"})

    assert url == "http://127.0.0.1:8000"
    popen.assert_called_once()
    argv = popen.call_args[0][0]
    assert argv[0] == "/usr/bin/python"
    # Both template placeholders must have been substituted into the argv.
    assert "--port" in argv
    assert "8000" in argv
    assert "--gpu-id" in argv
    assert "0" in argv
|
||||
|
||||
|
||||
def test_start_returns_url_immediately_when_already_running(manager):
    """start() is idempotent: a running service is not spawned again."""
    with patch.object(manager, "is_running", return_value=True), \
            patch("subprocess.Popen") as popen:
        url = manager.start("vllm", gpu_id=0, params={})

    assert url == "http://127.0.0.1:8000"
    popen.assert_not_called()


def test_start_raises_for_unknown_service(manager):
    """Starting a service absent from the profile is a caller error."""
    with pytest.raises(ValueError, match="not in profile"):
        manager.start("nonexistent", gpu_id=0, params={})


def test_start_stores_proc_in_procs(manager):
    """The spawned Popen handle is tracked for later stop()/is_running()."""
    spawned = MagicMock()
    with patch("subprocess.Popen", return_value=spawned), \
            patch.object(manager, "is_running", return_value=False):
        manager.start("vllm", gpu_id=0, params={})

    assert manager._procs["vllm"] is spawned
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# stop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_stop_terminates_running_process(manager):
    """stop() terminates, waits for exit, and drops the tracked handle."""
    tracked = MagicMock()
    manager._procs["vllm"] = tracked

    assert manager.stop("vllm") is True
    tracked.terminate.assert_called_once()
    tracked.wait.assert_called_once()
    assert "vllm" not in manager._procs


def test_stop_kills_process_that_wont_terminate(manager):
    """If wait() fails (e.g. a timeout), stop() escalates to kill()."""
    stubborn = MagicMock()
    stubborn.wait.side_effect = Exception("timeout")
    manager._procs["vllm"] = stubborn

    assert manager.stop("vllm") is True
    stubborn.kill.assert_called_once()
|
||||
|
||||
|
||||
def test_stop_returns_true_when_no_proc_tracked(manager):
    """Stopping a known service with no tracked proc is an idempotent no-op."""
    assert manager.stop("vllm") is True


def test_stop_returns_false_for_unknown_service(manager):
    """Stopping a service not in the profile reports failure."""
    assert manager.stop("nonexistent") is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# list_running / get_url
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_list_running_returns_running_services(manager):
    """list_running() filters the profile's services through is_running()."""
    with patch.object(manager, "is_running", side_effect=lambda svc: svc == "vllm"):
        assert manager.list_running() == ["vllm"]


def test_get_url_returns_none_when_not_running(manager):
    """No URL is advertised for a stopped service."""
    with patch.object(manager, "is_running", return_value=False):
        assert manager.get_url("vllm") is None


def test_get_url_returns_url_when_running(manager):
    """A running service advertises host:port from its ProcessSpec."""
    with patch.object(manager, "is_running", return_value=True):
        assert manager.get_url("vllm") == "http://127.0.0.1:8000"
|
||||
|
|
@ -1,86 +0,0 @@
|
|||
import time
|
||||
import dataclasses
|
||||
import pytest
|
||||
from circuitforge_core.resources.coordinator.service_registry import (
|
||||
ServiceRegistry, ServiceAllocation, ServiceInstance,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def registry():
    """Fresh, empty ServiceRegistry per test."""
    return ServiceRegistry()
|
||||
|
||||
|
||||
def test_allocate_creates_allocation(registry):
    """allocate() returns a populated allocation carrying a fresh id."""
    allocation = registry.allocate(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        model="Ouro-1.4B",
        url="http://heimdall:8000",
        caller="test",
        ttl_s=300.0,
    )
    assert allocation.service == "vllm"
    assert allocation.node_id == "heimdall"
    assert allocation.allocation_id  # non-empty UUID string
|
||||
|
||||
|
||||
def test_active_allocations_count(registry):
    """Each allocate() call adds one to the (service, node, gpu) count."""
    for caller in ("a", "b"):
        registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", caller, 300.0)
    assert registry.active_allocations("vllm", "heimdall", 0) == 2


def test_release_decrements_count(registry):
    """Releasing the only allocation brings the count back to zero."""
    allocation = registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "a", 300.0)
    registry.release(allocation.allocation_id)
    assert registry.active_allocations("vllm", "heimdall", 0) == 0


def test_release_nonexistent_returns_false(registry):
    """Releasing an unknown allocation id is reported, not raised."""
    assert registry.release("nonexistent-id") is False
|
||||
|
||||
|
||||
def test_upsert_instance_sets_running_state(registry):
    """upsert_instance() registers exactly one instance in the given state."""
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="Ouro-1.4B", url="http://heimdall:8000")
    instances = registry.all_instances()
    assert len(instances) == 1
    assert instances[0].state == "running"


def test_release_last_alloc_marks_instance_idle(registry):
    """When the final allocation is released, the instance transitions to idle."""
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="Ouro-1.4B", url="http://heimdall:8000")
    only_alloc = registry.allocate("vllm", "heimdall", 0, "Ouro-1.4B",
                                   "http://heimdall:8000", "a", 300.0)
    registry.release(only_alloc.allocation_id)

    instance = registry.all_instances()[0]
    assert instance.state == "idle"
    assert instance.idle_since is not None


def test_new_alloc_on_idle_instance_marks_it_running(registry):
    """An allocation against an idle instance wakes it back to 'running'."""
    registry.upsert_instance("vllm", "heimdall", 0, state="idle",
                             model="M", url="http://h:8000")
    registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "x", 300.0)
    assert registry.all_instances()[0].state == "running"
|
||||
|
||||
|
||||
def test_sweep_expired_allocations(registry):
    """sweep_expired_allocations() drops TTL-expired allocations and idles the instance.

    Uses a sub-second TTL (0.05 s) so the test does not stall the suite;
    the original slept 1.1 s of real wall-clock time against a 1 s TTL.
    """
    # Register a running instance so idle-transition logic has something to act on.
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="M", url="http://h:8000")
    # Create an allocation with a very short TTL.
    alloc = registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000",
                              "caller", ttl_s=0.05)
    assert registry.active_allocations("vllm", "heimdall", 0) == 1

    # Wait (briefly) for the TTL to elapse.
    time.sleep(0.1)

    expired = registry.sweep_expired_allocations()

    # The allocation should have been swept.
    assert alloc.allocation_id in expired
    assert registry.active_allocations("vllm", "heimdall", 0) == 0

    # With no allocations remaining, the instance transitions to idle.
    instance = registry.all_instances()[0]
    assert instance.state == "idle"
    assert instance.idle_since is not None
|
||||
Loading…
Reference in a new issue