feat!: strip resources/ from MIT core — moves to circuitforge-orch (v0.8.0)
BREAKING CHANGE: circuitforge_core.resources is no longer available. Import CFOrchClient from circuitforge_orch.client instead. cf-orch CLI entry point is now in the circuitforge-orch package.
This commit is contained in:
parent
2259382d0b
commit
c244260d1c
63 changed files with 34 additions and 6571 deletions
22
README.md
22
README.md
|
|
@ -2,15 +2,29 @@
|
||||||
|
|
||||||
Shared scaffold for CircuitForge products.
|
Shared scaffold for CircuitForge products.
|
||||||
|
|
||||||
|
**Current version: 0.8.0**
|
||||||
|
|
||||||
## Modules
|
## Modules
|
||||||
|
|
||||||
|
### Implemented
|
||||||
|
|
||||||
- `circuitforge_core.db` — SQLite connection factory and migration runner
|
- `circuitforge_core.db` — SQLite connection factory and migration runner
|
||||||
- `circuitforge_core.llm` — LLM router with fallback chain
|
- `circuitforge_core.llm` — LLM router with fallback chain (Ollama, vLLM, Anthropic, OpenAI-compatible)
|
||||||
- `circuitforge_core.tiers` — Tier system with BYOK and local vision unlocks
|
- `circuitforge_core.tiers` — Tier system with BYOK and local vision unlocks
|
||||||
- `circuitforge_core.config` — Env validation and .env loader
|
- `circuitforge_core.config` — Env validation and .env loader
|
||||||
- `circuitforge_core.vision` — Vision router stub (v0.2+)
|
- `circuitforge_core.hardware` — Hardware detection and LLM backend profile generation (VRAM tiers, GPU/CPU auto-select)
|
||||||
- `circuitforge_core.wizard` — First-run wizard base class stub
|
- `circuitforge_core.documents` — Document ingestion pipeline: PDF, DOCX, and image OCR → `StructuredDocument`
|
||||||
- `circuitforge_core.pipeline` — Staging queue stub (v0.2+)
|
- `circuitforge_core.affiliates` — Affiliate URL wrapping with opt-out, BYOK user IDs, and CF env-var fallback (`wrap_url`)
|
||||||
|
- `circuitforge_core.preferences` — User preference store (local YAML file, pluggable backend); dot-path get/set API
|
||||||
|
- `circuitforge_core.tasks` — VRAM-aware LLM task scheduler; shared slot manager across services (`TaskScheduler`)
|
||||||
|
- `circuitforge_core.manage` — Cross-platform product process manager (Docker and native modes)
|
||||||
|
- `circuitforge_core.resources` — **moved to the `circuitforge-orch` package in v0.8.0**; import `CFOrchClient` from `circuitforge_orch.client` instead
|
||||||
|
|
||||||
|
### Stubs (in-tree, not yet implemented)
|
||||||
|
|
||||||
|
- `circuitforge_core.vision` — Vision router base class (planned: moondream2 / Claude vision dispatch)
|
||||||
|
- `circuitforge_core.wizard` — First-run wizard base class (products subclass `BaseWizard`)
|
||||||
|
- `circuitforge_core.pipeline` — Staging queue base (`StagingDB`; products provide concrete schema)
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1 +1 @@
|
||||||
__version__ = "0.7.0"
|
__version__ = "0.8.0"
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,12 @@ def _build_ebay_url(url: str, affiliate_id: str) -> str:
|
||||||
return f"{url}{sep}{params}"
|
return f"{url}{sep}{params}"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_instacart_url(url: str, affiliate_id: str) -> str:
|
||||||
|
"""Append Instacart affiliate parameter to a search URL."""
|
||||||
|
sep = "&" if "?" in url else "?"
|
||||||
|
return f"{url}{sep}aff={affiliate_id}"
|
||||||
|
|
||||||
|
|
||||||
def _build_amazon_url(url: str, affiliate_id: str) -> str:
|
def _build_amazon_url(url: str, affiliate_id: str) -> str:
|
||||||
"""Merge an Amazon Associates tag into a product URL's query string."""
|
"""Merge an Amazon Associates tag into a product URL's query string."""
|
||||||
parsed = urlparse(url)
|
parsed = urlparse(url)
|
||||||
|
|
@ -101,3 +107,10 @@ register_program(AffiliateProgram(
|
||||||
env_var="AMAZON_ASSOCIATES_TAG",
|
env_var="AMAZON_ASSOCIATES_TAG",
|
||||||
build_url=_build_amazon_url,
|
build_url=_build_amazon_url,
|
||||||
))
|
))
|
||||||
|
|
||||||
|
# Register the Instacart affiliate program. Lookups key on `retailer_key`;
# INSTACART_AFFILIATE_ID is the env-var fallback affiliate ID (presumably
# used when no BYOK user ID is supplied — verify against wrap_url).
register_program(AffiliateProgram(
    name="Instacart",
    retailer_key="instacart",
    env_var="INSTACART_AFFILIATE_ID",
    build_url=_build_instacart_url,
))
|
||||||
|
|
|
||||||
|
|
@ -1 +0,0 @@
|
||||||
from circuitforge_core.resources.client import CFOrchClient, Allocation # noqa: F401
|
|
||||||
|
|
@ -1,105 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from fastapi import FastAPI, HTTPException
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor
|
|
||||||
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
|
|
||||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class EvictRequest(BaseModel):
    """Body for POST /evict: which process to terminate and how patiently."""
    # PID of the process to evict from the GPU.
    pid: int
    # Seconds to wait for a clean exit after SIGTERM (forwarded to
    # EvictionExecutor.evict_pid, which escalates afterwards).
    grace_period_s: float = 5.0
|
|
||||||
|
|
||||||
|
|
||||||
class ServiceStartRequest(BaseModel):
    """Body for POST /services/{service}/start."""
    # GPU index the service should be bound to.
    gpu_id: int = 0
    # Placeholder values forwarded to the service's command/args template.
    # (Mutable default is safe here: pydantic copies field defaults per instance.)
    params: dict[str, str] = {}
|
|
||||||
|
|
||||||
|
|
||||||
def create_agent_app(
    node_id: str,
    monitor: GpuMonitor | None = None,
    executor: EvictionExecutor | None = None,
    service_manager: ServiceManager | None = None,
) -> FastAPI:
    """Build the FastAPI app served by a cf-orch node agent.

    Always registers /health, /gpu-info, /evict and /resident-info.
    Service-management routes (/services…) are registered only when a
    ServiceManager is supplied.

    Args:
        node_id: Identifier echoed in /health and /gpu-info payloads.
        monitor: GPU poller; defaults to a fresh GpuMonitor.
        executor: Eviction backend; defaults to a fresh EvictionExecutor.
        service_manager: Optional manager enabling the /services routes.

    Returns:
        A configured FastAPI application.
    """
    _monitor = monitor or GpuMonitor()
    _executor = executor or EvictionExecutor()

    app = FastAPI(title=f"cf-orch-agent [{node_id}]")

    @app.get("/health")
    def health() -> dict[str, Any]:
        return {"status": "ok", "node_id": node_id}

    @app.get("/gpu-info")
    def gpu_info() -> dict[str, Any]:
        # Poll live on every request — no caching, numbers are current.
        gpus = _monitor.poll()
        return {
            "node_id": node_id,
            "gpus": [
                {
                    "gpu_id": g.gpu_id,
                    "name": g.name,
                    "vram_total_mb": g.vram_total_mb,
                    "vram_used_mb": g.vram_used_mb,
                    "vram_free_mb": g.vram_free_mb,
                }
                for g in gpus
            ],
        }

    @app.post("/evict")
    def evict(req: EvictRequest) -> dict[str, Any]:
        result = _executor.evict_pid(pid=req.pid, grace_period_s=req.grace_period_s)
        return {
            "success": result.success,
            "method": result.method,
            "message": result.message,
        }

    @app.get("/resident-info")
    def resident_info() -> dict[str, Any]:
        """Return which models are currently loaded in each running managed service."""
        if service_manager is None:
            return {"residents": []}
        # Imported lazily: service_probe is only needed when a manager exists.
        from circuitforge_core.resources.agent.service_probe import probe_all
        return {"residents": probe_all(service_manager)}

    if service_manager is not None:
        @app.get("/services")
        def list_services() -> dict:
            return {"running": service_manager.list_running()}

        @app.get("/services/{service}")
        def service_status(service: str) -> dict:
            running = service_manager.is_running(service)
            url = service_manager.get_url(service) if running else None
            return {"service": service, "running": running, "url": url}

        @app.post("/services/{service}/start")
        def start_service(service: str, req: ServiceStartRequest) -> dict:
            try:
                already_running = service_manager.is_running(service)
                url = service_manager.start(service, req.gpu_id, req.params)
                # adopted=True signals the coordinator to treat this instance as
                # immediately running rather than waiting for the probe loop.
                adopted = already_running and service_manager.is_running(service)
                return {"service": service, "url": url, "running": True, "adopted": adopted}
            except (ValueError, NotImplementedError) as exc:
                # Chain the cause (PEP 3134) so server logs retain the original
                # traceback instead of a bare HTTPException.
                raise HTTPException(status_code=422, detail=str(exc)) from exc
            except Exception as exc:
                raise HTTPException(status_code=500, detail=f"Failed to start {service}: {exc}") from exc

        @app.post("/services/{service}/stop")
        def stop_service(service: str) -> dict:
            stopped = service_manager.stop(service)
            return {"service": service, "stopped": stopped}

    return app
|
|
||||||
|
|
@ -1,85 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import signal
|
|
||||||
import time
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
import psutil
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_DEFAULT_GRACE_S = 5.0
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
class EvictionResult:
    """Outcome of a single eviction attempt (immutable value object)."""
    # True when the target process is gone (including "was already gone").
    success: bool
    method: str  # "sigterm", "sigkill", "already_gone", "not_found", "error"
    # Human-readable detail, suitable for logs and API responses.
    message: str
|
|
||||||
|
|
||||||
|
|
||||||
class EvictionExecutor:
    """Terminate a process to free GPU memory: SIGTERM first, then SIGKILL.

    The sequence (existence check → SIGTERM → grace-period poll → SIGKILL)
    is deliberate; reordering would change race behavior around process exit.
    """

    def __init__(self, grace_period_s: float = _DEFAULT_GRACE_S) -> None:
        # Grace window used when evict_pid() is called without an override.
        self._default_grace = grace_period_s

    def evict_pid(
        self,
        pid: int,
        grace_period_s: float | None = None,
    ) -> EvictionResult:
        """Ask *pid* to exit, escalating to SIGKILL after the grace period.

        Never raises for signalling failures; every outcome is encoded in the
        returned EvictionResult (`method` ∈ sigterm/sigkill/already_gone/
        not_found/error).
        """
        grace = grace_period_s if grace_period_s is not None else self._default_grace

        # pid <= 0 targets a process group (or, for -1, every process the
        # caller may signal) — refuse outright.
        if pid <= 0:
            return EvictionResult(
                success=False, method="error",
                message=f"Refusing to signal invalid PID {pid}"
            )

        if not psutil.pid_exists(pid):
            return EvictionResult(
                success=False, method="not_found",
                message=f"PID {pid} not found"
            )

        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # Lost the race: process exited between pid_exists() and kill().
            return EvictionResult(
                success=True, method="already_gone",
                message=f"PID {pid} vanished before SIGTERM"
            )
        except PermissionError as exc:
            return EvictionResult(
                success=False, method="error",
                message=f"Permission denied terminating PID {pid}: {exc}"
            )

        # Wait for grace period
        deadline = time.monotonic() + grace
        while time.monotonic() < deadline:
            if not psutil.pid_exists(pid):
                logger.info("PID %d exited cleanly after SIGTERM", pid)
                return EvictionResult(
                    success=True, method="sigterm",
                    message=f"PID {pid} exited after SIGTERM"
                )
            time.sleep(0.05)

        # Escalate to SIGKILL
        if psutil.pid_exists(pid):
            try:
                os.kill(pid, signal.SIGKILL)
                logger.warning("PID %d required SIGKILL", pid)
                return EvictionResult(
                    success=True, method="sigkill",
                    message=f"PID {pid} killed with SIGKILL"
                )
            except ProcessLookupError:
                # Exited on its own right at the deadline — treat as gone.
                pass

        return EvictionResult(
            success=True, method="sigkill",
            message=f"PID {pid} is gone"
        )
|
|
||||||
|
|
@ -1,52 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
from circuitforge_core.resources.models import GpuInfo
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_NVIDIA_SMI_CMD = [
|
|
||||||
"nvidia-smi",
|
|
||||||
"--query-gpu=index,name,memory.total,memory.used,memory.free",
|
|
||||||
"--format=csv,noheader,nounits",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class GpuMonitor:
    """Poll NVIDIA GPUs via nvidia-smi and parse its CSV output into GpuInfo."""

    def poll(self) -> list[GpuInfo]:
        """Run nvidia-smi once; return [] when the tool is absent, hangs, or fails."""
        try:
            proc = subprocess.run(
                _NVIDIA_SMI_CMD,
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired) as exc:
            logger.warning("nvidia-smi unavailable: %s", exc)
            return []

        if proc.returncode != 0:
            logger.warning("nvidia-smi exited %d", proc.returncode)
            return []
        return self._parse(proc.stdout)

    def _parse(self, output: str) -> list[GpuInfo]:
        """Parse `index,name,total,used,free` rows; malformed rows are skipped."""
        parsed: list[GpuInfo] = []
        for raw in output.strip().splitlines():
            fields = [field.strip() for field in raw.split(",")]
            if len(fields) != 5:
                continue
            idx, gpu_name, total, used, free = fields
            try:
                parsed.append(GpuInfo(
                    gpu_id=int(idx),
                    name=gpu_name,
                    vram_total_mb=int(total),
                    vram_used_mb=int(used),
                    vram_free_mb=int(free),
                ))
            except ValueError:
                logger.debug("Skipping malformed nvidia-smi line: %r", raw)
        return parsed
|
|
||||||
|
|
@ -1,186 +0,0 @@
|
||||||
"""
|
|
||||||
ServiceManager — start/stop Docker containers and processes for cf-orch managed services.
|
|
||||||
|
|
||||||
Container naming convention: cf-orch-{service}-{node_id}
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import os
|
|
||||||
import re
|
|
||||||
import subprocess
|
|
||||||
from collections import defaultdict
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from circuitforge_core.resources.profiles.schema import DockerSpec, GpuProfile, ProcessSpec
|
|
||||||
|
|
||||||
|
|
||||||
def _expand_volume(v: str) -> str:
|
|
||||||
"""Expand bash-style volume strings including ${VAR:-default} and $VAR."""
|
|
||||||
def _sub(m: re.Match) -> str: # type: ignore[type-arg]
|
|
||||||
var, default = m.group(1), m.group(2) or ""
|
|
||||||
return os.environ.get(var) or default
|
|
||||||
v = re.sub(r"\$\{(\w+)(?::-(.*?))?\}", _sub, v)
|
|
||||||
v = re.sub(r"\$(\w+)", lambda m: os.environ.get(m.group(1), m.group(0)), v)
|
|
||||||
return v
|
|
||||||
|
|
||||||
|
|
||||||
class ServiceManager:
    """Start/stop cf-orch managed services as Docker containers or processes.

    Containers follow the naming convention `cf-orch-{service}-{node_id}`.
    Service URLs are always reported against `advertise_host`.
    """

    def __init__(
        self,
        node_id: str,
        profile: GpuProfile,
        advertise_host: str = "127.0.0.1",
    ) -> None:
        self.node_id = node_id
        self.profile = profile
        self.advertise_host = advertise_host
        # Popen handles for processes *we* spawned, keyed by service name.
        self._procs: dict[str, Any] = {}

    def container_name(self, service: str) -> str:
        """Docker container name for *service* on this node."""
        return f"cf-orch-{service}-{self.node_id}"

    def _get_spec(self, service: str) -> DockerSpec | ProcessSpec | None:
        """Managed-launch spec for *service*, or None if unknown/unmanaged."""
        svc = self.profile.services.get(service)
        if svc is None:
            return None
        return svc.managed

    def is_running(self, service: str) -> bool:
        """True if *service* is currently up (container running / port open / healthy)."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                result = subprocess.run(
                    [
                        "docker",
                        "inspect",
                        "--format",
                        "{{.State.Running}}",
                        self.container_name(service),
                    ],
                    capture_output=True,
                    text=True,
                    check=True,
                )
                return result.stdout.strip() == "true"
            except subprocess.CalledProcessError:
                # docker inspect fails when the container does not exist.
                return False
        if isinstance(spec, ProcessSpec):
            # For adopt=True services, check the health endpoint regardless of whether
            # we spawned the process (it may be a system daemon we didn't start).
            if spec.adopt:
                return self._probe_health(spec.host_port, spec.health_path)
            proc = self._procs.get(service)
            if proc is None or proc.poll() is not None:
                return False
            # Process is alive — confirm it is actually listening.
            import socket
            try:
                with socket.create_connection(("127.0.0.1", spec.host_port), timeout=1):
                    return True
            except OSError:
                return False
        return False

    def _probe_health(self, port: int, health_path: str = "/health") -> bool:
        """Return True if the service at localhost:port responds 200 on health_path."""
        import urllib.request
        try:
            url = f"http://127.0.0.1:{port}{health_path}"
            with urllib.request.urlopen(url, timeout=2.0) as resp:
                return resp.status == 200
        except Exception:
            return False

    def start(self, service: str, gpu_id: int, params: dict[str, str]) -> str:
        """Start *service* on *gpu_id* and return its base URL (idempotent).

        Raises:
            ValueError: service unknown or has no managed spec.
            NotImplementedError: unrecognized spec type.
        """
        spec = self._get_spec(service)
        if spec is None:
            raise ValueError(f"Service {service!r} not in profile or has no managed spec")

        if self.is_running(service):
            return f"http://{self.advertise_host}:{spec.host_port}"

        if isinstance(spec, DockerSpec):
            expanded_volumes = [_expand_volume(v) for v in spec.volumes]

            # defaultdict(str): unknown {placeholders} expand to "" instead of
            # raising KeyError from format_map.
            filler: dict[str, str] = defaultdict(str, params)
            expanded_command = spec.command_template.format_map(filler).split()

            cmd = [
                "docker", "run", "-d", "--rm",
                "--name", self.container_name(service),
                "--runtime", spec.runtime,
                "--gpus", f"device={gpu_id}",
                "--ipc", spec.ipc,
                "-p", f"{spec.host_port}:{spec.port}",
            ]
            for vol in expanded_volumes:
                cmd += ["-v", vol]
            for key, val in spec.env.items():
                cmd += ["-e", f"{key}={val}"]
            cmd.append(spec.image)
            cmd.extend(expanded_command)

            subprocess.run(cmd, check=True, capture_output=True, text=True)
            return f"http://{self.advertise_host}:{spec.host_port}"

        if isinstance(spec, ProcessSpec):
            # adopt=True: if the service is already healthy, claim it without spawning.
            if spec.adopt and self._probe_health(spec.host_port, spec.health_path):
                return f"http://{self.advertise_host}:{spec.host_port}"

            filler = defaultdict(str, params)
            filler.setdefault("port", str(spec.port))
            filler.setdefault("gpu_id", str(gpu_id))
            args_expanded = spec.args_template.format_map(filler).split()

            cmd = [spec.exec_path] + args_expanded
            # Inherit the agent's environment. (Was `{**__import__("os").environ}`
            # via a shadow `import subprocess as _sp`; os and subprocess are
            # already imported at module scope.)
            env = os.environ.copy()
            proc = subprocess.Popen(
                cmd,
                cwd=spec.cwd or None,
                env=env,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            self._procs[service] = proc
            return f"http://{self.advertise_host}:{spec.host_port}"

        raise NotImplementedError(f"Unknown spec type: {type(spec)}")

    def stop(self, service: str) -> bool:
        """Stop *service*; True only if something was actually stopped."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                subprocess.run(
                    ["docker", "stop", self.container_name(service)],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                return True
            except subprocess.CalledProcessError:
                return False
        if isinstance(spec, ProcessSpec):
            proc = self._procs.pop(service, None)
            if proc is not None:
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except Exception:
                    # Did not exit within 10 s — force-kill.
                    proc.kill()
                return True
        return False

    def list_running(self) -> list[str]:
        """Names of profile services currently running on this node."""
        return [svc for svc in self.profile.services if self.is_running(svc)]

    def get_url(self, service: str) -> str | None:
        """Base URL for *service* if it is running, else None."""
        spec = self._get_spec(service)
        if spec is None or not self.is_running(service):
            return None
        return f"http://{self.advertise_host}:{spec.host_port}"
|
|
||||||
|
|
@ -1,123 +0,0 @@
|
||||||
"""
|
|
||||||
Probe running services to detect which models are currently loaded in VRAM.
|
|
||||||
|
|
||||||
Two probe strategies run together:
|
|
||||||
|
|
||||||
1. Well-known ports — always checked, regardless of who started the service.
|
|
||||||
Catches ollama, vLLM, etc. running outside cf-orch management.
|
|
||||||
|
|
||||||
2. Managed services — services cf-orch started via ServiceManager.
|
|
||||||
Checked on their configured host_port, deduplicates with well-known results.
|
|
||||||
|
|
||||||
Each service exposes a different introspection API:
|
|
||||||
- vllm: GET /v1/models → {"data": [{"id": "<model-name>"}]}
|
|
||||||
- ollama: GET /api/ps → {"models": [{"name": "<model>", "size_vram": <bytes>}]}
|
|
||||||
|
|
||||||
ollama can have multiple models loaded simultaneously; each is reported as a
|
|
||||||
separate entry so the dashboard shows per-model residency.
|
|
||||||
|
|
||||||
The probe is best-effort: a timeout or connection refusal means model_name=None
|
|
||||||
but the service is still reported as resident.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import urllib.request
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from circuitforge_core.resources.profiles.schema import DockerSpec
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_PROBE_TIMEOUT_S = 2.0
|
|
||||||
|
|
||||||
# Well-known service ports probed on every heartbeat.
|
|
||||||
# key → (service_name, prober_key)
|
|
||||||
_WELL_KNOWN_PORTS: dict[int, str] = {
|
|
||||||
11434: "ollama",
|
|
||||||
8000: "vllm",
|
|
||||||
8080: "vllm", # common alt vLLM port
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _fetch_json(url: str) -> dict[str, Any] | None:
    """GET a URL and parse JSON; returns None on any error."""
    try:
        with urllib.request.urlopen(url, timeout=_PROBE_TIMEOUT_S) as resp:
            payload = resp.read()
        return json.loads(payload)
    except Exception as exc:
        # Best-effort probe: any failure (network, HTTP, bad JSON) → None.
        logger.debug("Probe %s: %s", url, exc)
        return None
|
|
||||||
|
|
||||||
|
|
||||||
def _probe_vllm(port: int) -> list[str]:
    """IDs of models served by a vLLM instance on *port* ([] if unreachable)."""
    payload = _fetch_json(f"http://127.0.0.1:{port}/v1/models")
    if not payload or not payload.get("data"):
        return []
    return [entry["id"] for entry in payload["data"] if entry.get("id")]
|
|
||||||
|
|
||||||
|
|
||||||
def _probe_ollama(port: int) -> list[str]:
    """Names of models an ollama instance currently has loaded ([] if unreachable)."""
    # /api/ps lists models currently *loaded in memory*, not just downloaded.
    payload = _fetch_json(f"http://127.0.0.1:{port}/api/ps")
    if not payload or not payload.get("models"):
        return []
    return [entry["name"] for entry in payload["models"] if entry.get("name")]
|
|
||||||
|
|
||||||
|
|
||||||
_PROBERS: dict[str, Any] = {
|
|
||||||
"vllm": _probe_vllm,
|
|
||||||
"ollama": _probe_ollama,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def probe_all(service_manager: Any) -> list[dict[str, Any]]:
    """
    Probe all services — both well-known ports and cf-orch managed services.

    Returns a list of dicts: [{"service": str, "model_name": str | None}].
    Multiple loaded models in one service (e.g. two ollama models) each get
    their own entry, disambiguated as "ollama/0", "ollama/1", etc.
    """
    results: list[dict[str, Any]] = []
    # Ports already reported — prevents double-counting a managed service
    # that also listens on a well-known port.
    seen_ports: set[int] = set()

    # ── 1. Well-known ports ──────────────────────────────────────────
    for port, service in _WELL_KNOWN_PORTS.items():
        prober = _PROBERS.get(service)
        if prober is None:
            continue
        models = prober(port)
        if not models:
            continue  # nothing on this port right now
        seen_ports.add(port)
        if len(models) == 1:
            results.append({"service": service, "model_name": models[0]})
        else:
            # Several loaded models → one entry each, suffixed /0, /1, ...
            for i, model in enumerate(models):
                results.append({"service": f"{service}/{i}", "model_name": model})

    # ── 2. Managed services (cf-orch started) ───────────────────────
    if service_manager is not None:
        for service in service_manager.list_running():
            # NOTE(review): reaches into ServiceManager._get_spec (private) —
            # consider exposing a public accessor.
            spec = service_manager._get_spec(service)
            if not isinstance(spec, DockerSpec):
                continue
            if spec.host_port in seen_ports:
                continue  # already captured by well-known probe
            prober = _PROBERS.get(service)
            if prober is None:
                # Running, but no introspection API known — resident, model unknown.
                results.append({"service": service, "model_name": None})
                continue
            models = prober(spec.host_port)
            seen_ports.add(spec.host_port)
            if not models:
                results.append({"service": service, "model_name": None})
            elif len(models) == 1:
                results.append({"service": service, "model_name": models[0]})
            else:
                for i, model in enumerate(models):
                    results.append({"service": f"{service}/{i}", "model_name": model})

    return results
|
|
||||||
|
|
@ -1,234 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Annotated, Optional
|
|
||||||
|
|
||||||
import typer
|
|
||||||
import uvicorn
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
app = typer.Typer(name="cf-orch", help="CircuitForge GPU resource orchestrator")
|
|
||||||
|
|
||||||
_SYSTEMD_UNIT_PATH = Path("/etc/systemd/system/cf-orch.service")
|
|
||||||
|
|
||||||
_SYSTEMD_UNIT_TEMPLATE = """\
|
|
||||||
[Unit]
|
|
||||||
Description=CircuitForge GPU Resource Orchestrator
|
|
||||||
After=network.target
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
Type=simple
|
|
||||||
ExecStart={python} -m circuitforge_core.resources.cli start
|
|
||||||
Restart=on-failure
|
|
||||||
RestartSec=5
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=multi-user.target
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
def start(
    profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
    host: str = "0.0.0.0",
    port: int = 7700,
    node_id: str = "local",
    agent_port: int = 7701,
) -> None:
    """Start the cf-orch coordinator (auto-detects GPU profile if not specified).

    Automatically pre-registers the local agent so its GPUs appear on the
    dashboard immediately. Remote nodes self-register via POST /api/nodes.
    """
    # Deferred imports keep `cf-orch --help` fast and defer heavy deps until
    # the coordinator actually starts.
    from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
    from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
    from circuitforge_core.resources.coordinator.app import create_coordinator_app
    from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
    from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor

    from circuitforge_core.resources.coordinator.node_store import NodeStore

    lease_manager = LeaseManager()
    profile_registry = ProfileRegistry()
    service_registry = ServiceRegistry()
    node_store = NodeStore()
    supervisor = AgentSupervisor(
        lease_manager=lease_manager,
        service_registry=service_registry,
        profile_registry=profile_registry,
        node_store=node_store,
    )
    # Re-attach nodes known from a previous run so the dashboard is not empty
    # after a coordinator restart.
    restored = supervisor.restore_from_store()
    if restored:
        typer.echo(f"Restored {restored} known node(s) from previous session")

    monitor = GpuMonitor()
    gpus = monitor.poll()
    if not gpus:
        typer.echo(
            "Warning: no GPUs detected via nvidia-smi — coordinator running with 0 VRAM"
        )
    else:
        typer.echo(f"Detected {len(gpus)} GPU(s)")

    if profile:
        active_profile = profile_registry.load(profile)
        typer.echo(f"Using profile: {active_profile.name} (from {profile})")
    else:
        # No GPUs → fall back to the last public profile (presumably the
        # smallest/CPU profile — TODO confirm list_public() ordering).
        active_profile = (
            profile_registry.auto_detect(gpus)
            if gpus
            else profile_registry.list_public()[-1]
        )
        typer.echo(f"Auto-selected profile: {active_profile.name}")

    # Pre-register the local agent — the heartbeat loop will poll it for live GPU data.
    local_agent_url = f"http://127.0.0.1:{agent_port}"
    supervisor.register(node_id, local_agent_url)
    typer.echo(f"Registered local node '{node_id}' → {local_agent_url}")

    coordinator_app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=supervisor,
        service_registry=service_registry,
    )

    typer.echo(f"Starting cf-orch coordinator on {host}:{port}")
    uvicorn.run(coordinator_app, host=host, port=port)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
def agent(
    coordinator: str = "http://localhost:7700",
    node_id: str = "local",
    host: str = "0.0.0.0",
    port: int = 7701,
    advertise_host: Optional[str] = None,
    profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
) -> None:
    """Start a cf-orch node agent and self-register with the coordinator.

    The agent starts its HTTP server, then POSTs its URL to the coordinator
    so it appears on the dashboard without manual configuration.

    Use --advertise-host to override the IP the coordinator should use to
    reach this agent (e.g. on a multi-homed or NATted host).
    """
    import threading
    import httpx
    from circuitforge_core.resources.agent.app import create_agent_app
    from circuitforge_core.resources.agent.service_manager import ServiceManager
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry

    # The URL the coordinator should use to reach this agent.
    # When bound to a wildcard address, advertise loopback unless overridden.
    reach_host = advertise_host or ("127.0.0.1" if host in ("0.0.0.0", "::") else host)
    agent_url = f"http://{reach_host}:{port}"

    _RECONNECT_INTERVAL_S = 30.0

    def _reconnect_loop() -> None:
        """
        Persistently re-register this agent with the coordinator.

        Runs as a daemon thread for the lifetime of the agent process:
        - Waits 2 s on first run (uvicorn needs time to bind)
        - Re-registers every 30 s thereafter
        - If the coordinator is down, silently retries — no crashing
        - When the coordinator restarts, the agent re-appears within one cycle

        This means coordinator restarts require no manual intervention on agent hosts.
        """
        import time
        first = True
        while True:
            time.sleep(2.0 if first else _RECONNECT_INTERVAL_S)
            first = False
            try:
                resp = httpx.post(
                    f"{coordinator}/api/nodes",
                    json={"node_id": node_id, "agent_url": agent_url},
                    timeout=5.0,
                )
                if resp.is_success:
                    logger.debug("Registered with coordinator at %s as '%s'", coordinator, node_id)
                else:
                    logger.warning(
                        "Coordinator registration returned %s", resp.status_code
                    )
            except Exception as exc:
                logger.debug("Coordinator at %s unreachable, will retry: %s", coordinator, exc)

    # Fire reconnect loop in a daemon thread so uvicorn.run() can start blocking immediately.
    threading.Thread(target=_reconnect_loop, daemon=True, name="cf-orch-reconnect").start()
    typer.echo(f"Reconnect loop started — will register with {coordinator} every {int(_RECONNECT_INTERVAL_S)}s")

    # Best-effort: if profile loading or GPU detection fails, the agent still
    # serves /gpu-info and /evict — only service management is disabled.
    service_manager = None
    try:
        from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
        pr = ProfileRegistry()
        gpus = GpuMonitor().poll()
        p = pr.load(Path(profile)) if profile else pr.auto_detect(gpus)
        service_manager = ServiceManager(node_id=node_id, profile=p, advertise_host=reach_host)
        typer.echo(f"ServiceManager ready with profile: {p.name}")
    except Exception as exc:
        typer.echo(f"Warning: ServiceManager unavailable ({exc})", err=True)

    agent_app = create_agent_app(node_id=node_id, service_manager=service_manager)
    typer.echo(f"Starting cf-orch agent [{node_id}] on {host}:{port}")
    uvicorn.run(agent_app, host=host, port=port)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
def status(coordinator: str = "http://localhost:7700") -> None:
    """Show GPU and lease status from the coordinator."""
    import httpx

    try:
        response = httpx.get(f"{coordinator}/api/nodes", timeout=5.0)
        response.raise_for_status()
        for node in response.json().get("nodes", []):
            typer.echo(f"\nNode: {node['node_id']}")
            for gpu in node.get("gpus", []):
                typer.echo(
                    f" GPU {gpu['gpu_id']}: {gpu['name']} — "
                    f"{gpu['vram_used_mb']}/{gpu['vram_total_mb']} MB used"
                )
    except Exception as exc:
        # Any failure (network, HTTP status, bad JSON) is reported the same way.
        typer.echo(f"Coordinator unreachable at {coordinator}: {exc}", err=True)
        raise typer.Exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command("install-service")
|
|
||||||
def install_service(
|
|
||||||
dry_run: bool = typer.Option(
|
|
||||||
False, "--dry-run", help="Print unit file without writing"
|
|
||||||
),
|
|
||||||
) -> None:
|
|
||||||
"""Write a systemd unit file for cf-orch (requires root)."""
|
|
||||||
python = sys.executable
|
|
||||||
unit_content = _SYSTEMD_UNIT_TEMPLATE.format(python=python)
|
|
||||||
if dry_run:
|
|
||||||
typer.echo(f"Would write to {_SYSTEMD_UNIT_PATH}:\n")
|
|
||||||
typer.echo(unit_content)
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
_SYSTEMD_UNIT_PATH.write_text(unit_content)
|
|
||||||
typer.echo(f"Written: {_SYSTEMD_UNIT_PATH}")
|
|
||||||
typer.echo(
|
|
||||||
"Run: sudo systemctl daemon-reload && sudo systemctl enable --now cf-orch"
|
|
||||||
)
|
|
||||||
except PermissionError:
|
|
||||||
typer.echo(
|
|
||||||
f"Permission denied writing to {_SYSTEMD_UNIT_PATH}. Run as root.", err=True
|
|
||||||
)
|
|
||||||
raise typer.Exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
# Allow running this module directly (python path/to/module.py) in addition
# to the installed console-script entry point.
if __name__ == "__main__":
    app()
|
|
||||||
|
|
@ -1,143 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
from contextlib import contextmanager, asynccontextmanager
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Allocation:
    """One granted service allocation returned by the cf-orch coordinator."""

    # Coordinator-issued id; used in the DELETE path to release the allocation.
    allocation_id: str
    # Service name that was requested (e.g. "vllm").
    service: str
    # Node the coordinator placed the service on.
    node_id: str
    # GPU index on that node.
    gpu_id: int
    # Model actually selected from the candidate list, if the coordinator reports one.
    model: str | None
    # Inference endpoint URL for the running service.
    url: str
    # Reported by the coordinator; defaults to False when omitted (see
    # _parse_allocation). Presumably True when a cold start occurred — confirm.
    started: bool
    # Reported by the coordinator; presumably True when the service was already
    # resident (no cold start) — confirm against coordinator docs.
    warm: bool
|
|
||||||
|
|
||||||
|
|
||||||
class CFOrchClient:
    """
    Client for cf-orch coordinator allocation.

    Sync usage (in LLMRouter or other sync code):
        client = CFOrchClient(os.environ["CF_ORCH_URL"])
        with client.allocate("vllm", model_candidates=["Ouro-1.4B"]) as alloc:
            # alloc.url is the inference endpoint

    Async usage (in FastAPI apps):
        async with client.allocate_async("vllm", model_candidates=["Ouro-1.4B"]) as alloc:
            ...

    Authentication:
        Pass api_key explicitly, or set CF_LICENSE_KEY env var. When set, every
        request carries Authorization: Bearer <key>. Required for the hosted
        CircuitForge coordinator (orch.circuitforge.tech); optional for local
        self-hosted coordinators.

    Raises ValueError immediately if coordinator_url is empty.
    """

    def __init__(self, coordinator_url: str, api_key: str | None = None) -> None:
        # Fail fast on misconfiguration rather than at the first allocation attempt.
        if not coordinator_url:
            raise ValueError("coordinator_url is empty — cf-orch not configured")
        # Normalize so path joins below never produce a double slash.
        self._url = coordinator_url.rstrip("/")
        # Explicit key wins; otherwise fall back to the CF_LICENSE_KEY env var.
        self._api_key = api_key or os.environ.get("CF_LICENSE_KEY", "")

    def _headers(self) -> dict[str, str]:
        # Bearer auth header when a key is configured; empty dict otherwise.
        if self._api_key:
            return {"Authorization": f"Bearer {self._api_key}"}
        return {}

    def _build_body(self, model_candidates: list[str] | None, ttl_s: float, caller: str) -> dict:
        # JSON payload shared by the sync and async allocate paths.
        return {
            "model_candidates": model_candidates or [],
            "ttl_s": ttl_s,
            "caller": caller,
        }

    def _parse_allocation(self, data: dict, service: str) -> Allocation:
        # Convert the coordinator's JSON response into a typed Allocation.
        # "started"/"warm" default to False for coordinators that omit them.
        return Allocation(
            allocation_id=data["allocation_id"],
            service=service,
            node_id=data["node_id"],
            gpu_id=data["gpu_id"],
            model=data.get("model"),
            url=data["url"],
            started=data.get("started", False),
            warm=data.get("warm", False),
        )

    @contextmanager
    def allocate(
        self,
        service: str,
        *,
        model_candidates: list[str] | None = None,
        ttl_s: float = 3600.0,
        caller: str = "",
    ):
        """Sync context manager. Allocates on enter, releases on exit.

        Raises RuntimeError when the coordinator rejects the allocation.
        Release failures on exit are logged at debug level and swallowed so
        they never mask an exception from the caller's body.
        """
        # Generous timeout: the coordinator may need to cold-start the service.
        resp = httpx.post(
            f"{self._url}/api/services/{service}/allocate",
            json=self._build_body(model_candidates, ttl_s, caller),
            headers=self._headers(),
            timeout=120.0,
        )
        if not resp.is_success:
            raise RuntimeError(
                f"cf-orch allocation failed for {service!r}: "
                f"HTTP {resp.status_code} — {resp.text[:200]}"
            )
        alloc = self._parse_allocation(resp.json(), service)
        try:
            yield alloc
        finally:
            # Best-effort release; the coordinator's TTL sweep reclaims the
            # allocation eventually even if this DELETE fails.
            try:
                httpx.delete(
                    f"{self._url}/api/services/{service}/allocations/{alloc.allocation_id}",
                    headers=self._headers(),
                    timeout=10.0,
                )
            except Exception as exc:
                logger.debug("cf-orch release failed (non-fatal): %s", exc)

    @asynccontextmanager
    async def allocate_async(
        self,
        service: str,
        *,
        model_candidates: list[str] | None = None,
        ttl_s: float = 3600.0,
        caller: str = "",
    ):
        """Async context manager. Allocates on enter, releases on exit.

        Mirrors allocate(): RuntimeError on allocation failure, best-effort
        (logged, swallowed) release on exit.
        """
        # The AsyncClient stays open for the allocation's lifetime so the
        # release DELETE in the finally block can reuse it.
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{self._url}/api/services/{service}/allocate",
                json=self._build_body(model_candidates, ttl_s, caller),
                headers=self._headers(),
            )
            if not resp.is_success:
                raise RuntimeError(
                    f"cf-orch allocation failed for {service!r}: "
                    f"HTTP {resp.status_code} — {resp.text[:200]}"
                )
            alloc = self._parse_allocation(resp.json(), service)
            try:
                yield alloc
            finally:
                try:
                    # Per-request timeout overrides the client-level 120 s here.
                    await client.delete(
                        f"{self._url}/api/services/{service}/allocations/{alloc.allocation_id}",
                        headers=self._headers(),
                        timeout=10.0,
                    )
                except Exception as exc:
                    logger.debug("cf-orch async release failed (non-fatal): %s", exc)
|
|
||||||
|
|
@ -1,44 +0,0 @@
|
||||||
# circuitforge_core/resources/compose.yml
# One-command cf-orch deployment for Docker self-hosters:
#   docker compose -f path/to/compose.yml up cf-orch-coordinator

services:
  cf-orch-coordinator:
    image: python:3.12-slim
    # Installs the orch extra at container start, then runs the coordinator API.
    command: >
      sh -c "pip install 'circuitforge-core[orch]' &&
      cf-orch start --host 0.0.0.0 --port 7700"
    ports:
      - "7700:7700"
    volumes:
      # NOTE(review): host path is /run/docker.sock and the mount is read-only —
      # confirm the coordinator only inspects Docker and never needs write access.
      - /run/docker.sock:/var/run/docker.sock:ro
      # Persistent coordinator state (named volume declared below).
      - cf-orch-data:/data
    environment:
      # Optional profile override; empty string means auto-detect.
      - CFORCH_PROFILE=${CFORCH_PROFILE:-}
    restart: unless-stopped
    # GPU passthrough for the first NVIDIA device via the legacy nvidia runtime.
    devices:
      - /dev/nvidia0:/dev/nvidia0
      - /dev/nvidiactl:/dev/nvidiactl
    runtime: nvidia

  cf-orch-agent:
    image: python:3.12-slim
    # Registers against the coordinator over the compose network by service name.
    command: >
      sh -c "pip install 'circuitforge-core[orch]' &&
      cf-orch agent --coordinator http://cf-orch-coordinator:7700
      --node-id ${CFORCH_NODE_ID:-local}
      --host 0.0.0.0 --port 7701"
    ports:
      - "7701:7701"
    depends_on:
      - cf-orch-coordinator
    environment:
      - CFORCH_NODE_ID=${CFORCH_NODE_ID:-local}
    restart: unless-stopped
    devices:
      - /dev/nvidia0:/dev/nvidia0
      - /dev/nvidiactl:/dev/nvidiactl
    runtime: nvidia

volumes:
  cf-orch-data:
|
|
||||||
|
|
@ -1,209 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
from circuitforge_core.resources.coordinator.node_store import NodeStore
|
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
|
||||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo, ResidentAllocation
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
_HEARTBEAT_INTERVAL_S = 10.0
|
|
||||||
_AGENT_TIMEOUT_S = 5.0
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class AgentRecord:
    """In-memory bookkeeping for one registered agent node."""

    node_id: str
    agent_url: str
    # Epoch seconds of the last successful poll (initialized to creation time).
    last_seen: float = field(default_factory=time.time)
    # Latest GPU snapshot reported by the agent's /gpu-info endpoint.
    gpus: list[GpuInfo] = field(default_factory=list)
    # False until the first successful poll; cleared again whenever a poll fails.
    online: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
class AgentSupervisor:
    """
    Registry and health-poller for agent nodes.

    Responsibilities:
    - Track every agent that registers (node_id → AgentRecord).
    - Periodically poll each agent's /gpu-info (and, best-effort, /resident-info)
      and mirror the results into the LeaseManager.
    - Every third heartbeat tick, run the TTL/idle sweep that expires stale
      allocations and stops services idle past their profile timeout.
    """

    def __init__(
        self,
        lease_manager: LeaseManager,
        service_registry: ServiceRegistry | None = None,
        profile_registry: ProfileRegistry | None = None,
        node_store: NodeStore | None = None,
    ) -> None:
        # node_id → record for every node known this session (online or not).
        self._agents: dict[str, AgentRecord] = {}
        self._lease_manager = lease_manager
        self._running = False
        # Optional collaborators — the idle sweep is a no-op when these are None.
        self._service_registry = service_registry
        self._profile_registry = profile_registry
        # Optional persistence; lets known nodes survive coordinator restarts.
        self._node_store = node_store
        # Heartbeat iteration counter; the idle sweep runs on every 3rd tick.
        self._heartbeat_tick = 0

    def restore_from_store(self) -> int:
        """
        Load previously-known nodes from NodeStore into the in-memory registry.

        All restored nodes start with online=False. The heartbeat loop will poll
        them on its first tick and promote any that respond to online=True.

        Returns the number of nodes restored.
        """
        if self._node_store is None:
            return 0
        restored = 0
        for node_id, agent_url in self._node_store.all():
            # Live registrations win over persisted entries.
            if node_id not in self._agents:
                self._agents[node_id] = AgentRecord(
                    node_id=node_id, agent_url=agent_url, online=False
                )
                restored += 1
        if restored:
            logger.info("NodeStore: restored %d known node(s) from previous session", restored)
        return restored

    def register(self, node_id: str, agent_url: str) -> None:
        """Add a new agent, or update an existing one's URL; persist either way."""
        if node_id not in self._agents:
            self._agents[node_id] = AgentRecord(node_id=node_id, agent_url=agent_url)
            logger.info("Registered agent node: %s @ %s", node_id, agent_url)
        else:
            # Re-registration with a changed URL (e.g. new host/port) is allowed.
            if self._agents[node_id].agent_url != agent_url:
                self._agents[node_id].agent_url = agent_url
                logger.info("Updated agent URL for %s → %s", node_id, agent_url)
        if self._node_store is not None:
            self._node_store.upsert(node_id, agent_url)

    def get_node_info(self, node_id: str) -> NodeInfo | None:
        """Return a NodeInfo snapshot for one node, or None if unknown."""
        record = self._agents.get(node_id)
        if record is None:
            return None
        return NodeInfo(
            node_id=record.node_id,
            agent_url=record.agent_url,
            gpus=record.gpus,
            last_heartbeat=record.last_seen,
        )

    def all_nodes(self) -> list[NodeInfo]:
        """Return NodeInfo snapshots for every known node, online or not."""
        return [
            NodeInfo(
                node_id=r.node_id,
                agent_url=r.agent_url,
                gpus=r.gpus,
                last_heartbeat=r.last_seen,
            )
            for r in self._agents.values()
        ]

    def online_agents(self) -> "dict[str, AgentRecord]":
        """Return only currently-online agents, keyed by node_id."""
        return {nid: rec for nid, rec in self._agents.items() if rec.online}

    async def poll_agent(self, node_id: str) -> bool:
        """
        Poll one agent for GPU and resident-service state.

        On success: refresh the record's GPU list / last_seen / online flag and
        mirror GPU capacity and residents into the LeaseManager; return True.
        On any failure: mark the node offline and return False.
        """
        record = self._agents.get(node_id)
        if record is None:
            return False
        try:
            async with httpx.AsyncClient(timeout=_AGENT_TIMEOUT_S) as client:
                gpu_resp = await client.get(f"{record.agent_url}/gpu-info")
                gpu_resp.raise_for_status()

                # Resident-info is best-effort — older agents may not have the endpoint.
                try:
                    res_resp = await client.get(f"{record.agent_url}/resident-info")
                    resident_data = res_resp.json() if res_resp.is_success else {}
                except Exception:
                    resident_data = {}

                data = gpu_resp.json()
                gpus = [
                    GpuInfo(
                        gpu_id=g["gpu_id"],
                        name=g["name"],
                        vram_total_mb=g["vram_total_mb"],
                        vram_used_mb=g["vram_used_mb"],
                        vram_free_mb=g["vram_free_mb"],
                    )
                    for g in data.get("gpus", [])
                ]
                record.gpus = gpus
                record.last_seen = time.time()
                record.online = True
                # Keep the lease manager's view of per-GPU capacity current.
                for gpu in gpus:
                    self._lease_manager.register_gpu(node_id, gpu.gpu_id, gpu.vram_total_mb)

                residents = [
                    (r["service"], r.get("model_name"))
                    for r in resident_data.get("residents", [])
                ]
                self._lease_manager.set_residents_for_node(node_id, residents)

                return True
        except Exception as exc:
            logger.warning("Agent %s unreachable: %s", node_id, exc)
            record.online = False
            return False

    async def poll_all(self) -> None:
        """Poll every known agent concurrently."""
        await asyncio.gather(*[self.poll_agent(nid) for nid in self._agents])

    def _build_idle_stop_config(self) -> dict[str, int]:
        """
        Collect per-service idle-stop timeouts from all public profiles.

        When several profiles declare a positive timeout for the same service,
        the smallest value wins. Services without a positive timeout are
        omitted (never idle-stopped).
        """
        if self._profile_registry is None:
            return {}
        config: dict[str, int] = {}
        for profile in self._profile_registry.list_public():
            for svc_name, svc in profile.services.items():
                if svc.idle_stop_after_s > 0:
                    existing = config.get(svc_name, 0)
                    config[svc_name] = min(existing, svc.idle_stop_after_s) if existing > 0 else svc.idle_stop_after_s
        return config

    async def _http_post(self, url: str) -> bool:
        """POST with no body; True on 2xx, False (and a warning log) otherwise."""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url)
                return resp.is_success
        except Exception as exc:
            logger.warning("HTTP POST %s failed: %s", url, exc)
            return False

    async def _run_idle_sweep(self) -> None:
        """
        Expire TTL'd allocations, then stop service instances that idled past
        their profile-configured timeout. No-op without a ServiceRegistry.
        """
        if self._service_registry is None:
            return
        expired = self._service_registry.sweep_expired_allocations()
        if expired:
            logger.info("TTL sweep: expired %d allocation(s): %s", len(expired), expired)
        idle_stop_config = self._build_idle_stop_config()
        if not idle_stop_config:
            return
        timed_out = self._service_registry.idle_past_timeout(idle_stop_config)
        for instance in timed_out:
            node_info = self.get_node_info(instance.node_id)
            if node_info is None:
                # Node no longer known — nothing we can stop.
                continue
            stop_url = f"{node_info.agent_url}/services/{instance.service}/stop"
            logger.info(
                "Idle sweep: stopping %s on %s gpu%s (idle timeout)",
                instance.service, instance.node_id, instance.gpu_id,
            )
            success = await self._http_post(stop_url)
            if success:
                # Only record the stop once the agent has acknowledged it.
                self._service_registry.mark_stopped(
                    instance.service, instance.node_id, instance.gpu_id
                )

    async def run_heartbeat_loop(self) -> None:
        """Poll all agents every _HEARTBEAT_INTERVAL_S; idle-sweep every 3rd tick."""
        self._running = True
        while self._running:
            await self.poll_all()
            self._heartbeat_tick += 1
            if self._heartbeat_tick % 3 == 0:
                await self._run_idle_sweep()
            await asyncio.sleep(_HEARTBEAT_INTERVAL_S)

    def stop(self) -> None:
        """Signal run_heartbeat_loop to exit after its current iteration."""
        self._running = False
|
|
||||||
|
|
@ -1,509 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
import urllib.request
|
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
from fastapi import FastAPI, HTTPException
|
|
||||||
from fastapi.responses import HTMLResponse
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
|
||||||
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
from circuitforge_core.resources.coordinator.node_selector import select_node
|
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
|
||||||
from circuitforge_core.resources.profiles.schema import ProcessSpec
|
|
||||||
|
|
||||||
_DASHBOARD_HTML = (Path(__file__).parent / "dashboard.html").read_text()
|
|
||||||
|
|
||||||
|
|
||||||
def _get_health_path(profile_registry: ProfileRegistry, service: str) -> str:
    """Return the health_path for a service from the first matching profile spec."""
    specs = (
        profile.services.get(service)
        for profile in profile_registry.list_public()
    )
    for spec in specs:
        if spec and isinstance(spec.managed, ProcessSpec):
            return spec.managed.health_path
    # No profile declares a managed process for this service — use the default.
    return "/health"
|
|
||||||
|
|
||||||
_PROBE_INTERVAL_S = 5.0 # how often to poll starting instances
|
|
||||||
_PROBE_TIMEOUT_S = 300.0 # give up and mark stopped after this many seconds
|
|
||||||
|
|
||||||
|
|
||||||
async def _run_instance_probe_loop(service_registry: ServiceRegistry) -> None:
    """
    Background loop: transition 'starting' instances to 'running' once their
    /health endpoint responds, or to 'stopped' after PROBE_TIMEOUT_S.
    """
    import asyncio

    # instance key ("service:node:gpu") → time first seen in 'starting' state
    start_times: dict[str, float] = {}

    while True:
        await asyncio.sleep(_PROBE_INTERVAL_S)
        now = time.time()
        for inst in service_registry.all_instances():
            if inst.state != "starting":
                # Instance left 'starting' via some other path — drop its timer.
                start_times.pop(f"{inst.service}:{inst.node_id}:{inst.gpu_id}", None)
                continue
            key = f"{inst.service}:{inst.node_id}:{inst.gpu_id}"
            # Record when we first observed this instance in the starting state.
            start_times.setdefault(key, now)

            healthy = False
            if inst.url:
                try:
                    # NOTE(review): urllib is blocking and briefly stalls the
                    # event loop; probes are tiny (2 s cap) — confirm intended.
                    with urllib.request.urlopen(
                        inst.url.rstrip("/") + inst.health_path, timeout=2.0
                    ) as resp:
                        healthy = resp.status == 200
                except Exception:
                    # Not healthy yet (refused, timeout, DNS) — keep waiting.
                    pass

            if healthy:
                service_registry.upsert_instance(
                    service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
                    state="running", model=inst.model, url=inst.url,
                )
                start_times.pop(key, None)
                logger.info("Instance %s/%s gpu=%s transitioned to running", inst.service, inst.node_id, inst.gpu_id)
            elif now - start_times[key] > _PROBE_TIMEOUT_S:
                # Never became healthy within the grace period — give up.
                service_registry.upsert_instance(
                    service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
                    state="stopped", model=inst.model, url=inst.url,
                )
                start_times.pop(key, None)
                logger.warning("Instance %s/%s gpu=%s timed out in starting state — marked stopped", inst.service, inst.node_id, inst.gpu_id)
|
|
||||||
|
|
||||||
|
|
||||||
class LeaseRequest(BaseModel):
    """Request body for POST /api/leases — a direct VRAM lease on a specific GPU."""

    node_id: str
    gpu_id: int
    # VRAM requested, in megabytes.
    mb: int
    # Name of the service requesting the lease.
    service: str
    # Eviction priority — ordering semantics defined by EvictionEngine; confirm there.
    priority: int = 2
    # Lease lifetime in seconds; 0 presumably means no expiry — confirm in LeaseManager.
    ttl_s: float = 0.0
|
|
||||||
|
|
||||||
|
|
||||||
class NodeRegisterRequest(BaseModel):
    """Request body for POST /api/nodes — agent self-registration."""

    node_id: str
    agent_url: str  # e.g. "http://10.1.10.71:7701"
|
|
||||||
|
|
||||||
|
|
||||||
class ServiceEnsureRequest(BaseModel):
    """Request body for POST /api/services/{service}/ensure — explicit node placement."""

    node_id: str
    gpu_id: int = 0
    # Extra launch parameters forwarded to the agent's start endpoint.
    params: dict[str, str] = {}
    ttl_s: float = 3600.0
    # Ordered list of model names to try; falls back down the list if VRAM is tight.
    # The "model" key in params is used if this list is empty.
    model_candidates: list[str] = []
|
|
||||||
|
|
||||||
|
|
||||||
class ServiceAllocateRequest(BaseModel):
    """Request body for POST /api/services/{service}/allocate — coordinator picks the node."""

    # Ordered model preferences; must be non-empty (the handler rejects [] with 422).
    model_candidates: list[str] = []
    # Pin to a specific GPU id; None lets the coordinator choose a placement.
    gpu_id: int | None = None
    # Extra launch parameters forwarded to the agent's start endpoint.
    params: dict[str, str] = {}
    # Allocation lifetime in seconds before the TTL sweep reclaims it.
    ttl_s: float = 3600.0
    # Free-form identifier of the requesting client, for bookkeeping.
    caller: str = ""
|
|
||||||
|
|
||||||
|
|
||||||
def create_coordinator_app(
|
|
||||||
lease_manager: LeaseManager,
|
|
||||||
profile_registry: ProfileRegistry,
|
|
||||||
agent_supervisor: AgentSupervisor,
|
|
||||||
service_registry: ServiceRegistry,
|
|
||||||
) -> FastAPI:
|
|
||||||
eviction_engine = EvictionEngine(lease_manager=lease_manager)
|
|
||||||
|
|
||||||
@asynccontextmanager
|
|
||||||
async def _lifespan(app: FastAPI): # type: ignore[type-arg]
|
|
||||||
import asyncio
|
|
||||||
heartbeat_task = asyncio.create_task(agent_supervisor.run_heartbeat_loop())
|
|
||||||
probe_task = asyncio.create_task(_run_instance_probe_loop(service_registry))
|
|
||||||
yield
|
|
||||||
agent_supervisor.stop()
|
|
||||||
heartbeat_task.cancel()
|
|
||||||
probe_task.cancel()
|
|
||||||
|
|
||||||
app = FastAPI(title="cf-orch-coordinator", lifespan=_lifespan)
|
|
||||||
|
|
||||||
# Optional Heimdall auth — enabled when HEIMDALL_URL env var is set.
|
|
||||||
# Self-hosted coordinators skip this entirely; the CF-hosted public endpoint
|
|
||||||
# (orch.circuitforge.tech) sets HEIMDALL_URL to gate paid+ access.
|
|
||||||
from circuitforge_core.resources.coordinator.auth import HeimdallAuthMiddleware
|
|
||||||
_auth = HeimdallAuthMiddleware.from_env()
|
|
||||||
if _auth is not None:
|
|
||||||
app.middleware("http")(_auth)
|
|
||||||
|
|
||||||
@app.get("/", response_class=HTMLResponse, include_in_schema=False)
|
|
||||||
def dashboard() -> HTMLResponse:
|
|
||||||
return HTMLResponse(content=_DASHBOARD_HTML)
|
|
||||||
|
|
||||||
@app.get("/api/health")
|
|
||||||
def health() -> dict[str, Any]:
|
|
||||||
return {"status": "ok"}
|
|
||||||
|
|
||||||
@app.get("/api/nodes")
|
|
||||||
def get_nodes() -> dict[str, Any]:
|
|
||||||
nodes = agent_supervisor.all_nodes()
|
|
||||||
return {
|
|
||||||
"nodes": [
|
|
||||||
{
|
|
||||||
"node_id": n.node_id,
|
|
||||||
"agent_url": n.agent_url,
|
|
||||||
"last_heartbeat": n.last_heartbeat,
|
|
||||||
"gpus": [
|
|
||||||
{
|
|
||||||
"gpu_id": g.gpu_id,
|
|
||||||
"name": g.name,
|
|
||||||
"vram_total_mb": g.vram_total_mb,
|
|
||||||
"vram_used_mb": g.vram_used_mb,
|
|
||||||
"vram_free_mb": g.vram_free_mb,
|
|
||||||
}
|
|
||||||
for g in n.gpus
|
|
||||||
],
|
|
||||||
}
|
|
||||||
for n in nodes
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.post("/api/nodes")
|
|
||||||
async def register_node(req: NodeRegisterRequest) -> dict[str, Any]:
|
|
||||||
"""Agents call this to self-register. Coordinator immediately polls for GPU info."""
|
|
||||||
agent_supervisor.register(req.node_id, req.agent_url)
|
|
||||||
await agent_supervisor.poll_agent(req.node_id)
|
|
||||||
return {"registered": True, "node_id": req.node_id}
|
|
||||||
|
|
||||||
@app.get("/api/profiles")
|
|
||||||
def get_profiles() -> dict[str, Any]:
|
|
||||||
return {
|
|
||||||
"profiles": [
|
|
||||||
{"name": p.name, "vram_total_mb": p.vram_total_mb}
|
|
||||||
for p in profile_registry.list_public()
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.get("/api/resident")
|
|
||||||
def get_residents() -> dict[str, Any]:
|
|
||||||
return {
|
|
||||||
"residents": [
|
|
||||||
{
|
|
||||||
"service": r.service,
|
|
||||||
"node_id": r.node_id,
|
|
||||||
"model_name": r.model_name,
|
|
||||||
"first_seen": r.first_seen,
|
|
||||||
}
|
|
||||||
for r in lease_manager.all_residents()
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.get("/api/leases")
|
|
||||||
def get_leases() -> dict[str, Any]:
|
|
||||||
return {
|
|
||||||
"leases": [
|
|
||||||
{
|
|
||||||
"lease_id": lease.lease_id,
|
|
||||||
"node_id": lease.node_id,
|
|
||||||
"gpu_id": lease.gpu_id,
|
|
||||||
"mb_granted": lease.mb_granted,
|
|
||||||
"holder_service": lease.holder_service,
|
|
||||||
"priority": lease.priority,
|
|
||||||
"expires_at": lease.expires_at,
|
|
||||||
}
|
|
||||||
for lease in lease_manager.all_leases()
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.post("/api/leases")
|
|
||||||
async def request_lease(req: LeaseRequest) -> dict[str, Any]:
|
|
||||||
node_info = agent_supervisor.get_node_info(req.node_id)
|
|
||||||
if node_info is None:
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=422,
|
|
||||||
detail=f"Unknown node_id {req.node_id!r} — node not registered",
|
|
||||||
)
|
|
||||||
agent_url = node_info.agent_url
|
|
||||||
|
|
||||||
lease = await eviction_engine.request_lease(
|
|
||||||
node_id=req.node_id,
|
|
||||||
gpu_id=req.gpu_id,
|
|
||||||
mb=req.mb,
|
|
||||||
service=req.service,
|
|
||||||
priority=req.priority,
|
|
||||||
agent_url=agent_url,
|
|
||||||
ttl_s=req.ttl_s,
|
|
||||||
)
|
|
||||||
if lease is None:
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=503,
|
|
||||||
detail="Insufficient VRAM — no eviction candidates available",
|
|
||||||
)
|
|
||||||
return {
|
|
||||||
"lease": {
|
|
||||||
"lease_id": lease.lease_id,
|
|
||||||
"node_id": lease.node_id,
|
|
||||||
"gpu_id": lease.gpu_id,
|
|
||||||
"mb_granted": lease.mb_granted,
|
|
||||||
"holder_service": lease.holder_service,
|
|
||||||
"priority": lease.priority,
|
|
||||||
"expires_at": lease.expires_at,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.delete("/api/leases/{lease_id}")
|
|
||||||
async def release_lease(lease_id: str) -> dict[str, Any]:
|
|
||||||
released = await lease_manager.release(lease_id)
|
|
||||||
if not released:
|
|
||||||
raise HTTPException(status_code=404, detail=f"Lease {lease_id!r} not found")
|
|
||||||
return {"released": True, "lease_id": lease_id}
|
|
||||||
|
|
||||||
@app.post("/api/services/{service}/ensure")
|
|
||||||
async def ensure_service(service: str, req: ServiceEnsureRequest) -> dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Ensure a managed service is running on the given node.
|
|
||||||
|
|
||||||
If model_candidates is provided, tries each model in order, skipping any
|
|
||||||
that exceed the live free VRAM on the target GPU. Falls back down the list
|
|
||||||
until one succeeds. The selected model is returned in the response.
|
|
||||||
"""
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
node_info = agent_supervisor.get_node_info(req.node_id)
|
|
||||||
if node_info is None:
|
|
||||||
raise HTTPException(422, detail=f"Unknown node_id {req.node_id!r}")
|
|
||||||
|
|
||||||
# Resolve candidate list — fall back to params["model"] if not specified.
|
|
||||||
candidates: list[str] = req.model_candidates or (
|
|
||||||
[req.params["model"]] if "model" in req.params else []
|
|
||||||
)
|
|
||||||
if not candidates:
|
|
||||||
raise HTTPException(422, detail="No model specified: set params.model or model_candidates")
|
|
||||||
|
|
||||||
# Live free VRAM on the target GPU (used for pre-flight filtering).
|
|
||||||
gpu = next((g for g in node_info.gpus if g.gpu_id == req.gpu_id), None)
|
|
||||||
free_mb = gpu.vram_free_mb if gpu else 0
|
|
||||||
|
|
||||||
# Profile max_mb for the service gives us the VRAM ceiling for this slot.
|
|
||||||
# Models larger than free_mb are skipped before we even try to start them.
|
|
||||||
# We use model file size as a rough proxy — skip if free_mb < half of max_mb,
|
|
||||||
# since a fully-loaded model typically needs ~50-80% of its param size in VRAM.
|
|
||||||
service_max_mb = 0
|
|
||||||
for p in profile_registry.list_public():
|
|
||||||
svc = p.services.get(service)
|
|
||||||
if svc:
|
|
||||||
service_max_mb = svc.max_mb
|
|
||||||
break
|
|
||||||
|
|
||||||
# Filter candidates by VRAM headroom — require free VRAM >= service ceiling
|
|
||||||
# so the model can actually load without competing for VRAM with other processes.
|
|
||||||
if service_max_mb > 0 and free_mb < service_max_mb:
|
|
||||||
raise HTTPException(
|
|
||||||
503,
|
|
||||||
detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need {service_max_mb}MB",
|
|
||||||
)
|
|
||||||
|
|
||||||
last_error: str = ""
|
|
||||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
|
||||||
for model in candidates:
|
|
||||||
params_with_model = {**req.params, "model": model}
|
|
||||||
try:
|
|
||||||
start_resp = await client.post(
|
|
||||||
f"{node_info.agent_url}/services/{service}/start",
|
|
||||||
json={"gpu_id": req.gpu_id, "params": params_with_model},
|
|
||||||
)
|
|
||||||
if start_resp.is_success:
|
|
||||||
data = start_resp.json()
|
|
||||||
return {
|
|
||||||
"service": service,
|
|
||||||
"node_id": req.node_id,
|
|
||||||
"gpu_id": req.gpu_id,
|
|
||||||
"model": model,
|
|
||||||
"url": data.get("url"),
|
|
||||||
"running": data.get("running", False),
|
|
||||||
}
|
|
||||||
last_error = start_resp.text
|
|
||||||
except httpx.HTTPError as exc:
|
|
||||||
raise HTTPException(502, detail=f"Agent unreachable: {exc}")
|
|
||||||
|
|
||||||
raise HTTPException(
|
|
||||||
503,
|
|
||||||
detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.post("/api/services/{service}/allocate")
|
|
||||||
async def allocate_service(service: str, req: ServiceAllocateRequest) -> dict[str, Any]:
|
|
||||||
"""
|
|
||||||
Allocate a managed service — coordinator picks the best node automatically.
|
|
||||||
Returns a URL + allocation_id. (Allocation not tracked server-side until Phase 2.)
|
|
||||||
"""
|
|
||||||
import httpx
|
|
||||||
|
|
||||||
if not req.model_candidates:
|
|
||||||
raise HTTPException(422, detail="model_candidates must be non-empty")
|
|
||||||
|
|
||||||
# Validate service is known in at least one profile, regardless of gpu_id
|
|
||||||
if not any(service in p.services for p in profile_registry.list_public()):
|
|
||||||
raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile")
|
|
||||||
|
|
||||||
residents = lease_manager.resident_keys()
|
|
||||||
|
|
||||||
if req.gpu_id is None:
|
|
||||||
online = agent_supervisor.online_agents()
|
|
||||||
placement = select_node(online, service, profile_registry, residents)
|
|
||||||
if placement is None:
|
|
||||||
raise HTTPException(
|
|
||||||
503,
|
|
||||||
detail=f"No online node has capacity for service {service!r}",
|
|
||||||
)
|
|
||||||
node_id, gpu_id = placement
|
|
||||||
else:
|
|
||||||
online = agent_supervisor.online_agents()
|
|
||||||
node_id = next(
|
|
||||||
(nid for nid, rec in online.items()
|
|
||||||
if any(g.gpu_id == req.gpu_id for g in rec.gpus)),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
if node_id is None:
|
|
||||||
raise HTTPException(422, detail=f"No online node has gpu_id={req.gpu_id}")
|
|
||||||
gpu_id = req.gpu_id
|
|
||||||
|
|
||||||
node_info = agent_supervisor.get_node_info(node_id)
|
|
||||||
if node_info is None:
|
|
||||||
raise HTTPException(422, detail=f"Node {node_id!r} not found")
|
|
||||||
|
|
||||||
warm = f"{node_id}:{service}" in residents
|
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
|
||||||
last_error = ""
|
|
||||||
for model in req.model_candidates:
|
|
||||||
try:
|
|
||||||
resp = await client.post(
|
|
||||||
f"{node_info.agent_url}/services/{service}/start",
|
|
||||||
json={"gpu_id": gpu_id, "params": {**req.params, "model": model}},
|
|
||||||
)
|
|
||||||
if resp.is_success:
|
|
||||||
data = resp.json()
|
|
||||||
svc_url = data.get("url", "")
|
|
||||||
alloc = service_registry.allocate(
|
|
||||||
service=service,
|
|
||||||
node_id=node_id,
|
|
||||||
gpu_id=gpu_id,
|
|
||||||
model=model,
|
|
||||||
caller=req.caller,
|
|
||||||
url=svc_url,
|
|
||||||
ttl_s=req.ttl_s,
|
|
||||||
)
|
|
||||||
# Seed the instance state for first-time starts.
|
|
||||||
# adopted=True means the agent found it already running.
|
|
||||||
adopted = data.get("adopted", False)
|
|
||||||
instance_state = "running" if (warm or adopted) else "starting"
|
|
||||||
health_path = _get_health_path(profile_registry, service)
|
|
||||||
service_registry.upsert_instance(
|
|
||||||
service=service,
|
|
||||||
node_id=node_id,
|
|
||||||
gpu_id=gpu_id,
|
|
||||||
state=instance_state,
|
|
||||||
model=model,
|
|
||||||
url=svc_url,
|
|
||||||
health_path=health_path,
|
|
||||||
)
|
|
||||||
return {
|
|
||||||
"allocation_id": alloc.allocation_id,
|
|
||||||
"service": service,
|
|
||||||
"node_id": node_id,
|
|
||||||
"gpu_id": gpu_id,
|
|
||||||
"model": model,
|
|
||||||
"url": data.get("url"),
|
|
||||||
"started": not warm,
|
|
||||||
"warm": warm,
|
|
||||||
}
|
|
||||||
last_error = resp.text
|
|
||||||
except httpx.HTTPError as exc:
|
|
||||||
raise HTTPException(502, detail=f"Agent unreachable: {exc}")
|
|
||||||
|
|
||||||
raise HTTPException(
|
|
||||||
503,
|
|
||||||
detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.delete("/api/services/{service}/allocations/{allocation_id}")
|
|
||||||
async def release_allocation(service: str, allocation_id: str) -> dict[str, Any]:
|
|
||||||
existing = service_registry.get_allocation(allocation_id)
|
|
||||||
if existing is None or existing.service != service:
|
|
||||||
raise HTTPException(404, detail=f"Allocation {allocation_id!r} not found for service {service!r}")
|
|
||||||
released = service_registry.release(allocation_id)
|
|
||||||
if not released:
|
|
||||||
raise HTTPException(404, detail=f"Allocation {allocation_id!r} not found")
|
|
||||||
return {"released": True, "allocation_id": allocation_id}
|
|
||||||
|
|
||||||
@app.get("/api/services/{service}/status")
|
|
||||||
def get_service_status(service: str) -> dict[str, Any]:
|
|
||||||
instances = [i for i in service_registry.all_instances() if i.service == service]
|
|
||||||
allocations = [a for a in service_registry.all_allocations() if a.service == service]
|
|
||||||
return {
|
|
||||||
"service": service,
|
|
||||||
"instances": [
|
|
||||||
{
|
|
||||||
"node_id": i.node_id,
|
|
||||||
"gpu_id": i.gpu_id,
|
|
||||||
"state": i.state,
|
|
||||||
"model": i.model,
|
|
||||||
"url": i.url,
|
|
||||||
"idle_since": i.idle_since,
|
|
||||||
}
|
|
||||||
for i in instances
|
|
||||||
],
|
|
||||||
"allocations": [
|
|
||||||
{
|
|
||||||
"allocation_id": a.allocation_id,
|
|
||||||
"node_id": a.node_id,
|
|
||||||
"gpu_id": a.gpu_id,
|
|
||||||
"model": a.model,
|
|
||||||
"caller": a.caller,
|
|
||||||
"url": a.url,
|
|
||||||
"expires_at": a.expires_at,
|
|
||||||
}
|
|
||||||
for a in allocations
|
|
||||||
],
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.get("/api/services")
|
|
||||||
def list_services() -> dict[str, Any]:
|
|
||||||
instances = service_registry.all_instances()
|
|
||||||
return {
|
|
||||||
"services": [
|
|
||||||
{
|
|
||||||
"service": i.service,
|
|
||||||
"node_id": i.node_id,
|
|
||||||
"gpu_id": i.gpu_id,
|
|
||||||
"state": i.state,
|
|
||||||
"model": i.model,
|
|
||||||
"url": i.url,
|
|
||||||
}
|
|
||||||
for i in instances
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.delete("/api/services/{service}")
|
|
||||||
async def stop_service(service: str, node_id: str) -> dict[str, Any]:
|
|
||||||
"""Stop a managed service on the given node."""
|
|
||||||
node_info = agent_supervisor.get_node_info(node_id)
|
|
||||||
if node_info is None:
|
|
||||||
raise HTTPException(422, detail=f"Unknown node_id {node_id!r}")
|
|
||||||
|
|
||||||
import httpx
|
|
||||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
||||||
try:
|
|
||||||
resp = await client.post(f"{node_info.agent_url}/services/{service}/stop")
|
|
||||||
resp.raise_for_status()
|
|
||||||
return {"service": service, "node_id": node_id, "stopped": resp.json().get("stopped", False)}
|
|
||||||
except httpx.HTTPError as exc:
|
|
||||||
raise HTTPException(502, detail=f"Agent unreachable: {exc}")
|
|
||||||
|
|
||||||
return app
|
|
||||||
|
|
@ -1,197 +0,0 @@
|
||||||
"""
|
|
||||||
cf-orch coordinator auth middleware.
|
|
||||||
|
|
||||||
When HEIMDALL_URL is set, all /api/* requests (except /api/health) must carry:
|
|
||||||
Authorization: Bearer <CF license key>
|
|
||||||
|
|
||||||
The key is validated against Heimdall and the result cached for
|
|
||||||
CACHE_TTL_S seconds (default 300 / 5 min). This keeps Heimdall out of the
|
|
||||||
per-allocation hot path while keeping revocation latency bounded.
|
|
||||||
|
|
||||||
When HEIMDALL_URL is not set, auth is disabled — self-hosted deployments work
|
|
||||||
with no configuration change.
|
|
||||||
|
|
||||||
Environment variables
|
|
||||||
---------------------
|
|
||||||
HEIMDALL_URL Heimdall base URL, e.g. https://license.circuitforge.tech
|
|
||||||
When absent, auth is skipped entirely.
|
|
||||||
HEIMDALL_MIN_TIER Minimum tier required (default: "paid").
|
|
||||||
Accepted values: free, paid, premium, ultra.
|
|
||||||
CF_ORCH_AUTH_SECRET Shared secret sent to Heimdall so it can distinguish
|
|
||||||
coordinator service calls from end-user requests.
|
|
||||||
Must match the COORDINATOR_SECRET env var on Heimdall.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations

import asyncio
import logging
import os
import time
from dataclasses import dataclass, field
from threading import Lock

import httpx
from fastapi import Request
from fastapi.responses import JSONResponse
|
|
||||||
|
|
||||||
# Module-level logger for the auth middleware.
logger = logging.getLogger(__name__)

# Unauthenticated paths — health check must always be accessible for monitoring.
_EXEMPT_PATHS: frozenset[str] = frozenset({"/api/health", "/", "/openapi.json", "/docs", "/redoc"})

# Tier ranking for "minimum tier" comparisons; a higher rank is more privileged.
_TIER_ORDER: dict[str, int] = {"free": 0, "paid": 1, "premium": 2, "ultra": 3}

# Default lifetime of a cached Heimdall validation result.
CACHE_TTL_S: float = 300.0  # 5 minutes — matches Kiwi cloud session TTL
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class _CacheEntry:
    """Cached outcome of a single Heimdall license-key validation."""

    # True when Heimdall reported the key as valid.
    valid: bool
    # Tier name returned by Heimdall ("" when invalid or unknown).
    tier: str
    # User id returned by Heimdall ("" when invalid or unknown).
    user_id: str
    # time.monotonic() deadline after which this entry is stale.
    expires_at: float
|
|
||||||
|
|
||||||
|
|
||||||
class _ValidationCache:
    """Thread-safe TTL cache for Heimdall validation results.

    Entries are keyed by license key. Expired entries are dropped lazily on
    read (and in bulk via ``prune()``), so probing many distinct keys once
    cannot grow the store without bound.
    """

    def __init__(self, ttl_s: float = CACHE_TTL_S) -> None:
        self._ttl = ttl_s
        self._store: dict[str, _CacheEntry] = {}
        self._lock = Lock()

    def get(self, key: str) -> _CacheEntry | None:
        """Return the live entry for *key*, or None if absent or expired.

        Fix: an expired entry is now evicted immediately. Previously it
        lingered until overwritten or until an explicit ``prune()`` — which
        nothing on the request path calls — so distinct (attacker-suppliable)
        keys accumulated indefinitely.
        """
        with self._lock:
            entry = self._store.get(key)
            if entry is None:
                return None
            if time.monotonic() > entry.expires_at:
                # Lazy eviction: drop the stale entry so the store stays bounded.
                del self._store[key]
                return None
            return entry

    def set(self, key: str, valid: bool, tier: str, user_id: str) -> None:
        """Store a validation result for *key*, valid for the cache TTL."""
        with self._lock:
            self._store[key] = _CacheEntry(
                valid=valid,
                tier=tier,
                user_id=user_id,
                expires_at=time.monotonic() + self._ttl,
            )

    def evict(self, key: str) -> None:
        """Remove *key* from the cache if present (no-op otherwise)."""
        with self._lock:
            self._store.pop(key, None)

    def prune(self) -> int:
        """Remove expired entries. Returns count removed."""
        now = time.monotonic()
        with self._lock:
            expired = [k for k, e in self._store.items() if now > e.expires_at]
            for k in expired:
                del self._store[k]
            return len(expired)
|
|
||||||
|
|
||||||
|
|
||||||
class HeimdallAuthMiddleware:
    """
    ASGI middleware that validates CF license keys against Heimdall.

    Attach to a FastAPI app via app.middleware("http"):

        middleware = HeimdallAuthMiddleware.from_env()
        if middleware:
            app.middleware("http")(middleware)

    Fix: ``_validate_against_heimdall`` performs a blocking ``httpx.post``
    (5 s timeout). It was previously invoked directly from the async
    ``__call__``, stalling the event loop — and therefore every in-flight
    request — on each cache miss. The check now runs in a worker thread via
    ``asyncio.to_thread``.
    """

    def __init__(
        self,
        heimdall_url: str,
        min_tier: str = "paid",
        auth_secret: str = "",
        cache_ttl_s: float = CACHE_TTL_S,
    ) -> None:
        """Configure the middleware; see module docstring for the env contract."""
        self._heimdall = heimdall_url.rstrip("/")
        # Unknown tier names fall back to rank 1 ("paid").
        self._min_tier_rank = _TIER_ORDER.get(min_tier, 1)
        self._min_tier = min_tier
        self._auth_secret = auth_secret
        self._cache = _ValidationCache(ttl_s=cache_ttl_s)
        logger.info(
            "[cf-orch auth] Heimdall auth enabled — url=%s min_tier=%s ttl=%ss",
            self._heimdall, min_tier, cache_ttl_s,
        )

    @classmethod
    def from_env(cls) -> "HeimdallAuthMiddleware | None":
        """Return a configured middleware instance, or None if HEIMDALL_URL is not set."""
        url = os.environ.get("HEIMDALL_URL", "")
        if not url:
            logger.info("[cf-orch auth] HEIMDALL_URL not set — auth disabled (self-hosted mode)")
            return None
        return cls(
            heimdall_url=url,
            min_tier=os.environ.get("HEIMDALL_MIN_TIER", "paid"),
            auth_secret=os.environ.get("CF_ORCH_AUTH_SECRET", ""),
        )

    def _validate_against_heimdall(self, license_key: str) -> tuple[bool, str, str]:
        """
        Call Heimdall's /licenses/verify endpoint. Blocking — call off-loop.

        Returns (valid, tier, user_id).
        On any network or parse error, returns (False, "", "") — fail closed.
        """
        try:
            headers: dict[str, str] = {"Content-Type": "application/json"}
            if self._auth_secret:
                # Lets Heimdall distinguish coordinator calls from end-user traffic.
                headers["X-Coordinator-Secret"] = self._auth_secret
            resp = httpx.post(
                f"{self._heimdall}/licenses/verify",
                json={"key": license_key, "min_tier": self._min_tier},
                headers=headers,
                timeout=5.0,
            )
            if resp.status_code == 200:
                data = resp.json()
                return data.get("valid", False), data.get("tier", ""), data.get("user_id", "")
            # 401/403 from Heimdall = key invalid/insufficient tier
            logger.debug("[cf-orch auth] Heimdall returned %s for key ...%s", resp.status_code, license_key[-6:])
            return False, "", ""
        except Exception as exc:
            logger.warning("[cf-orch auth] Heimdall unreachable — failing closed: %s", exc)
            return False, "", ""

    def _deny_reason(self, valid: bool, tier: str) -> str:
        """Return the denial reason for a validation result, or "" when authorized."""
        if not valid:
            return "license key invalid or expired"
        if _TIER_ORDER.get(tier, -1) < self._min_tier_rank:
            return f"feature requires {self._min_tier} tier (have: {tier})"
        return ""

    def _check_key(self, license_key: str) -> tuple[bool, str]:
        """
        Validate key (cache-first). Returns (authorized, reason_if_denied).

        Blocking on cache miss (network round-trip) — call via asyncio.to_thread.
        """
        cached = self._cache.get(license_key)
        if cached is not None:
            reason = self._deny_reason(cached.valid, cached.tier)
            return not reason, reason

        valid, tier, user_id = self._validate_against_heimdall(license_key)
        # Negative results are cached too, bounding Heimdall load under key-guessing.
        self._cache.set(license_key, valid=valid, tier=tier, user_id=user_id)
        reason = self._deny_reason(valid, tier)
        return not reason, reason

    async def __call__(self, request: Request, call_next):  # type: ignore[no-untyped-def]
        # Health/docs endpoints stay reachable without credentials.
        if request.url.path in _EXEMPT_PATHS:
            return await call_next(request)

        auth_header = request.headers.get("Authorization", "")
        if not auth_header.startswith("Bearer "):
            return JSONResponse(
                status_code=401,
                content={"detail": "Authorization: Bearer <license_key> required"},
            )

        license_key = auth_header.removeprefix("Bearer ").strip()
        # Off-load the (potentially blocking) validation so the event loop is
        # never stalled for the Heimdall timeout.
        authorized, reason = await asyncio.to_thread(self._check_key, license_key)
        if not authorized:
            return JSONResponse(status_code=403, content={"detail": reason})

        return await call_next(request)
|
|
||||||
|
|
@ -1,473 +0,0 @@
|
||||||
<!DOCTYPE html>
|
|
||||||
<html lang="en">
|
|
||||||
<head>
|
|
||||||
<meta charset="UTF-8">
|
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
|
||||||
<title>cf-orch · dashboard</title>
|
|
||||||
<style>
|
|
||||||
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
|
||||||
|
|
||||||
:root {
|
|
||||||
--bg: #0d1117;
|
|
||||||
--bg2: #161b22;
|
|
||||||
--bg3: #1c2129;
|
|
||||||
--border: #30363d;
|
|
||||||
--border-dim: #21262d;
|
|
||||||
--text: #e6edf3;
|
|
||||||
--muted: #8b949e;
|
|
||||||
--dim: #4d5763;
|
|
||||||
--indigo: #818cf8;
|
|
||||||
--cyan: #22d3ee;
|
|
||||||
--green: #4ade80;
|
|
||||||
--amber: #fbbf24;
|
|
||||||
--red: #f85149;
|
|
||||||
--orange: #fb923c;
|
|
||||||
--radius: 6px;
|
|
||||||
--radius-sm: 3px;
|
|
||||||
--font: 'JetBrains Mono', 'Fira Code', ui-monospace, monospace;
|
|
||||||
}
|
|
||||||
|
|
||||||
body { background: var(--bg); color: var(--text); font-family: var(--font); font-size: 13px; line-height: 1.5; padding: 1rem; }
|
|
||||||
|
|
||||||
/* header */
|
|
||||||
header { display: flex; align-items: center; gap: 1rem; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid var(--border); }
|
|
||||||
.logo { color: var(--indigo); font-size: 1.1em; font-weight: 700; }
|
|
||||||
#refresh-badge { margin-left: auto; font-size: 0.75em; color: var(--dim); }
|
|
||||||
#refresh-badge span { color: var(--green); }
|
|
||||||
|
|
||||||
/* section labels */
|
|
||||||
.section-label { font-size: 0.72em; font-weight: 600; text-transform: uppercase; letter-spacing: 0.07em; color: var(--dim); margin-bottom: 0.5rem; }
|
|
||||||
|
|
||||||
/* health strip */
|
|
||||||
#health-strip { display: flex; flex-wrap: wrap; gap: 0.4rem; margin-bottom: 1rem; padding: 0.6rem 0.75rem; background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); min-height: 36px; }
|
|
||||||
.pill { display: inline-flex; align-items: center; gap: 0.3rem; padding: 2px 10px; border-radius: 99px; font-size: 0.8em; font-weight: 600; }
|
|
||||||
.pill.ok { background: rgba(74,222,128,.12); color: var(--green); }
|
|
||||||
.pill.err { background: rgba(248,81,73,.12); color: var(--red); }
|
|
||||||
.pill.off { background: rgba(139,148,158,.1); color: var(--dim); }
|
|
||||||
|
|
||||||
/* GPU grid */
|
|
||||||
#gpu-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 0.6rem; margin-bottom: 1rem; }
|
|
||||||
.gpu-card { background: var(--bg3); border: 1px solid var(--border); border-radius: var(--radius); padding: 0.7rem 0.8rem; }
|
|
||||||
.gpu-card.offline { border-color: #7c2d12; opacity: 0.7; }
|
|
||||||
.gpu-node { font-size: 0.75em; font-weight: 700; color: var(--indigo); margin-bottom: 1px; }
|
|
||||||
.gpu-offline .gpu-node { color: var(--orange); }
|
|
||||||
.gpu-name { font-size: 0.78em; color: var(--text); margin-bottom: 0.4rem; }
|
|
||||||
.vram-track { position: relative; background: var(--bg); border-radius: var(--radius-sm); height: 6px; margin-bottom: 0.3rem; overflow: hidden; }
|
|
||||||
.vram-leased { position: absolute; left: 0; top: 0; height: 100%; background: var(--cyan); transition: width 0.4s; }
|
|
||||||
.vram-resident { position: absolute; top: 0; height: 100%; background: var(--amber); transition: left 0.4s, width 0.4s; }
|
|
||||||
.vram-label { font-size: 0.72em; color: var(--muted); margin-bottom: 0.25rem; }
|
|
||||||
.gpu-status { font-size: 0.72em; }
|
|
||||||
.gpu-status.idle { color: var(--green); }
|
|
||||||
.gpu-status.busy { color: var(--amber); }
|
|
||||||
.gpu-status.full { color: var(--red); }
|
|
||||||
.gpu-status.offline { color: var(--orange); }
|
|
||||||
.spark-track { height: 24px; background: var(--bg); border-radius: var(--radius-sm); margin-top: 0.4rem; overflow: hidden; }
|
|
||||||
|
|
||||||
/* shared table base */
|
|
||||||
.cf-table { width: 100%; border-collapse: collapse; background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); overflow: hidden; margin-bottom: 1rem; }
|
|
||||||
.cf-table th { background: var(--bg3); color: var(--dim); font-size: 0.72em; font-weight: 600; text-transform: uppercase; letter-spacing: 0.05em; padding: 0.4rem 0.6rem; text-align: left; border-bottom: 1px solid var(--border); }
|
|
||||||
.cf-table td { padding: 0.35rem 0.6rem; border-bottom: 1px solid var(--border-dim); font-size: 0.8em; vertical-align: middle; }
|
|
||||||
.cf-table tr:last-child td { border-bottom: none; }
|
|
||||||
.td-service { color: var(--indigo); font-weight: 600; }
|
|
||||||
.td-node { color: var(--muted); }
|
|
||||||
.td-mb { color: var(--text); }
|
|
||||||
.td-priority { color: var(--amber); }
|
|
||||||
.td-model { color: var(--cyan); font-size: 0.75em; }
|
|
||||||
.td-warm { color: var(--amber); }
|
|
||||||
.td-none { color: var(--dim); font-style: italic; }
|
|
||||||
.ttl-wrap { display: flex; align-items: center; gap: 0.5rem; }
|
|
||||||
.ttl-label { color: var(--cyan); font-variant-numeric: tabular-nums; white-space: nowrap; }
|
|
||||||
.ttl-track { flex: 1; background: var(--bg); border-radius: var(--radius-sm); height: 4px; }
|
|
||||||
.ttl-fill { height: 100%; border-radius: var(--radius-sm); background: var(--cyan); transition: width 0.4s; }
|
|
||||||
|
|
||||||
/* service state classes */
|
|
||||||
.state-running { color: #2ecc40; }
|
|
||||||
.state-idle { color: #ff851b; }
|
|
||||||
.state-stopped { color: #aaa; }
|
|
||||||
.state-starting { color: #0074d9; }
|
|
||||||
.state-unknown { color: #ff4136; }
|
|
||||||
|
|
||||||
/* error */
|
|
||||||
#error-banner { display: none; background: rgba(248,81,73,.1); border: 1px solid var(--red); border-radius: var(--radius); color: var(--red); padding: 0.5rem 0.75rem; font-size: 0.82em; margin-bottom: 1rem; }
|
|
||||||
|
|
||||||
/* footer */
|
|
||||||
footer { border-top: 1px solid var(--border); padding-top: 0.5rem; color: var(--dim); font-size: 0.72em; display: flex; gap: 1.5rem; }
|
|
||||||
footer a { color: var(--indigo); text-decoration: none; }
|
|
||||||
footer a:hover { text-decoration: underline; }
|
|
||||||
</style>
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
|
|
||||||
<header>
|
|
||||||
<span class="logo">cf-orch</span>
|
|
||||||
<span id="cluster-label" style="color:var(--muted)">coordinator</span>
|
|
||||||
<div id="refresh-badge">auto-refresh <span id="countdown">5</span>s</div>
|
|
||||||
</header>
|
|
||||||
|
|
||||||
<div id="error-banner"></div>
|
|
||||||
|
|
||||||
<div class="section-label">Services</div>
|
|
||||||
<div id="health-strip"></div>
|
|
||||||
|
|
||||||
<div class="section-label">GPU Nodes</div>
|
|
||||||
<div id="gpu-grid"></div>
|
|
||||||
|
|
||||||
<div id="services-section">
|
|
||||||
<div class="section-label">Service Instances</div>
|
|
||||||
<table class="cf-table" id="services-table">
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>Service</th><th>Node</th><th>GPU</th><th>State</th><th>Model</th><th>URL</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody id="services-body"></tbody>
|
|
||||||
</table>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
<div class="section-label">Active Leases</div>
|
|
||||||
<table class="cf-table" id="leases-table">
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>Service</th><th>Node / GPU</th><th>VRAM</th><th>Priority</th><th>TTL / Expires</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody id="leases-body"></tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<div class="section-label">Warm Models</div>
|
|
||||||
<table class="cf-table" id="resident-table">
|
|
||||||
<thead>
|
|
||||||
<tr>
|
|
||||||
<th>Service</th><th>Node</th><th>Model</th><th>Warm Since</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody id="resident-body"></tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
<footer>
|
|
||||||
<span>cf-orch · circuitforge-core</span>
|
|
||||||
<a href="/api/nodes" target="_blank">/api/nodes</a>
|
|
||||||
<a href="/api/leases" target="_blank">/api/leases</a>
|
|
||||||
<a href="/api/resident" target="_blank">/api/resident</a>
|
|
||||||
<a href="/api/services" target="_blank">/api/services</a>
|
|
||||||
<a href="/api/health" target="_blank">/api/health</a>
|
|
||||||
</footer>
|
|
||||||
|
|
||||||
<script>
|
|
||||||
"use strict";
|
|
||||||
|
|
||||||
// ── helpers ──────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
/**
 * Create an element. `opts` may carry:
 *   cls   — space-separated class names (blank tokens ignored)
 *   text  — textContent (applied unless null/undefined)
 *   style — object merged into element.style
 *   attr  — map of attributes to set
 */
function el(tag, opts) {
  const node = document.createElement(tag);
  const o = opts || {};
  if (o.cls) {
    for (const token of o.cls.split(' ')) {
      if (token) node.classList.add(token);
    }
  }
  if (o.text != null) node.textContent = o.text;
  if (o.style) Object.assign(node.style, o.style);
  if (o.attr) {
    for (const [name, value] of Object.entries(o.attr)) {
      node.setAttribute(name, value);
    }
  }
  return node;
}
|
|
||||||
|
|
||||||
/** Append each truthy child to `parent`; falsy entries are skipped. Returns parent. */
function append(parent, ...children) {
  for (const child of children) {
    if (child) parent.appendChild(child);
  }
  return parent;
}
|
|
||||||
|
|
||||||
/** Remove every existing child of `parent`, then append the given children. */
function setChildren(parent, ...children) {
  let leftover;
  while ((leftover = parent.firstChild)) {
    parent.removeChild(leftover);
  }
  append(parent, ...children);
}
|
|
||||||
|
|
||||||
/**
 * Build a VRAM-usage sparkline as an inline SVG element (no innerHTML).
 * `history` is a list of vram_used_mb samples; `totalMb` scales the y-axis.
 * With fewer than two samples a flat placeholder baseline is drawn.
 */
function buildSparkline(history, totalMb) {
  const ns = 'http://www.w3.org/2000/svg';
  const setAttrs = (node, attrs) => {
    for (const [name, value] of Object.entries(attrs)) node.setAttribute(name, value);
  };

  const svg = document.createElementNS(ns, 'svg');
  setAttrs(svg, { width: '100%', height: '16', viewBox: '0 0 100 16' });

  if (!history || history.length < 2) {
    // Nothing to plot yet — draw a dim flat baseline.
    const baseline = document.createElementNS(ns, 'line');
    setAttrs(baseline, {
      x1: '0', y1: '14', x2: '100', y2: '14',
      stroke: '#30363d', 'stroke-width': '1',
    });
    svg.appendChild(baseline);
    return svg;
  }

  // Scale x across the full 100-unit width, y into the 2..14 band.
  const ceiling = Math.max(totalMb, 1);
  const lastIdx = history.length - 1;
  const coords = [];
  for (let i = 0; i <= lastIdx; i++) {
    const x = (i / lastIdx) * 100;
    const y = 14 - ((history[i] / ceiling) * 12);
    coords.push(x.toFixed(1) + ',' + y.toFixed(1));
  }

  const poly = document.createElementNS(ns, 'polyline');
  setAttrs(poly, {
    points: coords.join(' '),
    fill: 'none',
    stroke: '#818cf8',
    'stroke-width': '1.5',
    'stroke-linejoin': 'round',
  });
  svg.appendChild(poly);
  return svg;
}
|
|
||||||
|
|
||||||
/** VRAM fill colour for a utilisation fraction: red ≥ 0.9, amber ≥ 0.7, cyan below. */
function vramColor(pct) {
  return pct >= 0.9 ? '#f85149'
       : pct >= 0.7 ? '#fbbf24'
       : '#22d3ee';
}
|
|
||||||
|
|
||||||
// ── sparkline history ────────────────────────────────────────────
// keyed "nodeId:gpuId" → array of vram_used_mb, max 20 samples
// (samples are pushed and trimmed in renderNodes on every refresh)
const sparkHistory = {};

// ── countdown ────────────────────────────────────────────────────
// Visual-only ticker for the "auto-refresh Ns" header badge; counts
// 5 → 1 and wraps. NOTE(review): it runs independently of the actual
// fetch cycle — presumably the poll interval is also 5 s; confirm
// against the refresh loop.
let countdown = 5;
setInterval(() => {
countdown = countdown <= 1 ? 5 : countdown - 1;
document.getElementById('countdown').textContent = countdown;
}, 1000);
|
|
||||||
|
|
||||||
// ── state class helper ───────────────────────────────────────────
/**
 * Map a service-instance state to its CSS class.
 * Fix: the old `{...}[state]` lookup walked the prototype chain, so states
 * like "constructor" or "toString" returned inherited object members instead
 * of 'state-unknown'. An explicit switch only matches the intended keys.
 */
function stateClass(state) {
  switch (state) {
    case 'running': return 'state-running';
    case 'idle': return 'state-idle';
    case 'stopped': return 'state-stopped';
    case 'starting': return 'state-starting';
    default: return 'state-unknown';
  }
}
|
|
||||||
|
|
||||||
// ── render: services table ───────────────────────────────────────
// Rebuild the "Service Instances" table body from /api/services rows.
// Column order must match the static <thead>: Service, Node, GPU, State,
// Model, URL (6 columns).
function renderServices(services) {
const tbody = document.getElementById('services-body');
if (!services || services.length === 0) {
// Empty state: a single dim row spanning every column.
const tr = document.createElement('tr');
const td = el('td', { cls: 'td-none', text: 'No service instances registered.' });
td.setAttribute('colspan', '6');
tr.appendChild(td);
setChildren(tbody, tr);
return;
}

const rows = services.map(svc => {
const tr = document.createElement('tr');
// One cell descriptor per column; missing model/url render as an em-dash.
const fields = [
{ text: svc.service, cls: 'td-service' },
{ text: svc.node_id, cls: 'td-node' },
{ text: String(svc.gpu_id), cls: 'td-mb' },
{ text: svc.state, cls: stateClass(svc.state) },
{ text: svc.model || '\u2014', cls: 'td-model' },
{ text: svc.url || '\u2014', cls: 'td-node' },
];
fields.forEach(f => tr.appendChild(el('td', { cls: f.cls, text: f.text })));
return tr;
});

setChildren(tbody, ...rows);
}
|
|
||||||
|
|
||||||
// ── render: health strip ─────────────────────────────────────────
/** Show a single coordinator pill: green "●" when healthy, red "✕" otherwise. */
function renderHealth(ok) {
  const strip = document.getElementById('health-strip');
  const pillCls = 'pill ' + (ok ? 'ok' : 'err');
  const pillText = (ok ? '● ' : '✕ ') + 'coordinator';
  setChildren(strip, el('span', { cls: pillCls, text: pillText }));
}
|
|
||||||
|
|
||||||
// ── render: GPU grid ─────────────────────────────────────────────
// leasedByGpu: "nodeId:gpuId" → total MB currently leased (from active leases)
// Rebuilds one card per (node, gpu) pair: header, stacked VRAM bar,
// usage label, status line, and a 20-sample usage sparkline. Also feeds
// the global sparkHistory buffer as a side effect.
function renderNodes(nodes, leasedByGpu) {
const grid = document.getElementById('gpu-grid');
if (!nodes || nodes.length === 0) {
setChildren(grid, el('div', { text: 'No nodes registered.', style: { color: 'var(--dim)', fontSize: '0.8em', padding: '0.5rem' } }));
return;
}

const cards = [];
for (const node of nodes) {
for (const gpu of node.gpus) {
const key = node.node_id + ':' + gpu.gpu_id;
// Guard against a zero/missing total so the divisions below are safe.
const total = gpu.vram_total_mb || 1;
const used = gpu.vram_used_mb;
const leased = leasedByGpu[key] || 0;
// Resident = nvidia-smi used minus actively leased; clamped to [0, used].
const resident = Math.max(0, Math.min(used - leased, used));
const pct = used / total;

// Record this sample for the sparkline (rolling window of 20).
if (!sparkHistory[key]) sparkHistory[key] = [];
sparkHistory[key].push(used);
if (sparkHistory[key].length > 20) sparkHistory[key].shift();

// Status buckets: ≥90% "saturated", ≥10% "N% used", else "idle".
const statusCls = pct >= 0.9 ? 'full' : pct >= 0.1 ? 'busy' : 'idle';
const statusText = pct >= 0.9 ? 'saturated' : pct >= 0.1 ? Math.round(pct * 100) + '% used' : 'idle';

const card = el('div', { cls: 'gpu-card' });
const nodeLabel = el('div', { cls: 'gpu-node', text: node.node_id.toUpperCase() + ' · GPU ' + gpu.gpu_id });
const nameLine = el('div', { cls: 'gpu-name', text: gpu.name || 'Unknown GPU' });

// Stacked bar: cyan (leased) → amber (resident) → dark bg (free).
const leasedPct = (leased / total * 100).toFixed(1);
const residentPct = (resident / total * 100).toFixed(1);
const track = el('div', { cls: 'vram-track' });
const fillLeased = el('div', { cls: 'vram-leased', style: { width: leasedPct + '%' } });
// The resident segment starts where the leased segment ends.
const fillResident = el('div', { cls: 'vram-resident', style: { left: leasedPct + '%', width: residentPct + '%' } });
append(track, fillLeased, fillResident);

// Breakdown label when something is allocated.
let labelText = (used / 1024).toFixed(1) + ' / ' + (total / 1024).toFixed(1) + ' GB';
if (leased > 0 || resident > 0) {
const parts = [];
if (leased > 0) parts.push((leased / 1024).toFixed(1) + 'G leased');
if (resident > 0) parts.push((resident / 1024).toFixed(1) + 'G resident');
labelText += ' (' + parts.join(' · ') + ')';
}

const vramLbl = el('div', { cls: 'vram-label', text: labelText });
const statusEl = el('div', { cls: 'gpu-status ' + statusCls, text: statusText });
const sparkTrack = el('div', { cls: 'spark-track' });
sparkTrack.appendChild(buildSparkline(sparkHistory[key], total));

append(card, nodeLabel, nameLine, track, vramLbl, statusEl, sparkTrack);
cards.push(card);
}
}

setChildren(grid, ...cards);
}
|
|
||||||
|
|
||||||
// ── render: warm models table ────────────────────────────────────
// Rebuild the "Warm Models" table body. Each row shows how long the model
// has been warm, formatted as "Ns", "Nm SSs", or "Nh MMm" depending on age.
// Rows with no first_seen timestamp fall back to "now" (age 0).
function renderResidents(residents) {
const tbody = document.getElementById('resident-body');
if (!residents || residents.length === 0) {
// Empty state: one dim row spanning all 4 columns.
const tr = document.createElement('tr');
const td = el('td', { cls: 'td-none', text: 'No warm models detected.' });
td.setAttribute('colspan', '4');
tr.appendChild(td);
setChildren(tbody, tr);
return;
}

// Server timestamps are epoch seconds; Date.now() is milliseconds.
const now = Date.now() / 1000;
const rows = residents.map(r => {
const warmSecs = now - (r.first_seen || now);
const warmText = warmSecs < 60
? Math.floor(warmSecs) + 's'
: warmSecs < 3600
? Math.floor(warmSecs / 60) + 'm ' + String(Math.floor(warmSecs % 60)).padStart(2, '0') + 's'
: Math.floor(warmSecs / 3600) + 'h ' + String(Math.floor((warmSecs % 3600) / 60)).padStart(2, '0') + 'm';

const tr = document.createElement('tr');
append(tr,
el('td', { cls: 'td-service', text: r.service }),
el('td', { cls: 'td-node', text: r.node_id }),
el('td', { cls: 'td-model', text: r.model_name || '—' }),
el('td', { cls: 'td-warm', text: warmText }),
);
return tr;
});

setChildren(tbody, ...rows);
}
|
|
||||||
|
|
||||||
// ── render: leases table ─────────────────────────────────────────
|
|
||||||
// Render the active-leases table. Each row shows holder, location, size,
// priority, and a TTL countdown bar (∞ for leases without an expiry).
function renderLeases(leases) {
  const tbody = document.getElementById('leases-body');

  if (!leases || leases.length === 0) {
    const emptyRow = document.createElement('tr');
    const emptyCell = el('td', { cls: 'td-none', text: 'No active leases.' });
    emptyCell.setAttribute('colspan', '5');
    emptyRow.appendChild(emptyCell);
    setChildren(tbody, emptyRow);
    return;
  }

  const nowSecs = Date.now() / 1000;

  const rows = leases.map(function (lease) {
    // Switch to GB display once the grant crosses 1 GiB.
    const sizeText = lease.mb_granted >= 1024
      ? (lease.mb_granted / 1024).toFixed(1) + ' GB'
      : lease.mb_granted + ' MB';

    const row = document.createElement('tr');

    const serviceCell = el('td', { cls: 'td-service', text: lease.holder_service });
    const nodeCell = el('td', { cls: 'td-node', text: lease.node_id + ' / GPU ' + lease.gpu_id });
    const sizeCell = el('td', { cls: 'td-mb', text: sizeText });
    const priorityCell = el('td', { cls: 'td-priority', text: 'p' + lease.priority });

    const ttlCell = document.createElement('td');
    if (!lease.expires_at) {
      // No expiry — show an infinity marker instead of a countdown.
      ttlCell.appendChild(el('span', { cls: 'ttl-label', text: '∞' }));
    } else {
      const remaining = Math.max(0, lease.expires_at - nowSecs);
      // Bar is full at 5 minutes (300 s) remaining and shrinks from there.
      const fillPct = Math.min(100, (remaining / 300) * 100);
      const label = remaining > 60
        ? Math.floor(remaining / 60) + 'm ' + String(Math.floor(remaining % 60)).padStart(2, '0') + 's'
        : Math.floor(remaining) + 's';

      const wrap = el('div', { cls: 'ttl-wrap' });
      const labelSpan = el('span', { cls: 'ttl-label', text: label });
      const barTrack = el('div', { cls: 'ttl-track' });
      barTrack.appendChild(el('div', { cls: 'ttl-fill', style: { width: fillPct.toFixed(1) + '%' } }));
      append(wrap, labelSpan, barTrack);
      ttlCell.appendChild(wrap);
    }

    append(row, serviceCell, nodeCell, sizeCell, priorityCell, ttlCell);
    return row;
  });

  setChildren(tbody, ...rows);
}
|
|
||||||
|
|
||||||
// ── error banner ─────────────────────────────────────────────────
|
|
||||||
// Show the error banner with the given message. Uses textContent so any
// server-provided text stays inert (no HTML injection). The local is named
// `banner` to avoid shadowing the global `el()` element helper.
function showError(msg) {
  const banner = document.getElementById('error-banner');
  banner.textContent = msg;
  banner.style.display = 'block';
}
|
|
||||||
// Hide the error banner.
function clearError() {
  document.getElementById('error-banner').style.display = 'none';
}
|
|
||||||
|
|
||||||
// ── poll ─────────────────────────────────────────────────────────
|
|
||||||
// Fetch all dashboard endpoints in parallel and re-render every panel.
// /api/nodes and /api/leases are required; /api/resident, /api/health and
// /api/services are best-effort and degrade to empty data when unavailable.
async function poll() {
  try {
    const [nodesRes, leasesRes, residentRes, healthRes, servicesRes] = await Promise.all([
      fetch('/api/nodes'),
      fetch('/api/leases'),
      fetch('/api/resident'),
      fetch('/api/health'),
      fetch('/api/services'),
    ]);
    // Bug fix: report the status of whichever required endpoint actually
    // failed — previously a failing /api/leases still showed nodesRes.status.
    if (!nodesRes.ok) throw new Error('API error: ' + nodesRes.status);
    if (!leasesRes.ok) throw new Error('API error: ' + leasesRes.status);
    const [nodesData, leasesData, residentData, servicesData] = await Promise.all([
      nodesRes.json(), leasesRes.json(),
      residentRes.ok ? residentRes.json() : Promise.resolve({ residents: [] }),
      servicesRes.ok ? servicesRes.json() : Promise.resolve({ services: [] }),
    ]);

    // Build per-GPU leased-MB index for the stacked bar.
    const leasedByGpu = {};
    for (const lease of (leasesData.leases || [])) {
      const key = lease.node_id + ':' + lease.gpu_id;
      leasedByGpu[key] = (leasedByGpu[key] || 0) + lease.mb_granted;
    }

    clearError();
    renderHealth(healthRes.ok);
    renderNodes(nodesData.nodes || [], leasedByGpu);
    renderServices(servicesData.services || []);
    renderLeases(leasesData.leases || []);
    renderResidents(residentData.residents || []);
  } catch (err) {
    // Any fetch rejection or non-OK required endpoint lands here.
    showError('Failed to reach coordinator: ' + err.message);
    renderHealth(false);
  }
}
|
|
||||||
|
|
||||||
// Kick off the first refresh immediately, then poll every 5 seconds.
poll();
setInterval(poll, 5000);
|
|
||||||
</script>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
||||||
|
|
@ -1,81 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
from circuitforge_core.resources.models import VRAMLease
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)

# How long request_lease() keeps polling for freed VRAM after evictions.
_DEFAULT_EVICTION_TIMEOUT_S = 10.0


class EvictionEngine:
    """Grants VRAM leases, evicting other leases when the fast path fails.

    Wraps a LeaseManager: a request first tries the fast path (enough free
    VRAM), then falls back to evicting candidate leases and polling until
    the freed capacity becomes grantable or the timeout elapses.
    """

    def __init__(
        self,
        lease_manager: LeaseManager,
        eviction_timeout_s: float = _DEFAULT_EVICTION_TIMEOUT_S,
    ) -> None:
        # lease_manager owns all accounting; this engine only orchestrates
        # the grant → evict → retry sequence around it.
        self.lease_manager = lease_manager
        self._timeout = eviction_timeout_s

    async def request_lease(
        self,
        node_id: str,
        gpu_id: int,
        mb: int,
        service: str,
        priority: int,
        agent_url: str,
        ttl_s: float = 0.0,
    ) -> VRAMLease | None:
        """Grant `mb` MB on (node_id, gpu_id), evicting other leases if needed.

        Returns the granted lease, or None when no eviction candidates exist
        or the freed VRAM did not become grantable within the timeout.
        Candidate selection is delegated to
        LeaseManager.get_eviction_candidates (leases with a strictly greater
        priority number than the requester's).
        """
        # Fast path: enough free VRAM
        lease = await self.lease_manager.try_grant(
            node_id, gpu_id, mb, service, priority, ttl_s
        )
        if lease is not None:
            return lease

        # Find eviction candidates
        candidates = self.lease_manager.get_eviction_candidates(
            node_id=node_id, gpu_id=gpu_id,
            needed_mb=mb, requester_priority=priority,
        )
        if not candidates:
            logger.info(
                "No eviction candidates for %s on %s:GPU%d (%dMB needed)",
                service, node_id, gpu_id, mb,
            )
            return None

        # Evict candidates
        freed_mb = sum(c.mb_granted for c in candidates)
        logger.info(
            "Evicting %d lease(s) to free %dMB for %s",
            len(candidates), freed_mb, service,
        )
        for candidate in candidates:
            await self._evict_lease(candidate, agent_url)

        # Wait for evictions to free up VRAM (poll with timeout).
        # Today _evict_lease releases accounting synchronously, so the first
        # retry usually succeeds; the loop exists for when eviction becomes
        # a real asynchronous agent-side operation.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + self._timeout
        while loop.time() < deadline:
            lease = await self.lease_manager.try_grant(
                node_id, gpu_id, mb, service, priority, ttl_s
            )
            if lease is not None:
                return lease
            await asyncio.sleep(0.1)

        logger.warning("Eviction timed out for %s after %.1fs", service, self._timeout)
        return None

    async def _evict_lease(self, lease: VRAMLease, agent_url: str) -> None:
        """Release lease accounting. Process-level eviction deferred to Plan B."""
        # agent_url is accepted but unused for now — kept for Plan B when the
        # agent process itself must be told to free the VRAM.
        await self.lease_manager.release(lease.lease_id)

    async def _call_agent_evict(self, agent_url: str, lease: VRAMLease) -> bool:
        """POST /evict to the agent. Stub for v1 — real process lookup in Plan B."""
        # Currently never called; always reports success.
        return True
|
|
||||||
|
|
@ -1,130 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
from collections import defaultdict
|
|
||||||
|
|
||||||
from circuitforge_core.resources.models import ResidentAllocation, VRAMLease
|
|
||||||
|
|
||||||
|
|
||||||
class LeaseManager:
    """In-memory accounting of VRAM leases across all registered GPUs.

    try_grant/release are serialized by an asyncio.Lock. The synchronous
    read helpers (get_eviction_candidates, list_leases, ...) run without it;
    this is safe in the coordinator's single-event-loop model because they
    never await mid-iteration.
    """

    def __init__(self) -> None:
        # lease_id -> lease for every currently-granted lease.
        self._leases: dict[str, VRAMLease] = {}
        # (node_id, gpu_id) -> total VRAM MB, filled by register_gpu().
        self._gpu_total: dict[tuple[str, int], int] = {}
        # (node_id, gpu_id) -> MB currently out on lease.
        self._gpu_used: dict[tuple[str, int], int] = defaultdict(int)
        self._lock = asyncio.Lock()
        # Resident allocations — keyed "node_id:service", updated by heartbeat.
        # No lock needed: only the single heartbeat task writes this dict.
        self._residents: dict[str, ResidentAllocation] = {}

    def register_gpu(self, node_id: str, gpu_id: int, total_mb: int) -> None:
        """Declare (or update) a GPU's total VRAM capacity."""
        self._gpu_total[(node_id, gpu_id)] = total_mb

    def gpu_total_mb(self, node_id: str, gpu_id: int) -> int:
        """Total VRAM MB for the GPU, or 0 if it was never registered."""
        return self._gpu_total.get((node_id, gpu_id), 0)

    def used_mb(self, node_id: str, gpu_id: int) -> int:
        """MB currently leased on the GPU (0 if unknown).

        Fix: uses .get() rather than indexing so a read never inserts a
        spurious zero entry into the defaultdict for an unknown GPU.
        """
        return self._gpu_used.get((node_id, gpu_id), 0)

    async def try_grant(
        self,
        node_id: str,
        gpu_id: int,
        mb: int,
        service: str,
        priority: int,
        ttl_s: float = 0.0,
    ) -> VRAMLease | None:
        """Grant `mb` MB if free capacity allows; return the lease or None."""
        async with self._lock:
            total = self._gpu_total.get((node_id, gpu_id), 0)
            used = self._gpu_used[(node_id, gpu_id)]
            if total - used < mb:
                return None
            lease = VRAMLease.create(
                gpu_id=gpu_id, node_id=node_id, mb=mb,
                service=service, priority=priority, ttl_s=ttl_s,
            )
            self._leases[lease.lease_id] = lease
            self._gpu_used[(node_id, gpu_id)] += mb
            return lease

    async def release(self, lease_id: str) -> bool:
        """Release a lease and return True; False when the id is unknown."""
        async with self._lock:
            lease = self._leases.pop(lease_id, None)
            if lease is None:
                return False
            self._gpu_used[(lease.node_id, lease.gpu_id)] -= lease.mb_granted
            return True

    def get_eviction_candidates(
        self,
        node_id: str,
        gpu_id: int,
        needed_mb: int,
        requester_priority: int,
    ) -> list[VRAMLease]:
        """Select leases on the GPU to evict to free at least needed_mb.

        Only leases with a strictly greater priority number than the
        requester's are candidates; they are taken greedily, largest
        priority number first, until the freed total covers needed_mb
        (the result may still fall short if candidates run out).
        """
        candidates = [
            lease for lease in self._leases.values()
            if lease.node_id == node_id
            and lease.gpu_id == gpu_id
            and lease.priority > requester_priority
        ]
        candidates.sort(key=lambda lease: lease.priority, reverse=True)
        selected: list[VRAMLease] = []
        freed = 0
        for candidate in candidates:
            selected.append(candidate)
            freed += candidate.mb_granted
            if freed >= needed_mb:
                break
        return selected

    def list_leases(
        self, node_id: str | None = None, gpu_id: int | None = None
    ) -> list[VRAMLease]:
        """Return leases, optionally filtered by node and/or GPU."""
        return [
            lease for lease in self._leases.values()
            if (node_id is None or lease.node_id == node_id)
            and (gpu_id is None or lease.gpu_id == gpu_id)
        ]

    def all_leases(self) -> list[VRAMLease]:
        """Return every currently-granted lease."""
        return list(self._leases.values())

    # ── resident tracking ────────────────────────────────────────────

    def set_residents_for_node(
        self,
        node_id: str,
        residents: list[tuple[str, str | None]],  # (service, model_name)
    ) -> None:
        """
        Replace the resident snapshot for a node.

        Preserves first_seen for entries whose service+model_name are unchanged,
        so the dashboard can show how long a model has been warm.
        """
        new_keys = {f"{node_id}:{service}" for service, _ in residents}

        # Remove stale entries (service no longer running on this node).
        for key in list(self._residents):
            if key.startswith(f"{node_id}:") and key not in new_keys:
                del self._residents[key]

        # Upsert: preserve first_seen when model is unchanged, reset otherwise.
        for service, model_name in residents:
            key = f"{node_id}:{service}"
            existing = self._residents.get(key)
            if existing is not None and existing.model_name == model_name:
                continue  # same model still loaded — keep original first_seen
            self._residents[key] = ResidentAllocation(
                service=service,
                node_id=node_id,
                model_name=model_name,
            )

    def all_residents(self) -> list[ResidentAllocation]:
        """Return the current resident snapshot across all nodes."""
        return list(self._residents.values())

    def resident_keys(self) -> set[str]:
        """Return set of 'node_id:service' strings for currently-warm services."""
        return set(self._residents.keys())
|
|
||||||
|
|
@ -1,74 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import TYPE_CHECKING
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
|
||||||
|
|
||||||
_WARM_BONUS_MB = 1000
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
|
||||||
class _Scored:
|
|
||||||
node_id: str
|
|
||||||
gpu_id: int
|
|
||||||
vram_free_mb: int
|
|
||||||
effective_free_mb: int
|
|
||||||
can_fit: bool
|
|
||||||
warm: bool
|
|
||||||
|
|
||||||
|
|
||||||
def select_node(
|
|
||||||
agents: "dict[str, AgentRecord]",
|
|
||||||
service: str,
|
|
||||||
profile_registry: "ProfileRegistry",
|
|
||||||
resident_keys: set[str],
|
|
||||||
) -> tuple[str, int] | None:
|
|
||||||
"""
|
|
||||||
Pick the best (node_id, gpu_id) for the requested service.
|
|
||||||
Warm nodes (service already running) get priority, then sorted by free VRAM.
|
|
||||||
Returns None if no suitable node exists.
|
|
||||||
"""
|
|
||||||
service_max_mb = _find_service_max_mb(service, profile_registry)
|
|
||||||
if service_max_mb is None:
|
|
||||||
return None # service not in any profile
|
|
||||||
|
|
||||||
candidates: list[_Scored] = []
|
|
||||||
for node_id, record in agents.items():
|
|
||||||
if not record.online:
|
|
||||||
continue
|
|
||||||
for gpu in record.gpus:
|
|
||||||
warm = f"{node_id}:{service}" in resident_keys
|
|
||||||
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
|
|
||||||
can_fit = gpu.vram_free_mb >= service_max_mb
|
|
||||||
candidates.append(_Scored(
|
|
||||||
node_id=node_id,
|
|
||||||
gpu_id=gpu.gpu_id,
|
|
||||||
vram_free_mb=gpu.vram_free_mb,
|
|
||||||
effective_free_mb=effective,
|
|
||||||
can_fit=can_fit,
|
|
||||||
warm=warm,
|
|
||||||
))
|
|
||||||
if not candidates:
|
|
||||||
return None
|
|
||||||
# Prefer: (1) warm nodes (model already resident — no cold start)
|
|
||||||
# (2) cold nodes that can fit the service (free >= half of max_mb)
|
|
||||||
# Fallback: best-effort node when nothing fits and nothing is warm
|
|
||||||
# (coordinator will attempt to start the service anyway; it may evict or fail)
|
|
||||||
# Note: resident_keys are per-node, not per-GPU. On multi-GPU nodes, the warm
|
|
||||||
# bonus applies to all GPUs on the node. This is a known coarseness —
|
|
||||||
# per-GPU resident tracking requires a resident_key format change.
|
|
||||||
preferred = [c for c in candidates if c.warm or c.can_fit]
|
|
||||||
pool = preferred if preferred else candidates
|
|
||||||
best = max(pool, key=lambda c: (c.warm, c.effective_free_mb))
|
|
||||||
return best.node_id, best.gpu_id
|
|
||||||
|
|
||||||
|
|
||||||
def _find_service_max_mb(service: str, profile_registry: "ProfileRegistry") -> int | None:
|
|
||||||
for profile in profile_registry.list_public():
|
|
||||||
svc = profile.services.get(service)
|
|
||||||
if svc is not None:
|
|
||||||
return svc.max_mb
|
|
||||||
return None
|
|
||||||
|
|
@ -1,85 +0,0 @@
|
||||||
"""
|
|
||||||
circuitforge_core.resources.coordinator.node_store — SQLite persistence for known agent nodes.
|
|
||||||
|
|
||||||
Gives the coordinator restart-safe memory of which nodes have ever registered.
|
|
||||||
On startup the coordinator reloads all known nodes and immediately probes them;
|
|
||||||
nodes that respond come back online within one heartbeat cycle (~10 s) without
|
|
||||||
any manual intervention on the agent hosts.
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import sqlite3
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)

# Default on-disk location for the known-nodes database.
_DEFAULT_DB_PATH = Path.home() / ".local" / "share" / "circuitforge" / "cf-orch-nodes.db"
_STALE_AGE_DAYS = 30  # nodes unseen for this long are pruned automatically


class NodeStore:
    """
    SQLite-backed registry of agent nodes that survives coordinator restarts.

    Thread-safe for single-writer use (coordinator runs in one asyncio thread).
    """

    def __init__(self, db_path: Path = _DEFAULT_DB_PATH) -> None:
        self.db_path = db_path
        # Ensure the parent directory exists before connecting.
        db_path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._conn.row_factory = sqlite3.Row
        self._migrate()
        logger.debug("NodeStore initialised at %s", db_path)

    def _migrate(self) -> None:
        """Create the known_nodes table when it does not exist yet."""
        self._conn.executescript("""
            CREATE TABLE IF NOT EXISTS known_nodes (
                node_id TEXT PRIMARY KEY,
                agent_url TEXT NOT NULL,
                last_seen REAL NOT NULL
            );
        """)
        self._conn.commit()

    def upsert(self, node_id: str, agent_url: str) -> None:
        """Record or update a node. Called on every successful registration."""
        row = (node_id, agent_url, time.time())
        self._conn.execute(
            """
            INSERT INTO known_nodes (node_id, agent_url, last_seen)
            VALUES (?, ?, ?)
            ON CONFLICT(node_id) DO UPDATE SET
                agent_url = excluded.agent_url,
                last_seen = excluded.last_seen
            """,
            row,
        )
        self._conn.commit()

    def all(self) -> list[tuple[str, str]]:
        """Return all known (node_id, agent_url) pairs, most recent first."""
        query = "SELECT node_id, agent_url FROM known_nodes ORDER BY last_seen DESC"
        return [(row["node_id"], row["agent_url"]) for row in self._conn.execute(query)]

    def remove(self, node_id: str) -> None:
        """Forget a node entirely."""
        self._conn.execute("DELETE FROM known_nodes WHERE node_id = ?", (node_id,))
        self._conn.commit()

    def prune_stale(self, max_age_days: int = _STALE_AGE_DAYS) -> int:
        """Delete nodes not seen within max_age_days. Returns count removed."""
        cutoff = time.time() - max_age_days * 86400
        cursor = self._conn.execute(
            "DELETE FROM known_nodes WHERE last_seen < ?", (cutoff,)
        )
        self._conn.commit()
        removed = cursor.rowcount
        if removed:
            logger.info("NodeStore: pruned %d stale node(s) (>%d days old)", removed, max_age_days)
        return removed

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()
|
|
||||||
|
|
@ -1,65 +0,0 @@
|
||||||
# circuitforge_core/resources/coordinator/profile_registry.py
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from circuitforge_core.resources.models import GpuInfo
|
|
||||||
from circuitforge_core.resources.profiles.schema import GpuProfile, load_profile
|
|
||||||
|
|
||||||
_PUBLIC_DIR = Path(__file__).parent.parent / "profiles" / "public"

# (minimum primary-GPU VRAM in MB, profile name) — checked top-down, so the
# first threshold the hardware clears wins.
_PROFILE_THRESHOLDS = [
    (22000, "single-gpu-24gb"),
    (14000, "single-gpu-16gb"),
    (8000, "single-gpu-8gb"),
    (5500, "single-gpu-6gb"),
    (3500, "single-gpu-4gb"),
    (0, "single-gpu-2gb"),
]

_log = logging.getLogger(__name__)


class ProfileRegistry:
    """Loads GPU profiles from YAML and matches them to detected hardware."""

    def __init__(self, extra_dirs: list[Path] | None = None) -> None:
        self._profiles: dict[str, GpuProfile] = {}
        self._load_dir(_PUBLIC_DIR)
        for extra in extra_dirs or []:
            if extra.exists():
                self._load_dir(extra)

    def _load_dir(self, directory: Path) -> None:
        """Load every *.yaml profile in the directory, skipping invalid files."""
        for candidate in directory.glob("*.yaml"):
            try:
                loaded = load_profile(candidate)
                self._profiles[loaded.name] = loaded
            except Exception as exc:
                _log.warning("Skipping %s: %s", candidate, exc)

    def load(self, path: Path) -> GpuProfile:
        """Load a single profile file, register it, and return it."""
        loaded = load_profile(path)
        self._profiles[loaded.name] = loaded
        return loaded

    def list_public(self) -> list[GpuProfile]:
        """Return the GPU-matchable profiles only.

        CPU profiles (cpu-*) are intentionally excluded — this listing is
        used to match GPU hardware. CPU inference nodes self-select their
        profile via the CLI and are not listed for lease matching.
        """
        return [
            profile for profile in self._profiles.values()
            if profile.name.startswith("single-gpu-")
        ]

    def get(self, name: str) -> GpuProfile | None:
        """Return the named profile, or None when unknown."""
        return self._profiles.get(name)

    def auto_detect(self, gpus: list[GpuInfo]) -> GpuProfile:
        """Pick the profile matching the primary GPU's VRAM (largest tier first)."""
        primary_mb = gpus[0].vram_total_mb if gpus else 0
        for minimum_mb, name in _PROFILE_THRESHOLDS:
            if primary_mb >= minimum_mb:
                match = self._profiles.get(name)
                if match:
                    return match
        # Last resort: the smallest public profile is expected to exist.
        return self._profiles["single-gpu-2gb"]
|
|
||||||
|
|
@ -1,173 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import dataclasses
|
|
||||||
import time
|
|
||||||
import uuid
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Literal
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class ServiceAllocation:
    """One caller's claim on a running service instance."""

    allocation_id: str  # UUID4 string, unique per allocation
    service: str
    node_id: str
    gpu_id: int
    model: str | None
    caller: str
    url: str
    created_at: float   # epoch seconds at allocation time
    expires_at: float   # epoch seconds; 0 = no expiry


@dataclass
class ServiceInstance:
    """State of one running service container on a specific (node, GPU)."""

    service: str
    node_id: str
    gpu_id: int
    state: Literal["starting", "running", "idle", "stopped"]
    model: str | None
    url: str | None
    idle_since: float | None = None  # epoch seconds when it last went idle
    health_path: str = "/health"


class ServiceRegistry:
    """
    In-memory registry of service allocations and instance state.

    Allocations: per-caller request — many per service instance.
    Instances: per (service, node_id, gpu_id) — one per running container.
    """

    def __init__(self) -> None:
        self._allocations: dict[str, ServiceAllocation] = {}
        self._instances: dict[str, ServiceInstance] = {}  # key: "service:node_id:gpu_id"

    # ── allocation API ────────────────────────────────────────────────

    def allocate(
        self,
        service: str,
        node_id: str,
        gpu_id: int,
        model: str | None,
        url: str,
        caller: str,
        ttl_s: float,
    ) -> ServiceAllocation:
        """Record a new allocation; revives an idle/stopped instance to 'running'.

        ttl_s <= 0 means the allocation never expires (expires_at == 0.0).
        """
        # Fix: take ONE timestamp so expires_at is exactly created_at + ttl_s
        # (previously two time.time() calls let the two fields drift apart).
        now = time.time()
        alloc = ServiceAllocation(
            allocation_id=str(uuid.uuid4()),
            service=service,
            node_id=node_id,
            gpu_id=gpu_id,
            model=model,
            caller=caller,
            url=url,
            created_at=now,
            expires_at=now + ttl_s if ttl_s > 0 else 0.0,
        )
        self._allocations[alloc.allocation_id] = alloc

        # If an instance exists in idle/stopped state, mark it running again.
        key = f"{service}:{node_id}:{gpu_id}"
        if key in self._instances:
            inst = self._instances[key]
            if inst.state in ("idle", "stopped"):
                self._instances[key] = dataclasses.replace(
                    inst, state="running", idle_since=None
                )
        return alloc

    def release(self, allocation_id: str) -> bool:
        """Drop an allocation; returns False when the id is unknown.

        When the released allocation was the last one on its instance, the
        instance transitions to 'idle' with idle_since set to now.
        """
        alloc = self._allocations.pop(allocation_id, None)
        if alloc is None:
            return False
        key = f"{alloc.service}:{alloc.node_id}:{alloc.gpu_id}"
        if self.active_allocations(alloc.service, alloc.node_id, alloc.gpu_id) == 0:
            if key in self._instances:
                self._instances[key] = dataclasses.replace(
                    self._instances[key], state="idle", idle_since=time.time()
                )
        return True

    def active_allocations(self, service: str, node_id: str, gpu_id: int) -> int:
        """Count allocations currently held against one instance."""
        return sum(
            1 for a in self._allocations.values()
            if a.service == service and a.node_id == node_id and a.gpu_id == gpu_id
        )

    # ── instance API ─────────────────────────────────────────────────

    def upsert_instance(
        self,
        service: str,
        node_id: str,
        gpu_id: int,
        state: Literal["starting", "running", "idle", "stopped"],
        model: str | None,
        url: str | None,
        health_path: str = "/health",
    ) -> ServiceInstance:
        """Create or replace the instance record for (service, node, GPU)."""
        key = f"{service}:{node_id}:{gpu_id}"
        existing = self._instances.get(key)
        idle_since: float | None = None
        if state == "idle":
            # Preserve idle_since if already idle; set now if transitioning into idle.
            idle_since = existing.idle_since if (existing and existing.state == "idle") else time.time()
        inst = ServiceInstance(
            service=service, node_id=node_id, gpu_id=gpu_id,
            state=state, model=model, url=url, idle_since=idle_since,
            health_path=health_path,
        )
        self._instances[key] = inst
        return inst

    def get_allocation(self, allocation_id: str) -> ServiceAllocation | None:
        """Return the allocation, or None when unknown."""
        return self._allocations.get(allocation_id)

    def sweep_expired_allocations(self) -> list[str]:
        """
        Remove all allocations whose TTL has elapsed and transition the
        corresponding instance to 'idle' if no active allocations remain.
        Returns the list of expired allocation_ids.
        """
        now = time.time()
        expired = [
            alloc_id
            for alloc_id, alloc in self._allocations.items()
            if alloc.expires_at > 0 and now > alloc.expires_at
        ]
        for alloc_id in expired:
            self.release(alloc_id)
        return expired

    def all_allocations(self) -> list[ServiceAllocation]:
        """Return every live allocation."""
        return list(self._allocations.values())

    def all_instances(self) -> list[ServiceInstance]:
        """Return every known instance record."""
        return list(self._instances.values())

    def mark_stopped(self, service: str, node_id: str, gpu_id: int) -> None:
        """Transition an instance to 'stopped' state and clear idle_since."""
        key = f"{service}:{node_id}:{gpu_id}"
        if key in self._instances:
            self._instances[key] = dataclasses.replace(
                self._instances[key], state="stopped", idle_since=None
            )

    def idle_past_timeout(self, idle_stop_config: dict[str, int]) -> list[ServiceInstance]:
        """
        Return instances in 'idle' state whose idle time exceeds their configured timeout.
        idle_stop_config: {service_name: seconds} — 0 means never stop automatically.
        """
        now = time.time()
        result = []
        for inst in self._instances.values():
            if inst.state != "idle" or inst.idle_since is None:
                continue
            timeout = idle_stop_config.get(inst.service, 0)
            if timeout > 0 and (now - inst.idle_since) >= timeout:
                result.append(inst)
        return result
|
|
||||||
|
|
@ -1,250 +0,0 @@
|
||||||
"""
|
|
||||||
cf-docuvision — managed document understanding service.
|
|
||||||
|
|
||||||
Wraps ByteDance/Dolphin-v2 (Qwen2.5-VL backbone) behind a simple HTTP API.
|
|
||||||
Managed by cf-orch; started/stopped as a ProcessSpec service.
|
|
||||||
|
|
||||||
API
|
|
||||||
---
|
|
||||||
GET /health → {"status": "ok", "model": "<path>"}
|
|
||||||
POST /extract → ExtractResponse
|
|
||||||
|
|
||||||
Usage (standalone)::
|
|
||||||
|
|
||||||
python -m circuitforge_core.resources.docuvision.app \\
|
|
||||||
--model /Library/Assets/LLM/docuvision/models/dolphin-v2 \\
|
|
||||||
--port 8003 --gpu-id 0
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import base64
|
|
||||||
import io
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import uvicorn
|
|
||||||
from fastapi import FastAPI, HTTPException
|
|
||||||
from pydantic import BaseModel
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Module-level state — populated by _load_model() on first /extract call
_model: Any = None        # the loaded causal-LM model object (None until first use)
_processor: Any = None    # the matching processor, loaded alongside _model
_model_path: str = ""     # model directory; presumably set from the --model CLI arg — parser not visible here, verify
_device: str = "cpu"      # "cuda" when torch detects a GPU at load time, else "cpu"
|
|
||||||
|
|
||||||
|
|
||||||
# ── lazy loader ───────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def _load_model() -> None:
    """Lazy-load Dolphin-v2. Called once on first /extract request.

    Populates the module-level _model/_processor/_device globals; subsequent
    calls return immediately once _model is set.
    """
    global _model, _processor, _device

    if _model is not None:
        return

    # Imported lazily so the service can start (and answer /health) without
    # pulling torch/transformers into memory.
    import torch
    from transformers import AutoProcessor, AutoModelForCausalLM

    logger.info("Loading Dolphin-v2 from %s ...", _model_path)
    _device = "cuda" if torch.cuda.is_available() else "cpu"

    # NOTE(review): trust_remote_code=True executes Python shipped with the
    # model directory — acceptable only while _model_path points at a locally
    # vetted checkout; never point it at an untrusted download.
    _processor = AutoProcessor.from_pretrained(
        _model_path,
        trust_remote_code=True,
    )
    _model = AutoModelForCausalLM.from_pretrained(
        _model_path,
        trust_remote_code=True,
        # float16 on GPU halves VRAM; CPU stays at float32.
        torch_dtype=torch.float16 if _device == "cuda" else torch.float32,
        device_map=_device,
    )
    _model.eval()  # inference mode (e.g. disables dropout)
    logger.info("Dolphin-v2 loaded on %s", _device)
|
|
||||||
|
|
||||||
|
|
||||||
# ── FastAPI app ───────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
@asynccontextmanager
async def _lifespan(app: FastAPI):
    # Intentionally empty: the model is lazy-loaded on the first /extract
    # request (see _load_model), so startup/shutdown need no work.
    yield


app = FastAPI(title="cf-docuvision", lifespan=_lifespan)
|
|
||||||
|
|
||||||
|
|
||||||
# ── request / response models ─────────────────────────────────────────────────
|
|
||||||
|
|
||||||
class ExtractRequest(BaseModel):
    """
    Request body for POST /extract.

    Either image_b64 (base64-encoded bytes) or image_path (absolute path) must
    be provided; image_b64 wins when both are set. hint guides the extraction
    mode:
    - "auto" - Dolphin-v2 detects layout and element types automatically
    - "table" - optimise for tabular data (receipts, invoices, forms)
    - "text" - optimise for dense prose (contracts, letters)
    - "form" - optimise for form field extraction
    """
    image_b64: str | None = None   # base64-encoded image bytes
    image_path: str | None = None  # absolute path readable by this process
    hint: str = "auto"             # unknown values fall back to "auto"
|
|
||||||
|
|
||||||
|
|
||||||
class ElementOut(BaseModel):
    """One layout element extracted from the page."""
    type: str  # heading | paragraph | list | table | figure | formula | code
    text: str
    bbox: list[float] | None = None  # [x0, y0, x1, y1] normalised 0-1 if available
|
|
||||||
|
|
||||||
|
|
||||||
class TableOut(BaseModel):
    """A table rendered as HTML, with optional normalised bounding box."""
    html: str
    bbox: list[float] | None = None  # same [x0, y0, x1, y1] convention as ElementOut
|
|
||||||
|
|
||||||
|
|
||||||
class ExtractResponse(BaseModel):
    """Full extraction result for one image (see the /extract route)."""
    elements: list[ElementOut]  # all elements, tables included
    raw_text: str               # newline-joined text of all elements
    tables: list[TableOut]      # table elements only, as HTML
    metadata: dict[str, Any]    # hint, image width/height, model path, device
|
|
||||||
|
|
||||||
|
|
||||||
# ── helpers ───────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
# Prompt sent to Dolphin-v2 for each ExtractRequest.hint value. Unknown
# hints fall back to the "auto" entry (see the /extract route).
_HINT_PROMPTS: dict[str, str] = {
    "auto": "Parse this document. Extract all elements with their types and text content.",
    "table": "Extract all tables from this document as structured HTML. Also extract any line-item text.",
    "text": "Extract all text from this document preserving paragraph and heading structure.",
    "form": "Extract all form fields from this document. Return field labels and their values.",
}
|
|
||||||
|
|
||||||
|
|
||||||
def _image_from_request(req: ExtractRequest):
    """Decode the request payload into an RGB PIL Image.

    image_b64 takes precedence over image_path. Raises HTTPException 404
    for a missing path and 422 when neither field is supplied.
    """
    from PIL import Image

    if req.image_b64:
        decoded = base64.b64decode(req.image_b64)
        buffer = io.BytesIO(decoded)
        return Image.open(buffer).convert("RGB")

    if req.image_path:
        from pathlib import Path

        candidate = Path(req.image_path)
        if not candidate.exists():
            raise HTTPException(status_code=404, detail=f"image_path not found: {req.image_path}")
        return Image.open(candidate).convert("RGB")

    raise HTTPException(status_code=422, detail="Either image_b64 or image_path must be provided")
|
|
||||||
|
|
||||||
|
|
||||||
def _parse_dolphin_output(raw: str) -> tuple[list[ElementOut], list[TableOut], str]:
    """
    Parse Dolphin-v2's structured output into elements and tables.

    Dolphin-v2 returns a JSON array of element dicts with keys:
        type, text, [html], [bbox]

    Falls back gracefully if the model returns plain text, and tolerates
    malformed entries inside the JSON array instead of failing the request.
    """
    elements: list[ElementOut] = []
    tables: list[TableOut] = []

    # Try JSON parse first
    try:
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            for item in parsed:
                # Bug fix: a non-dict entry (e.g. a bare string in the array)
                # previously raised AttributeError on .get(), which escaped
                # the except clause below and surfaced as a 500 to callers.
                if not isinstance(item, dict):
                    elements.append(ElementOut(type="paragraph", text=str(item)))
                    continue
                etype = item.get("type", "paragraph")
                text = item.get("text", "")
                bbox = item.get("bbox")
                if etype == "table":
                    # Tables are reported twice: once in `tables` as HTML and
                    # once in the unified `elements` stream.
                    tables.append(TableOut(html=item.get("html", text), bbox=bbox))
                elements.append(ElementOut(type=etype, text=text, bbox=bbox))
            raw_text = "\n".join(e.text for e in elements)
            return elements, tables, raw_text
    except (json.JSONDecodeError, TypeError):
        pass

    # Plain-text fallback: treat entire output as a single paragraph
    elements = [ElementOut(type="paragraph", text=raw.strip())]
    return elements, tables, raw.strip()
|
|
||||||
|
|
||||||
|
|
||||||
# ── routes ────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
@app.get("/health")
async def health() -> dict[str, str]:
    # Liveness probe. Reports the configured model path (set in main());
    # it does NOT indicate whether the model is actually loaded yet.
    return {"status": "ok", "model": _model_path}
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/extract", response_model=ExtractResponse)
async def extract(req: ExtractRequest) -> ExtractResponse:
    """Run Dolphin-v2 on one image and return structured elements.

    Lazy-loads the model on first call. Raises HTTPException 404/422 for
    bad image inputs (see _image_from_request).
    """
    _load_model()

    image = _image_from_request(req)
    prompt = _HINT_PROMPTS.get(req.hint, _HINT_PROMPTS["auto"])

    # Local import: torch is only guaranteed importable here because
    # _load_model above already imported it successfully.
    import torch

    inputs = _processor(
        text=prompt,
        images=image,
        return_tensors="pt",
    ).to(_device)

    with torch.no_grad():
        output_ids = _model.generate(
            **inputs,
            max_new_tokens=2048,
            do_sample=False,  # greedy decoding: deterministic output
        )

    # Decode only the newly generated tokens
    input_len = inputs["input_ids"].shape[1]
    raw_output = _processor.decode(
        output_ids[0][input_len:],
        skip_special_tokens=True,
    )

    elements, tables, raw_text = _parse_dolphin_output(raw_output)

    w, h = image.size

    return ExtractResponse(
        elements=elements,
        raw_text=raw_text,
        tables=tables,
        metadata={
            "hint": req.hint,
            "width": w,
            "height": h,
            "model": _model_path,
            "device": _device,
        },
    )
|
|
||||||
|
|
||||||
|
|
||||||
# ── CLI entry point ───────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def main() -> None:
    """CLI entry point: parse args, record the model path, start uvicorn."""
    parser = argparse.ArgumentParser(description="cf-docuvision service")
    parser.add_argument("--model", required=True, help="Path to Dolphin-v2 model directory")
    parser.add_argument("--port", type=int, default=8003)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0)
    args = parser.parse_args()

    # Stash the model path for the lazy loader (_load_model).
    global _model_path
    _model_path = args.model

    import os
    # Safe to set here: torch import is deferred to _load_model, so CUDA
    # device visibility is decided before torch initialises. setdefault
    # lets an explicitly exported CUDA_VISIBLE_DEVICES win over --gpu-id.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
|
|
||||||
|
|
@ -1,137 +0,0 @@
|
||||||
"""Generic OpenAI-compatible inference server for HuggingFace causal LMs."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import argparse
|
|
||||||
import time
|
|
||||||
import uuid
|
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import uvicorn
|
|
||||||
from fastapi import FastAPI, HTTPException
|
|
||||||
from pydantic import BaseModel
|
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
||||||
|
|
||||||
# Module-level state — populated by _load_model() in main() before serving.
_model: Any = None       # HF causal LM
_tokenizer: Any = None   # matching tokenizer
_model_id: str = ""      # model path; reported by /health and /v1/models
_device: str = "cpu"     # "cuda:<gpu_id>" when CUDA is available
|
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    # No startup/shutdown work: the model is loaded synchronously in
    # main() before uvicorn starts serving.
    yield


app = FastAPI(lifespan=lifespan)
|
|
||||||
|
|
||||||
|
|
||||||
class Message(BaseModel):
    """One chat turn in the OpenAI chat-completions format."""
    role: str     # forwarded verbatim to the tokenizer's chat template
    content: str
|
|
||||||
|
|
||||||
|
|
||||||
class ChatRequest(BaseModel):
    """Subset of the OpenAI /v1/chat/completions request body."""
    model: str | None = None       # accepted but ignored; this server hosts one model
    messages: list[Message]
    max_tokens: int | None = 512
    temperature: float | None = 0.7  # 0 selects greedy decoding
    stream: bool | None = False      # True is rejected with HTTP 501
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/health")
def health() -> dict[str, str]:
    # Liveness probe; _model_id is "" until _load_model() has run in main().
    return {"status": "ok", "model": _model_id}
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/v1/models")
def list_models() -> dict[str, Any]:
    """OpenAI-compatible model listing; always exactly one entry."""
    entry = {"id": _model_id, "object": "model", "owned_by": "cf-orch"}
    return {"object": "list", "data": [entry]}
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest) -> dict[str, Any]:
    """OpenAI-compatible, non-streaming chat completion on the loaded model.

    Raises HTTPException 503 before the model is loaded, 501 for
    stream=True, and 500 when the chat template rejects the conversation.
    """
    if _model is None:
        raise HTTPException(503, detail="Model not loaded")
    if req.stream:
        raise HTTPException(501, detail="Streaming not supported")

    conversation = [{"role": m.role, "content": m.content} for m in req.messages]
    try:
        encoded = _tokenizer.apply_chat_template(
            conversation,
            return_tensors="pt",
            add_generation_prompt=True,
        )
        # transformers 5.x returns BatchEncoding; 4.x returned a bare tensor
        input_ids = (encoded.input_ids if hasattr(encoded, "input_ids") else encoded).to(_device)
    except Exception as exc:
        raise HTTPException(500, detail=f"Tokenisation failed: {exc}")

    # None means "use default"; 0 is a valid temperature (greedy).
    max_new = req.max_tokens or 512
    temp = req.temperature if req.temperature is not None else 0.7
    gen_kwargs: dict[str, Any] = {
        "max_new_tokens": max_new,
        "do_sample": temp > 0,
        "pad_token_id": _tokenizer.eos_token_id,
    }
    if temp > 0:
        # temperature is only meaningful when sampling is enabled
        gen_kwargs["temperature"] = temp

    with torch.inference_mode():
        output_ids = _model.generate(input_ids, **gen_kwargs)

    # Strip the prompt: keep only the newly generated tokens.
    new_tokens = output_ids[0][input_ids.shape[-1]:]
    reply = _tokenizer.decode(new_tokens, skip_special_tokens=True)

    return {
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": _model_id,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": reply},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": input_ids.shape[-1],
            "completion_tokens": len(new_tokens),
            "total_tokens": input_ids.shape[-1] + len(new_tokens),
        },
    }
|
|
||||||
|
|
||||||
|
|
||||||
def _load_model(model_path: str, gpu_id: int) -> None:
    """Load tokenizer + model onto the chosen GPU (or CPU) into module state."""
    global _model, _tokenizer, _model_id, _device
    _device = f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu"
    _model_id = model_path  # surfaced via /health and /v1/models
    _tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    _model = AutoModelForCausalLM.from_pretrained(
        model_path,
        dtype=torch.float16 if "cuda" in _device else torch.float32,  # fp16 only on GPU
        device_map={"": _device},  # pin the entire model to a single device
        trust_remote_code=True,
    )
    _model.eval()
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """CLI entry point: load the model synchronously, then serve forever."""
    parser = argparse.ArgumentParser(description="cf-orch generic LLM inference server")
    parser.add_argument("--model", required=True)
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0)
    args = parser.parse_args()
    # Blocking load before serving: the OpenAI endpoints are usable as
    # soon as uvicorn accepts connections.
    _load_model(args.model, args.gpu_id)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")


if __name__ == "__main__":
    main()
|
|
||||||
|
|
@ -1,66 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import time
|
|
||||||
import uuid
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
class VRAMLease:
    """An immutable grant of VRAM on one GPU to one service."""

    lease_id: str        # random UUID string, unique per grant
    gpu_id: int
    node_id: str
    mb_granted: int
    holder_service: str
    priority: int
    expires_at: float  # unix timestamp; 0.0 = no expiry

    @classmethod
    def create(
        cls,
        gpu_id: int,
        node_id: str,
        mb: int,
        service: str,
        priority: int,
        ttl_s: float = 0.0,
    ) -> VRAMLease:
        """Build a lease with a fresh id; ttl_s <= 0 means it never expires."""
        if ttl_s > 0.0:
            deadline = time.time() + ttl_s
        else:
            deadline = 0.0
        return cls(
            lease_id=str(uuid.uuid4()),
            gpu_id=gpu_id,
            node_id=node_id,
            mb_granted=mb,
            holder_service=service,
            priority=priority,
            expires_at=deadline,
        )

    def is_expired(self) -> bool:
        """True once a finite deadline has passed; leases with no expiry never expire."""
        if self.expires_at <= 0.0:
            return False
        return time.time() > self.expires_at
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
class GpuInfo:
    """Point-in-time VRAM snapshot for a single GPU."""
    gpu_id: int
    name: str            # card name as reported by the probe
    vram_total_mb: int
    vram_used_mb: int
    vram_free_mb: int    # carried explicitly rather than derived
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
class ResidentAllocation:
    """A model that is loaded and warm in VRAM but not actively serving a request."""
    service: str
    node_id: str
    model_name: Optional[str]  # None if service is running but model probe failed
    # Timestamp of when this allocation was first observed.
    first_seen: float = field(default_factory=time.time)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class NodeInfo:
    """Mutable record of one agent node and its GPUs (mutable: heartbeat updates)."""
    node_id: str
    agent_url: str  # base URL of the node's resource agent
    gpus: list[GpuInfo]
    last_heartbeat: float = field(default_factory=time.time)
|
|
||||||
|
|
@ -1,41 +0,0 @@
|
||||||
# CPU-only profile for ~16 GB RAM hosts: every service has max_mb: 0
# (no VRAM budget) and concurrency is kept minimal.
schema_version: 1
name: cpu-16gb
eviction_timeout_s: 30.0
services:
  ollama:
    max_mb: 0
    priority: 1
    managed:
      type: process
      adopt: true  # claim an already-running daemon instead of spawning one
      exec_path: "/usr/local/bin/ollama"
      args_template: "serve"
      port: 11434
      host_port: 11434
      health_path: /api/tags
  cf-stt:
    max_mb: 0
    priority: 2
    shared: true
    max_concurrent: 1
    backend: moonshine  # STT backend selected for this tier
  cf-tts:
    max_mb: 0
    priority: 2
    shared: true
    max_concurrent: 1
  cf-embed:
    max_mb: 0
    priority: 2
    shared: true
    max_concurrent: 2
    always_on: true
  cf-classify:
    max_mb: 0
    priority: 2
    shared: true
    max_concurrent: 2
    always_on: true
model_size_hints:
  llm_max_params: 3b-q4
  image_gen_max: none
|
|
||||||
|
|
@ -1,41 +0,0 @@
|
||||||
schema_version: 1
|
|
||||||
name: cpu-32gb
|
|
||||||
eviction_timeout_s: 30.0
|
|
||||||
services:
|
|
||||||
ollama:
|
|
||||||
max_mb: 0
|
|
||||||
priority: 1
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
adopt: true
|
|
||||||
exec_path: "/usr/local/bin/ollama"
|
|
||||||
args_template: "serve"
|
|
||||||
port: 11434
|
|
||||||
host_port: 11434
|
|
||||||
health_path: /api/tags
|
|
||||||
cf-stt:
|
|
||||||
max_mb: 0
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 2
|
|
||||||
backend: faster-whisper
|
|
||||||
cf-tts:
|
|
||||||
max_mb: 0
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 2
|
|
||||||
cf-embed:
|
|
||||||
max_mb: 0
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 4
|
|
||||||
always_on: true
|
|
||||||
cf-classify:
|
|
||||||
max_mb: 0
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 4
|
|
||||||
always_on: true
|
|
||||||
model_size_hints:
|
|
||||||
llm_max_params: 7b-q4
|
|
||||||
image_gen_max: none
|
|
||||||
|
|
@ -1,73 +0,0 @@
|
||||||
schema_version: 1
|
|
||||||
name: single-gpu-16gb
|
|
||||||
vram_total_mb: 16384
|
|
||||||
eviction_timeout_s: 10.0
|
|
||||||
services:
|
|
||||||
vllm:
|
|
||||||
max_mb: 9000
|
|
||||||
priority: 1
|
|
||||||
idle_stop_after_s: 600
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
|
||||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
|
||||||
port: 8000
|
|
||||||
host_port: 8000
|
|
||||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
|
||||||
ollama:
|
|
||||||
max_mb: 12288
|
|
||||||
priority: 1
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
adopt: true
|
|
||||||
exec_path: "/usr/local/bin/ollama"
|
|
||||||
args_template: "serve"
|
|
||||||
port: 11434
|
|
||||||
host_port: 11434
|
|
||||||
health_path: /api/tags
|
|
||||||
cf-vision:
|
|
||||||
max_mb: 3072
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 4
|
|
||||||
cf-docuvision:
|
|
||||||
max_mb: 6144
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 3
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
|
||||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
|
||||||
port: 8003
|
|
||||||
host_port: 8003
|
|
||||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
|
||||||
cf-stt:
|
|
||||||
max_mb: 1200
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 3
|
|
||||||
backend: parakeet-tdt
|
|
||||||
cf-tts:
|
|
||||||
max_mb: 1024
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 3
|
|
||||||
cf-embed:
|
|
||||||
max_mb: 512
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 6
|
|
||||||
always_on: true
|
|
||||||
cf-classify:
|
|
||||||
max_mb: 512
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 6
|
|
||||||
always_on: true
|
|
||||||
comfyui:
|
|
||||||
max_mb: 14336
|
|
||||||
priority: 4
|
|
||||||
model_size_hints:
|
|
||||||
llm_max_params: 34b
|
|
||||||
image_gen_max: flux-dev-fp8
|
|
||||||
|
|
@ -1,73 +0,0 @@
|
||||||
schema_version: 1
|
|
||||||
name: single-gpu-24gb
|
|
||||||
vram_total_mb: 24576
|
|
||||||
eviction_timeout_s: 10.0
|
|
||||||
services:
|
|
||||||
vllm:
|
|
||||||
max_mb: 9000
|
|
||||||
priority: 1
|
|
||||||
idle_stop_after_s: 600
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
|
||||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
|
||||||
port: 8000
|
|
||||||
host_port: 8000
|
|
||||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
|
||||||
ollama:
|
|
||||||
max_mb: 18432
|
|
||||||
priority: 1
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
adopt: true
|
|
||||||
exec_path: "/usr/local/bin/ollama"
|
|
||||||
args_template: "serve"
|
|
||||||
port: 11434
|
|
||||||
host_port: 11434
|
|
||||||
health_path: /api/tags
|
|
||||||
cf-vision:
|
|
||||||
max_mb: 4096
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 6
|
|
||||||
cf-docuvision:
|
|
||||||
max_mb: 8192
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 4
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
|
||||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
|
||||||
port: 8003
|
|
||||||
host_port: 8003
|
|
||||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
|
||||||
cf-stt:
|
|
||||||
max_mb: 1200
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 4
|
|
||||||
backend: parakeet-tdt
|
|
||||||
cf-tts:
|
|
||||||
max_mb: 1024
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 4
|
|
||||||
cf-embed:
|
|
||||||
max_mb: 512
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 8
|
|
||||||
always_on: true
|
|
||||||
cf-classify:
|
|
||||||
max_mb: 512
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 8
|
|
||||||
always_on: true
|
|
||||||
comfyui:
|
|
||||||
max_mb: 20480
|
|
||||||
priority: 4
|
|
||||||
model_size_hints:
|
|
||||||
llm_max_params: 70b
|
|
||||||
image_gen_max: flux-dev-fp16
|
|
||||||
|
|
@ -1,30 +0,0 @@
|
||||||
# Minimal GPU profile (2 GB VRAM): Ollama plus vision and STT only;
# no TTS, docuvision, or image generation at this tier.
schema_version: 1
name: single-gpu-2gb
vram_total_mb: 2048
eviction_timeout_s: 15.0
services:
  ollama:
    max_mb: 1536
    priority: 1
    managed:
      type: process
      adopt: true  # claim an already-running daemon instead of spawning one
      exec_path: "/usr/local/bin/ollama"
      args_template: "serve"
      port: 11434
      host_port: 11434
      health_path: /api/tags
  cf-vision:
    max_mb: 512
    priority: 2
    shared: true
    max_concurrent: 1
  cf-stt:
    max_mb: 200
    priority: 2
    shared: true
    max_concurrent: 1
    backend: moonshine  # STT backend selected for this tier
model_size_hints:
  llm_max_params: 3b
  image_gen_max: none
|
|
||||||
|
|
@ -1,38 +0,0 @@
|
||||||
schema_version: 1
|
|
||||||
name: single-gpu-4gb
|
|
||||||
vram_total_mb: 4096
|
|
||||||
eviction_timeout_s: 15.0
|
|
||||||
services:
|
|
||||||
ollama:
|
|
||||||
max_mb: 3072
|
|
||||||
priority: 1
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
adopt: true
|
|
||||||
exec_path: "/usr/local/bin/ollama"
|
|
||||||
args_template: "serve"
|
|
||||||
port: 11434
|
|
||||||
host_port: 11434
|
|
||||||
health_path: /api/tags
|
|
||||||
cf-vision:
|
|
||||||
max_mb: 1024
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 1
|
|
||||||
cf-stt:
|
|
||||||
max_mb: 600
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 1
|
|
||||||
backend: faster-whisper
|
|
||||||
cf-tts:
|
|
||||||
max_mb: 512
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 1
|
|
||||||
comfyui:
|
|
||||||
max_mb: 3584
|
|
||||||
priority: 4
|
|
||||||
model_size_hints:
|
|
||||||
llm_max_params: 3b
|
|
||||||
image_gen_max: sd15-fp8
|
|
||||||
|
|
@ -1,61 +0,0 @@
|
||||||
schema_version: 1
|
|
||||||
name: single-gpu-6gb
|
|
||||||
vram_total_mb: 6144
|
|
||||||
eviction_timeout_s: 10.0
|
|
||||||
services:
|
|
||||||
vllm:
|
|
||||||
max_mb: 5500
|
|
||||||
priority: 1
|
|
||||||
idle_stop_after_s: 600
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
|
||||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
|
||||||
port: 8000
|
|
||||||
host_port: 8000
|
|
||||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
|
||||||
ollama:
|
|
||||||
max_mb: 3584
|
|
||||||
priority: 1
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
adopt: true
|
|
||||||
exec_path: "/usr/local/bin/ollama"
|
|
||||||
args_template: "serve"
|
|
||||||
port: 11434
|
|
||||||
host_port: 11434
|
|
||||||
health_path: /api/tags
|
|
||||||
cf-vision:
|
|
||||||
max_mb: 1536
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 2
|
|
||||||
cf-docuvision:
|
|
||||||
max_mb: 3072
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 1
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
|
||||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
|
||||||
port: 8003
|
|
||||||
host_port: 8003
|
|
||||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
|
||||||
cf-stt:
|
|
||||||
max_mb: 600
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 2
|
|
||||||
backend: faster-whisper
|
|
||||||
cf-tts:
|
|
||||||
max_mb: 768
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 1
|
|
||||||
comfyui:
|
|
||||||
max_mb: 5120
|
|
||||||
priority: 4
|
|
||||||
model_size_hints:
|
|
||||||
llm_max_params: 7b
|
|
||||||
image_gen_max: sd15
|
|
||||||
|
|
@ -1,68 +0,0 @@
|
||||||
schema_version: 1
|
|
||||||
name: single-gpu-8gb
|
|
||||||
vram_total_mb: 8192
|
|
||||||
eviction_timeout_s: 10.0
|
|
||||||
services:
|
|
||||||
vllm:
|
|
||||||
max_mb: 6500
|
|
||||||
priority: 1
|
|
||||||
idle_stop_after_s: 600
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
|
||||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
|
||||||
port: 8000
|
|
||||||
host_port: 8000
|
|
||||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
|
||||||
ollama:
|
|
||||||
max_mb: 4096
|
|
||||||
priority: 1
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
adopt: true
|
|
||||||
exec_path: "/usr/local/bin/ollama"
|
|
||||||
args_template: "serve"
|
|
||||||
port: 11434
|
|
||||||
host_port: 11434
|
|
||||||
health_path: /api/tags
|
|
||||||
cf-vision:
|
|
||||||
max_mb: 2048
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 3
|
|
||||||
cf-docuvision:
|
|
||||||
max_mb: 4096
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 2
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
|
||||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
|
||||||
port: 8003
|
|
||||||
host_port: 8003
|
|
||||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
|
||||||
cf-stt:
|
|
||||||
max_mb: 1200
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 2
|
|
||||||
backend: parakeet-tdt
|
|
||||||
cf-tts:
|
|
||||||
max_mb: 1024
|
|
||||||
priority: 2
|
|
||||||
shared: true
|
|
||||||
max_concurrent: 2
|
|
||||||
comfyui:
|
|
||||||
max_mb: 6144
|
|
||||||
priority: 4
|
|
||||||
managed:
|
|
||||||
type: process
|
|
||||||
exec_path: "/opt/miniconda3/envs/comfyui/bin/python"
|
|
||||||
args_template: "/opt/ComfyUI/main.py --listen 0.0.0.0 --port {port} --cuda-device {gpu_id}"
|
|
||||||
cwd: "/opt/ComfyUI"
|
|
||||||
port: 8188
|
|
||||||
host_port: 8188
|
|
||||||
model_size_hints:
|
|
||||||
llm_max_params: 8b
|
|
||||||
image_gen_max: sdxl-fp8
|
|
||||||
|
|
@ -1,121 +0,0 @@
|
||||||
# circuitforge_core/resources/profiles/schema.py
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
import yaml
|
|
||||||
from pydantic import BaseModel, Field, model_validator
|
|
||||||
|
|
||||||
SUPPORTED_SCHEMA_VERSION = 1
|
|
||||||
|
|
||||||
|
|
||||||
class DockerSpec(BaseModel):
    """Spec for a Docker-managed service."""

    image: str
    port: int                  # port inside the container
    host_port: int             # port published on the host
    command_template: str = ""
    volumes: list[str] = Field(default_factory=list)
    env: dict[str, str] = Field(default_factory=dict)
    runtime: str = "nvidia"    # container runtime; GPU-enabled by default
    ipc: str = "host"

    model_config = {"frozen": True}
|
|
||||||
|
|
||||||
|
|
||||||
class ProcessSpec(BaseModel):
    """Spec for a process-managed service (non-Docker, e.g. conda env)."""

    exec_path: str            # absolute path to the executable
    # May contain {model}/{port}/{gpu_id} placeholders (see profile YAMLs).
    args_template: str = ""
    cwd: str = ""
    env: dict[str, str] = Field(default_factory=dict)
    port: int = 0
    host_port: int = 0
    # adopt=True: if the service is already listening on host_port, claim it rather
    # than spawning a new process (useful for system daemons like Ollama).
    adopt: bool = False
    # Override the health probe path; defaults to /health (Ollama uses /api/tags).
    health_path: str = "/health"

    model_config = {"frozen": True}
|
|
||||||
|
|
||||||
|
|
||||||
class ServiceProfile(BaseModel):
    """Per-service resource policy: VRAM budget, priority, concurrency, and
    an optional managed-launch spec (Docker or plain process)."""

    max_mb: int                  # VRAM budget in MB (0 on CPU-only profiles)
    priority: int
    shared: bool = False
    max_concurrent: int = 1
    always_on: bool = False
    idle_stop_after_s: int = 0   # 0 = never auto-stop when idle
    backend: str | None = None
    consumers: list[str] = Field(default_factory=list)
    managed: DockerSpec | ProcessSpec | None = None

    model_config = {"frozen": True}

    @model_validator(mode="before")
    @classmethod
    def _parse_managed(cls, values: Any) -> Any:
        """Resolve a raw `managed` mapping into DockerSpec/ProcessSpec by its
        `type` tag.

        Raises ValueError for an unknown type tag. Returns a shallow copy of
        the input mapping: the previous version wrote the constructed spec
        back into the caller-owned dict (typically a sub-tree of a parsed
        YAML document), mutating it as a side effect.
        """
        if not isinstance(values, dict):
            return values
        raw = values.get("managed")
        if raw is None:
            return values
        if not isinstance(raw, dict):
            # Already a spec instance (or something pydantic will reject).
            return values
        spec_type = raw.get("type")
        managed_fields = {k: v for k, v in raw.items() if k != "type"}
        if spec_type == "docker":
            spec: DockerSpec | ProcessSpec = DockerSpec(**managed_fields)
        elif spec_type == "process":
            spec = ProcessSpec(**managed_fields)
        else:
            raise ValueError(f"Unknown managed service type: {spec_type!r}")
        # Shallow copy instead of in-place assignment — leaves `values` intact.
        return {**values, "managed": spec}
|
|
||||||
|
|
||||||
|
|
||||||
class GpuNodeEntry(BaseModel):
    """One GPU within a node's profile entry."""

    id: int
    vram_mb: int
    role: str
    card: str = "unknown"      # card model name, if known
    always_on: bool = False
    services: list[str] = Field(default_factory=list)  # service names associated with this GPU

    model_config = {"frozen": True}
|
|
||||||
|
|
||||||
|
|
||||||
class NodeProfile(BaseModel):
    """A physical node: its GPUs plus optional agent URL and NAS mount."""

    gpus: list[GpuNodeEntry]
    agent_url: str | None = None
    nas_mount: str | None = None

    model_config = {"frozen": True}
|
|
||||||
|
|
||||||
|
|
||||||
class GpuProfile(BaseModel):
    """Top-level hardware profile loaded from a YAML file (see load_profile)."""

    schema_version: int                # must equal SUPPORTED_SCHEMA_VERSION
    name: str
    vram_total_mb: int | None = None   # None for CPU-only profiles
    eviction_timeout_s: float = 10.0
    services: dict[str, ServiceProfile] = Field(default_factory=dict)
    model_size_hints: dict[str, str] = Field(default_factory=dict)
    nodes: dict[str, NodeProfile] = Field(default_factory=dict)

    model_config = {"frozen": True}
|
|
||||||
|
|
||||||
|
|
||||||
def load_profile(path: Path) -> GpuProfile:
    """Read, version-check, and validate one profile YAML file.

    Raises ValueError when the file is not a mapping or its schema_version
    is unsupported; pydantic validation errors propagate unchanged.
    """
    document: dict[str, Any] = yaml.safe_load(path.read_text())

    if not isinstance(document, dict):
        raise ValueError(f"Profile file {path} must be a YAML mapping, got {type(document).__name__}")

    version = document.get("schema_version")
    if version != SUPPORTED_SCHEMA_VERSION:
        message = (
            f"Unsupported schema_version {version!r} in {path}. "
            f"Expected {SUPPORTED_SCHEMA_VERSION}."
        )
        raise ValueError(message)

    return GpuProfile.model_validate(document)
|
|
||||||
|
|
@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
[project]
|
[project]
|
||||||
name = "circuitforge-core"
|
name = "circuitforge-core"
|
||||||
version = "0.7.0"
|
version = "0.8.0"
|
||||||
description = "Shared scaffold for CircuitForge products"
|
description = "Shared scaffold for CircuitForge products (MIT)"
|
||||||
requires-python = ">=3.11"
|
requires-python = ">=3.11"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"pyyaml>=6.0",
|
"pyyaml>=6.0",
|
||||||
|
|
@ -14,32 +14,17 @@ dependencies = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
orch = [
|
|
||||||
"fastapi>=0.110",
|
|
||||||
"uvicorn[standard]>=0.29",
|
|
||||||
"httpx>=0.27",
|
|
||||||
"pydantic>=2.0",
|
|
||||||
"typer[all]>=0.12",
|
|
||||||
"psutil>=5.9",
|
|
||||||
]
|
|
||||||
tasks = [
|
|
||||||
"httpx>=0.27",
|
|
||||||
]
|
|
||||||
manage = [
|
manage = [
|
||||||
"platformdirs>=4.0",
|
"platformdirs>=4.0",
|
||||||
"typer[all]>=0.12",
|
"typer[all]>=0.12",
|
||||||
]
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"circuitforge-core[orch]",
|
|
||||||
"circuitforge-core[tasks]",
|
|
||||||
"circuitforge-core[manage]",
|
"circuitforge-core[manage]",
|
||||||
"pytest>=8.0",
|
"pytest>=8.0",
|
||||||
"pytest-asyncio>=0.23",
|
"pytest-asyncio>=0.23",
|
||||||
"httpx>=0.27",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
cf-orch = "circuitforge_core.resources.cli:app"
|
|
||||||
cf-manage = "circuitforge_core.manage.cli:app"
|
cf-manage = "circuitforge_core.manage.cli:app"
|
||||||
|
|
||||||
[tool.setuptools.packages.find]
|
[tool.setuptools.packages.find]
|
||||||
|
|
|
||||||
|
|
@ -1,68 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
from fastapi.testclient import TestClient
|
|
||||||
|
|
||||||
from circuitforge_core.resources.agent.app import create_agent_app
|
|
||||||
from circuitforge_core.resources.models import GpuInfo
|
|
||||||
from circuitforge_core.resources.agent.eviction_executor import EvictionResult
|
|
||||||
|
|
||||||
MOCK_GPUS = [
|
|
||||||
GpuInfo(
|
|
||||||
gpu_id=0,
|
|
||||||
name="RTX 4000",
|
|
||||||
vram_total_mb=8192,
|
|
||||||
vram_used_mb=1024,
|
|
||||||
vram_free_mb=7168,
|
|
||||||
),
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def agent_client():
|
|
||||||
mock_monitor = MagicMock()
|
|
||||||
mock_monitor.poll.return_value = MOCK_GPUS
|
|
||||||
mock_executor = MagicMock()
|
|
||||||
app = create_agent_app(
|
|
||||||
node_id="heimdall",
|
|
||||||
monitor=mock_monitor,
|
|
||||||
executor=mock_executor,
|
|
||||||
)
|
|
||||||
return TestClient(app), mock_monitor, mock_executor
|
|
||||||
|
|
||||||
|
|
||||||
def test_health_returns_ok(agent_client):
|
|
||||||
client, _, _ = agent_client
|
|
||||||
resp = client.get("/health")
|
|
||||||
assert resp.status_code == 200
|
|
||||||
assert resp.json()["status"] == "ok"
|
|
||||||
assert resp.json()["node_id"] == "heimdall"
|
|
||||||
|
|
||||||
|
|
||||||
def test_gpu_info_returns_gpu_list(agent_client):
|
|
||||||
client, _, _ = agent_client
|
|
||||||
resp = client.get("/gpu-info")
|
|
||||||
assert resp.status_code == 200
|
|
||||||
data = resp.json()
|
|
||||||
assert len(data["gpus"]) == 1
|
|
||||||
assert data["gpus"][0]["gpu_id"] == 0
|
|
||||||
assert data["gpus"][0]["name"] == "RTX 4000"
|
|
||||||
assert data["gpus"][0]["vram_free_mb"] == 7168
|
|
||||||
|
|
||||||
|
|
||||||
def test_evict_calls_executor(agent_client):
|
|
||||||
client, _, mock_executor = agent_client
|
|
||||||
mock_executor.evict_pid.return_value = EvictionResult(
|
|
||||||
success=True, method="sigterm", message="done"
|
|
||||||
)
|
|
||||||
resp = client.post("/evict", json={"pid": 1234, "grace_period_s": 5.0})
|
|
||||||
assert resp.status_code == 200
|
|
||||||
assert resp.json()["success"] is True
|
|
||||||
mock_executor.evict_pid.assert_called_once_with(pid=1234, grace_period_s=5.0)
|
|
||||||
|
|
||||||
|
|
||||||
def test_evict_requires_pid(agent_client):
|
|
||||||
client, _, _ = agent_client
|
|
||||||
resp = client.post("/evict", json={"grace_period_s": 5.0})
|
|
||||||
assert resp.status_code == 422
|
|
||||||
|
|
@ -1,93 +0,0 @@
|
||||||
import asyncio
|
|
||||||
import time
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import AsyncMock, MagicMock, patch
|
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry, ServiceInstance
|
|
||||||
|
|
||||||
|
|
||||||
def test_build_idle_stop_config_empty_without_registry():
|
|
||||||
lm = LeaseManager()
|
|
||||||
supervisor = AgentSupervisor(lease_manager=lm)
|
|
||||||
assert supervisor._build_idle_stop_config() == {}
|
|
||||||
|
|
||||||
|
|
||||||
def test_build_idle_stop_config_from_profiles():
|
|
||||||
lm = LeaseManager()
|
|
||||||
mock_svc = MagicMock()
|
|
||||||
mock_svc.idle_stop_after_s = 600
|
|
||||||
mock_profile = MagicMock()
|
|
||||||
mock_profile.services = {"vllm": mock_svc}
|
|
||||||
mock_profile_registry = MagicMock()
|
|
||||||
mock_profile_registry.list_public.return_value = [mock_profile]
|
|
||||||
|
|
||||||
supervisor = AgentSupervisor(lease_manager=lm, profile_registry=mock_profile_registry)
|
|
||||||
config = supervisor._build_idle_stop_config()
|
|
||||||
assert config == {"vllm": 600}
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_run_idle_sweep_posts_stop():
|
|
||||||
lm = LeaseManager()
|
|
||||||
service_registry = ServiceRegistry()
|
|
||||||
|
|
||||||
# Upsert instance as running, then allocate + release to transition it to idle
|
|
||||||
service_registry.upsert_instance(
|
|
||||||
service="vllm",
|
|
||||||
node_id="heimdall",
|
|
||||||
gpu_id=0,
|
|
||||||
state="running",
|
|
||||||
model="test-model",
|
|
||||||
url="http://heimdall:8000",
|
|
||||||
)
|
|
||||||
alloc = service_registry.allocate(
|
|
||||||
service="vllm",
|
|
||||||
node_id="heimdall",
|
|
||||||
gpu_id=0,
|
|
||||||
model="test-model",
|
|
||||||
url="http://heimdall:8000",
|
|
||||||
caller="test",
|
|
||||||
ttl_s=300.0,
|
|
||||||
)
|
|
||||||
service_registry.release(alloc.allocation_id)
|
|
||||||
|
|
||||||
# Backdate idle_since so it exceeds the timeout
|
|
||||||
import dataclasses
|
|
||||||
key = "vllm:heimdall:0"
|
|
||||||
inst = service_registry._instances[key]
|
|
||||||
service_registry._instances[key] = dataclasses.replace(inst, idle_since=time.time() - 700)
|
|
||||||
|
|
||||||
mock_profile_registry = MagicMock()
|
|
||||||
mock_svc = MagicMock()
|
|
||||||
mock_svc.idle_stop_after_s = 600
|
|
||||||
mock_profile = MagicMock()
|
|
||||||
mock_profile.services = {"vllm": mock_svc}
|
|
||||||
mock_profile_registry.list_public.return_value = [mock_profile]
|
|
||||||
|
|
||||||
supervisor = AgentSupervisor(
|
|
||||||
lease_manager=lm,
|
|
||||||
service_registry=service_registry,
|
|
||||||
profile_registry=mock_profile_registry,
|
|
||||||
)
|
|
||||||
supervisor.register("heimdall", "http://heimdall:7701")
|
|
||||||
|
|
||||||
posted_urls = []
|
|
||||||
|
|
||||||
async def fake_http_post(url: str) -> bool:
|
|
||||||
posted_urls.append(url)
|
|
||||||
return True
|
|
||||||
|
|
||||||
supervisor._http_post = fake_http_post
|
|
||||||
await supervisor._run_idle_sweep()
|
|
||||||
|
|
||||||
assert len(posted_urls) == 1
|
|
||||||
assert posted_urls[0] == "http://heimdall:7701/services/vllm/stop"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_run_idle_sweep_skips_without_registry():
|
|
||||||
lm = LeaseManager()
|
|
||||||
supervisor = AgentSupervisor(lease_manager=lm)
|
|
||||||
# Should return immediately without error
|
|
||||||
await supervisor._run_idle_sweep()
|
|
||||||
|
|
@ -1,151 +0,0 @@
|
||||||
# tests/test_resources/test_agent_watchdog.py
|
|
||||||
"""
|
|
||||||
Tests for AgentSupervisor watchdog behaviour:
|
|
||||||
- restore_from_store() reloads known nodes from NodeStore on startup
|
|
||||||
- register() persists to NodeStore
|
|
||||||
- restored nodes start offline and come online after a successful poll
|
|
||||||
- NodeStore=None path is a no-op (backwards compatibility)
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest.mock import AsyncMock, MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
from circuitforge_core.resources.coordinator.node_store import NodeStore
|
|
||||||
|
|
||||||
|
|
||||||
# ── fixtures ──────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def store(tmp_path: Path) -> NodeStore:
|
|
||||||
return NodeStore(db_path=tmp_path / "nodes.db")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def supervisor(store: NodeStore) -> AgentSupervisor:
|
|
||||||
return AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def supervisor_no_store() -> AgentSupervisor:
|
|
||||||
return AgentSupervisor(lease_manager=LeaseManager(), node_store=None)
|
|
||||||
|
|
||||||
|
|
||||||
# ── register() persists ───────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def test_register_persists_to_store(supervisor: AgentSupervisor, store: NodeStore) -> None:
|
|
||||||
supervisor.register("heimdall", "http://127.0.0.1:7701")
|
|
||||||
rows = store.all()
|
|
||||||
assert len(rows) == 1
|
|
||||||
assert rows[0] == ("heimdall", "http://127.0.0.1:7701")
|
|
||||||
|
|
||||||
|
|
||||||
def test_register_updates_url_in_store(supervisor: AgentSupervisor, store: NodeStore) -> None:
|
|
||||||
supervisor.register("navi", "http://10.1.10.10:7701")
|
|
||||||
supervisor.register("navi", "http://10.1.10.10:9999")
|
|
||||||
rows = store.all()
|
|
||||||
assert len(rows) == 1
|
|
||||||
assert rows[0][1] == "http://10.1.10.10:9999"
|
|
||||||
|
|
||||||
|
|
||||||
def test_register_without_store_does_not_crash(supervisor_no_store: AgentSupervisor) -> None:
|
|
||||||
supervisor_no_store.register("heimdall", "http://127.0.0.1:7701")
|
|
||||||
assert supervisor_no_store.get_node_info("heimdall") is not None
|
|
||||||
|
|
||||||
|
|
||||||
# ── restore_from_store() ──────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def test_restore_loads_known_nodes(tmp_path: Path) -> None:
|
|
||||||
"""Nodes written by a previous supervisor session are restored into a fresh one."""
|
|
||||||
db = tmp_path / "nodes.db"
|
|
||||||
|
|
||||||
# Session 1: register two nodes
|
|
||||||
s1 = NodeStore(db_path=db)
|
|
||||||
sup1 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s1)
|
|
||||||
sup1.register("navi", "http://10.1.10.10:7701")
|
|
||||||
sup1.register("strahl", "http://10.1.10.20:7701")
|
|
||||||
|
|
||||||
# Session 2: fresh supervisor, same DB
|
|
||||||
s2 = NodeStore(db_path=db)
|
|
||||||
sup2 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s2)
|
|
||||||
restored = sup2.restore_from_store()
|
|
||||||
|
|
||||||
assert restored == 2
|
|
||||||
assert sup2.get_node_info("navi") is not None
|
|
||||||
assert sup2.get_node_info("strahl") is not None
|
|
||||||
|
|
||||||
|
|
||||||
def test_restore_marks_nodes_offline(tmp_path: Path) -> None:
|
|
||||||
"""Restored nodes start offline — they haven't been polled yet."""
|
|
||||||
db = tmp_path / "nodes.db"
|
|
||||||
|
|
||||||
s1 = NodeStore(db_path=db)
|
|
||||||
AgentSupervisor(lease_manager=LeaseManager(), node_store=s1).register(
|
|
||||||
"navi", "http://10.1.10.10:7701"
|
|
||||||
)
|
|
||||||
|
|
||||||
s2 = NodeStore(db_path=db)
|
|
||||||
sup2 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s2)
|
|
||||||
sup2.restore_from_store()
|
|
||||||
|
|
||||||
assert sup2.online_agents() == {}
|
|
||||||
|
|
||||||
|
|
||||||
def test_restore_returns_zero_without_store() -> None:
|
|
||||||
sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=None)
|
|
||||||
assert sup.restore_from_store() == 0
|
|
||||||
|
|
||||||
|
|
||||||
def test_restore_skips_already_registered(tmp_path: Path) -> None:
|
|
||||||
"""Nodes manually registered before restore_from_store() are not duplicated."""
|
|
||||||
db = tmp_path / "nodes.db"
|
|
||||||
store = NodeStore(db_path=db)
|
|
||||||
store.upsert("heimdall", "http://127.0.0.1:7701")
|
|
||||||
|
|
||||||
sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
|
|
||||||
sup.register("heimdall", "http://127.0.0.1:7701") # already in memory
|
|
||||||
restored = sup.restore_from_store()
|
|
||||||
|
|
||||||
assert restored == 0 # already present, not double-counted
|
|
||||||
|
|
||||||
|
|
||||||
# ── restored node comes online after poll ─────────────────────────────────────
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
|
||||||
async def test_restored_node_comes_online_after_poll(tmp_path: Path) -> None:
|
|
||||||
"""After restore, a successful poll_agent() brings the node online."""
|
|
||||||
db = tmp_path / "nodes.db"
|
|
||||||
store = NodeStore(db_path=db)
|
|
||||||
store.upsert("navi", "http://10.1.10.10:7701")
|
|
||||||
|
|
||||||
sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
|
|
||||||
sup.restore_from_store()
|
|
||||||
|
|
||||||
# Stub poll_agent to succeed
|
|
||||||
gpu_payload = {"gpus": [{"gpu_id": 0, "name": "RTX 4000",
|
|
||||||
"vram_total_mb": 8192, "vram_used_mb": 512, "vram_free_mb": 7680}]}
|
|
||||||
resident_payload = {"residents": []}
|
|
||||||
|
|
||||||
mock_resp_gpu = MagicMock()
|
|
||||||
mock_resp_gpu.raise_for_status = MagicMock()
|
|
||||||
mock_resp_gpu.json.return_value = gpu_payload
|
|
||||||
|
|
||||||
mock_resp_res = MagicMock()
|
|
||||||
mock_resp_res.is_success = True
|
|
||||||
mock_resp_res.json.return_value = resident_payload
|
|
||||||
|
|
||||||
mock_client = AsyncMock()
|
|
||||||
mock_client.get = AsyncMock(side_effect=[mock_resp_gpu, mock_resp_res])
|
|
||||||
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
|
|
||||||
mock_client.__aexit__ = AsyncMock(return_value=False)
|
|
||||||
|
|
||||||
with patch("circuitforge_core.resources.coordinator.agent_supervisor.httpx.AsyncClient",
|
|
||||||
return_value=mock_client):
|
|
||||||
result = await sup.poll_agent("navi")
|
|
||||||
|
|
||||||
assert result is True
|
|
||||||
assert "navi" in sup.online_agents()
|
|
||||||
|
|
@ -1,33 +0,0 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest.mock import patch
|
|
||||||
|
|
||||||
from typer.testing import CliRunner
|
|
||||||
|
|
||||||
from circuitforge_core.resources.cli import app
|
|
||||||
|
|
||||||
runner = CliRunner()
|
|
||||||
|
|
||||||
|
|
||||||
def test_cli_help():
|
|
||||||
result = runner.invoke(app, ["--help"])
|
|
||||||
assert result.exit_code == 0
|
|
||||||
assert "cf-orch" in result.output.lower() or "Usage" in result.output
|
|
||||||
|
|
||||||
|
|
||||||
def test_status_command_shows_no_coordinator_message():
|
|
||||||
with patch("httpx.get", side_effect=ConnectionRefusedError("refused")):
|
|
||||||
result = runner.invoke(app, ["status"])
|
|
||||||
assert result.exit_code != 0 or "unreachable" in result.output.lower() \
|
|
||||||
or "coordinator" in result.output.lower()
|
|
||||||
|
|
||||||
|
|
||||||
def test_install_service_creates_systemd_unit(tmp_path: Path):
|
|
||||||
unit_path = tmp_path / "cf-orch.service"
|
|
||||||
with patch(
|
|
||||||
"circuitforge_core.resources.cli._SYSTEMD_UNIT_PATH", unit_path
|
|
||||||
):
|
|
||||||
result = runner.invoke(app, ["install-service", "--dry-run"])
|
|
||||||
assert result.exit_code == 0
|
|
||||||
assert "cf-orch.service" in result.output or "systemd" in result.output.lower()
|
|
||||||
|
|
@ -1,94 +0,0 @@
|
||||||
import json
|
|
||||||
from unittest.mock import AsyncMock, MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
import httpretty
|
|
||||||
from circuitforge_core.resources.client import CFOrchClient, Allocation
|
|
||||||
|
|
||||||
_ALLOC_BODY = (
|
|
||||||
'{"allocation_id":"abc123","service":"vllm","node_id":"heimdall",'
|
|
||||||
'"gpu_id":0,"model":"Ouro-1.4B","url":"http://heimdall:8000","started":false,"warm":true}'
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@httpretty.activate
|
|
||||||
def test_sync_allocate_returns_allocation():
|
|
||||||
httpretty.register_uri(
|
|
||||||
httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
|
|
||||||
body=_ALLOC_BODY, content_type="application/json",
|
|
||||||
)
|
|
||||||
httpretty.register_uri(
|
|
||||||
httpretty.DELETE, "http://orch:7700/api/services/vllm/allocations/abc123",
|
|
||||||
body='{"released":true}', content_type="application/json",
|
|
||||||
)
|
|
||||||
client = CFOrchClient("http://orch:7700")
|
|
||||||
with client.allocate("vllm", model_candidates=["Ouro-1.4B"], caller="test") as alloc:
|
|
||||||
assert isinstance(alloc, Allocation)
|
|
||||||
assert alloc.url == "http://heimdall:8000"
|
|
||||||
assert alloc.model == "Ouro-1.4B"
|
|
||||||
assert alloc.allocation_id == "abc123"
|
|
||||||
assert httpretty.last_request().method == "DELETE"
|
|
||||||
|
|
||||||
|
|
||||||
@httpretty.activate
|
|
||||||
def test_sync_allocate_ignores_404_on_release():
|
|
||||||
httpretty.register_uri(
|
|
||||||
httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
|
|
||||||
body='{"allocation_id":"xyz","service":"vllm","node_id":"a","gpu_id":0,'
|
|
||||||
'"model":"m","url":"http://a:8000","started":false,"warm":false}',
|
|
||||||
content_type="application/json",
|
|
||||||
)
|
|
||||||
httpretty.register_uri(
|
|
||||||
httpretty.DELETE, "http://orch:7700/api/services/vllm/allocations/xyz",
|
|
||||||
status=404, body='{"detail":"not found"}', content_type="application/json",
|
|
||||||
)
|
|
||||||
client = CFOrchClient("http://orch:7700")
|
|
||||||
with client.allocate("vllm", model_candidates=["m"]) as alloc:
|
|
||||||
assert alloc.url == "http://a:8000"
|
|
||||||
# No exception raised — 404 on release is silently ignored
|
|
||||||
|
|
||||||
|
|
||||||
@httpretty.activate
|
|
||||||
def test_sync_allocate_raises_on_503():
|
|
||||||
httpretty.register_uri(
|
|
||||||
httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
|
|
||||||
status=503, body='{"detail":"no capacity"}', content_type="application/json",
|
|
||||||
)
|
|
||||||
client = CFOrchClient("http://orch:7700")
|
|
||||||
with pytest.raises(RuntimeError, match="cf-orch allocation failed"):
|
|
||||||
with client.allocate("vllm", model_candidates=["m"]):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
async def test_async_allocate_works():
|
|
||||||
# httpretty only patches stdlib sockets; httpx async uses anyio sockets so
|
|
||||||
# we mock httpx.AsyncClient directly instead.
|
|
||||||
alloc_data = {
|
|
||||||
"allocation_id": "a1", "service": "vllm", "node_id": "n",
|
|
||||||
"gpu_id": 0, "model": "m", "url": "http://n:8000",
|
|
||||||
"started": False, "warm": False,
|
|
||||||
}
|
|
||||||
release_data = {"released": True}
|
|
||||||
|
|
||||||
def _make_response(data, status_code=200):
|
|
||||||
resp = MagicMock()
|
|
||||||
resp.is_success = status_code < 400
|
|
||||||
resp.status_code = status_code
|
|
||||||
resp.json.return_value = data
|
|
||||||
return resp
|
|
||||||
|
|
||||||
mock_post = AsyncMock(return_value=_make_response(alloc_data))
|
|
||||||
mock_delete = AsyncMock(return_value=_make_response(release_data))
|
|
||||||
|
|
||||||
mock_async_client = MagicMock()
|
|
||||||
mock_async_client.post = mock_post
|
|
||||||
mock_async_client.delete = mock_delete
|
|
||||||
mock_async_client.__aenter__ = AsyncMock(return_value=mock_async_client)
|
|
||||||
mock_async_client.__aexit__ = AsyncMock(return_value=False)
|
|
||||||
|
|
||||||
with patch("httpx.AsyncClient", return_value=mock_async_client):
|
|
||||||
client = CFOrchClient("http://orch:7700")
|
|
||||||
async with client.allocate_async("vllm", model_candidates=["m"]) as alloc:
|
|
||||||
assert alloc.url == "http://n:8000"
|
|
||||||
assert alloc.allocation_id == "a1"
|
|
||||||
mock_delete.assert_called_once()
|
|
||||||
|
|
@ -1,132 +0,0 @@
|
||||||
import pytest
|
|
||||||
from unittest.mock import AsyncMock, MagicMock, patch
|
|
||||||
from fastapi.testclient import TestClient
|
|
||||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
|
||||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
|
||||||
|
|
||||||
|
|
||||||
def _make_supervisor_mock(online: bool = True):
|
|
||||||
sup = MagicMock()
|
|
||||||
record = AgentRecord(node_id="heimdall", agent_url="http://heimdall:7701")
|
|
||||||
record.gpus = [GpuInfo(0, "RTX 4000", 8192, 0, 8192)]
|
|
||||||
record.online = online
|
|
||||||
sup.online_agents.return_value = {"heimdall": record} if online else {}
|
|
||||||
sup.get_node_info.return_value = NodeInfo(
|
|
||||||
node_id="heimdall",
|
|
||||||
agent_url="http://heimdall:7701",
|
|
||||||
gpus=record.gpus,
|
|
||||||
last_heartbeat=0.0,
|
|
||||||
)
|
|
||||||
return sup
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def alloc_client():
|
|
||||||
lm = LeaseManager()
|
|
||||||
pr = ProfileRegistry()
|
|
||||||
sup = _make_supervisor_mock()
|
|
||||||
sr = ServiceRegistry()
|
|
||||||
app = create_coordinator_app(lease_manager=lm, profile_registry=pr, agent_supervisor=sup, service_registry=sr)
|
|
||||||
return TestClient(app), sup, sr
|
|
||||||
|
|
||||||
|
|
||||||
def test_allocate_returns_allocation_id_and_url(alloc_client):
|
|
||||||
client, sup, sr = alloc_client
|
|
||||||
with patch("httpx.AsyncClient") as mock_http:
|
|
||||||
mock_resp = MagicMock()
|
|
||||||
mock_resp.is_success = True
|
|
||||||
mock_resp.json.return_value = {"running": True, "url": "http://heimdall:8000"}
|
|
||||||
mock_http.return_value.__aenter__.return_value.post = AsyncMock(return_value=mock_resp)
|
|
||||||
|
|
||||||
resp = client.post("/api/services/vllm/allocate", json={
|
|
||||||
"model_candidates": ["Ouro-1.4B"],
|
|
||||||
"ttl_s": 300.0,
|
|
||||||
"caller": "test",
|
|
||||||
})
|
|
||||||
|
|
||||||
assert resp.status_code == 200
|
|
||||||
data = resp.json()
|
|
||||||
assert "allocation_id" in data
|
|
||||||
assert data["service"] == "vllm"
|
|
||||||
assert data["node_id"] == "heimdall"
|
|
||||||
assert data["url"] == "http://heimdall:8000"
|
|
||||||
|
|
||||||
|
|
||||||
def test_allocate_returns_503_when_no_online_nodes(alloc_client):
|
|
||||||
client, sup, sr = alloc_client
|
|
||||||
sup.online_agents.return_value = {}
|
|
||||||
resp = client.post("/api/services/vllm/allocate", json={"model_candidates": ["Ouro-1.4B"]})
|
|
||||||
assert resp.status_code == 503
|
|
||||||
|
|
||||||
|
|
||||||
def test_allocate_returns_422_for_empty_candidates(alloc_client):
|
|
||||||
client, _, sr = alloc_client
|
|
||||||
resp = client.post("/api/services/vllm/allocate", json={"model_candidates": []})
|
|
||||||
assert resp.status_code == 422
|
|
||||||
|
|
||||||
|
|
||||||
def test_allocate_returns_422_for_unknown_service(alloc_client):
|
|
||||||
client, _, sr = alloc_client
|
|
||||||
resp = client.post("/api/services/cf-made-up/allocate", json={"model_candidates": ["x"]})
|
|
||||||
assert resp.status_code == 422
|
|
||||||
|
|
||||||
|
|
||||||
def test_allocate_records_in_registry(alloc_client):
|
|
||||||
client, sup, sr = alloc_client
|
|
||||||
with patch("httpx.AsyncClient") as mock_http:
|
|
||||||
mock_resp = MagicMock()
|
|
||||||
mock_resp.is_success = True
|
|
||||||
mock_resp.json.return_value = {"running": True, "url": "http://heimdall:8000"}
|
|
||||||
mock_http.return_value.__aenter__.return_value.post = AsyncMock(return_value=mock_resp)
|
|
||||||
|
|
||||||
resp = client.post("/api/services/vllm/allocate", json={
|
|
||||||
"model_candidates": ["Ouro-1.4B"],
|
|
||||||
"ttl_s": 300.0,
|
|
||||||
"caller": "test",
|
|
||||||
})
|
|
||||||
|
|
||||||
assert resp.status_code == 200
|
|
||||||
allocation_id = resp.json()["allocation_id"]
|
|
||||||
|
|
||||||
status_resp = client.get("/api/services/vllm/status")
|
|
||||||
assert status_resp.status_code == 200
|
|
||||||
status_data = status_resp.json()
|
|
||||||
assert status_data["service"] == "vllm"
|
|
||||||
alloc_ids = [a["allocation_id"] for a in status_data["allocations"]]
|
|
||||||
assert allocation_id in alloc_ids
|
|
||||||
|
|
||||||
|
|
||||||
def test_release_allocation(alloc_client):
|
|
||||||
client, sup, sr = alloc_client
|
|
||||||
with patch("httpx.AsyncClient") as mock_http:
|
|
||||||
mock_resp = MagicMock()
|
|
||||||
mock_resp.is_success = True
|
|
||||||
mock_resp.json.return_value = {"running": True, "url": "http://heimdall:8000"}
|
|
||||||
mock_http.return_value.__aenter__.return_value.post = AsyncMock(return_value=mock_resp)
|
|
||||||
|
|
||||||
resp = client.post("/api/services/vllm/allocate", json={
|
|
||||||
"model_candidates": ["Ouro-1.4B"],
|
|
||||||
"ttl_s": 300.0,
|
|
||||||
"caller": "test",
|
|
||||||
})
|
|
||||||
|
|
||||||
assert resp.status_code == 200
|
|
||||||
allocation_id = resp.json()["allocation_id"]
|
|
||||||
|
|
||||||
del_resp = client.delete(f"/api/services/vllm/allocations/{allocation_id}")
|
|
||||||
assert del_resp.status_code == 200
|
|
||||||
assert del_resp.json() == {"released": True, "allocation_id": allocation_id}
|
|
||||||
|
|
||||||
status_resp = client.get("/api/services/vllm/status")
|
|
||||||
alloc_ids = [a["allocation_id"] for a in status_resp.json()["allocations"]]
|
|
||||||
assert allocation_id not in alloc_ids
|
|
||||||
|
|
||||||
|
|
||||||
def test_release_allocation_not_found(alloc_client):
|
|
||||||
client, _, sr = alloc_client
|
|
||||||
resp = client.delete("/api/services/vllm/allocations/bad-id")
|
|
||||||
assert resp.status_code == 404
|
|
||||||
|
|
@ -1,183 +0,0 @@
|
||||||
import pytest
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
from pathlib import Path
|
|
||||||
from fastapi.testclient import TestClient
|
|
||||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
|
||||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
|
||||||
from circuitforge_core.resources.profiles.schema import load_profile
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def coordinator_client():
|
|
||||||
lease_manager = LeaseManager()
|
|
||||||
lease_manager.register_gpu("heimdall", 0, 8192)
|
|
||||||
profile_registry = ProfileRegistry()
|
|
||||||
supervisor = MagicMock()
|
|
||||||
supervisor.all_nodes.return_value = [
|
|
||||||
NodeInfo(
|
|
||||||
node_id="heimdall",
|
|
||||||
agent_url="http://localhost:7701",
|
|
||||||
gpus=[GpuInfo(gpu_id=0, name="RTX 4000",
|
|
||||||
vram_total_mb=8192, vram_used_mb=0, vram_free_mb=8192)],
|
|
||||||
last_heartbeat=0.0,
|
|
||||||
)
|
|
||||||
]
|
|
||||||
supervisor.get_node_info.return_value = NodeInfo(
|
|
||||||
node_id="heimdall",
|
|
||||||
agent_url="http://localhost:7701",
|
|
||||||
gpus=[],
|
|
||||||
last_heartbeat=0.0,
|
|
||||||
)
|
|
||||||
app = create_coordinator_app(
|
|
||||||
lease_manager=lease_manager,
|
|
||||||
profile_registry=profile_registry,
|
|
||||||
agent_supervisor=supervisor,
|
|
||||||
service_registry=ServiceRegistry(),
|
|
||||||
)
|
|
||||||
return TestClient(app), lease_manager
|
|
||||||
|
|
||||||
|
|
||||||
def test_health_returns_ok(coordinator_client):
|
|
||||||
client, _ = coordinator_client
|
|
||||||
resp = client.get("/api/health")
|
|
||||||
assert resp.status_code == 200
|
|
||||||
assert resp.json()["status"] == "ok"
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_nodes_returns_list(coordinator_client):
|
|
||||||
client, _ = coordinator_client
|
|
||||||
resp = client.get("/api/nodes")
|
|
||||||
assert resp.status_code == 200
|
|
||||||
nodes = resp.json()["nodes"]
|
|
||||||
assert len(nodes) == 1
|
|
||||||
assert nodes[0]["node_id"] == "heimdall"
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_profiles_returns_public_profiles(coordinator_client):
|
|
||||||
client, _ = coordinator_client
|
|
||||||
resp = client.get("/api/profiles")
|
|
||||||
assert resp.status_code == 200
|
|
||||||
names = [p["name"] for p in resp.json()["profiles"]]
|
|
||||||
assert "single-gpu-8gb" in names
|
|
||||||
|
|
||||||
|
|
||||||
def test_post_lease_grants_lease(coordinator_client):
|
|
||||||
client, _ = coordinator_client
|
|
||||||
resp = client.post("/api/leases", json={
|
|
||||||
"node_id": "heimdall", "gpu_id": 0,
|
|
||||||
"mb": 2048, "service": "peregrine", "priority": 1,
|
|
||||||
})
|
|
||||||
assert resp.status_code == 200
|
|
||||||
data = resp.json()
|
|
||||||
assert data["lease"]["mb_granted"] == 2048
|
|
||||||
assert data["lease"]["holder_service"] == "peregrine"
|
|
||||||
assert "lease_id" in data["lease"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_delete_lease_releases_it(coordinator_client):
|
|
||||||
client, _ = coordinator_client
|
|
||||||
resp = client.post("/api/leases", json={
|
|
||||||
"node_id": "heimdall", "gpu_id": 0,
|
|
||||||
"mb": 2048, "service": "peregrine", "priority": 1,
|
|
||||||
})
|
|
||||||
lease_id = resp.json()["lease"]["lease_id"]
|
|
||||||
del_resp = client.delete(f"/api/leases/{lease_id}")
|
|
||||||
assert del_resp.status_code == 200
|
|
||||||
assert del_resp.json()["released"] is True
|
|
||||||
|
|
||||||
|
|
||||||
def test_delete_unknown_lease_returns_404(coordinator_client):
    """DELETE on a lease id that was never granted yields 404."""
    api, _ = coordinator_client
    response = api.delete("/api/leases/nonexistent-id")
    assert response.status_code == 404


def test_get_leases_returns_active_leases(coordinator_client):
    """A granted lease shows up in the GET /api/leases listing."""
    api, _ = coordinator_client
    payload = {
        "node_id": "heimdall",
        "gpu_id": 0,
        "mb": 1024,
        "service": "kiwi",
        "priority": 2,
    }
    api.post("/api/leases", json=payload)
    response = api.get("/api/leases")
    assert response.status_code == 200
    assert len(response.json()["leases"]) == 1


def test_dashboard_serves_html(coordinator_client):
    """GET / returns the dashboard HTML page."""
    api, _ = coordinator_client
    response = api.get("/")
    assert response.status_code == 200
    assert "text/html" in response.headers["content-type"]
    # Check key structural markers only — exact markup is not part of the contract.
    for marker in ("cf-orch", "/api/nodes", "/api/leases"):
        assert marker in response.text


def test_online_agents_excludes_offline():
    """online_agents() filters out agents whose online flag is False."""
    manager = LeaseManager()
    supervisor = AgentSupervisor(manager)
    supervisor.register("online_node", "http://a:7701")
    supervisor.register("offline_node", "http://b:7701")
    # Flip the internal flags directly rather than simulating heartbeats.
    supervisor._agents["online_node"].online = True
    supervisor._agents["offline_node"].online = False
    online = supervisor.online_agents()
    assert "online_node" in online
    assert "offline_node" not in online


def test_resident_keys_returns_set_of_node_service():
    """resident_keys() builds 'node:service' strings from registered residents."""
    manager = LeaseManager()
    residents = [("vllm", "Ouro-1.4B"), ("ollama", None)]
    manager.set_residents_for_node("heimdall", residents)
    assert manager.resident_keys() == {"heimdall:vllm", "heimdall:ollama"}


def test_single_gpu_8gb_profile_has_idle_stop_after_s():
    """The shipped single-gpu-8gb profile gives vllm an idle-stop timeout."""
    profile_path = Path(
        "circuitforge_core/resources/profiles/public/single-gpu-8gb.yaml"
    )
    profile = load_profile(profile_path)
    vllm_service = profile.services.get("vllm")
    assert vllm_service is not None
    assert hasattr(vllm_service, "idle_stop_after_s")
    assert vllm_service.idle_stop_after_s == 600
|
|
||||||
|
|
||||||
|
|
||||||
def test_ensure_service_returns_503_when_vram_too_low():
    """VRAM pre-flight guard fires before any HTTP request when free VRAM < service max_mb."""
    # Threshold = full max_mb (not half); 100 MB free on any profile triggers 503.
    lease_manager = LeaseManager()
    lease_manager.register_gpu("low-vram-node", 0, 512)
    profile_registry = ProfileRegistry()
    # Supervisor is fully mocked: the node reports only 100 MB free VRAM.
    supervisor = MagicMock()
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="low-vram-node",
        agent_url="http://localhost:7701",
        gpus=[GpuInfo(gpu_id=0, name="GTX 1050",
                      vram_total_mb=512, vram_used_mb=412, vram_free_mb=100)],
        last_heartbeat=0.0,
    )
    supervisor.all_nodes.return_value = []
    # Assemble a real coordinator app around the mocked supervisor.
    app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=supervisor,
        service_registry=ServiceRegistry(),
    )
    client = TestClient(app)

    resp = client.post("/api/services/vllm/ensure", json={
        "node_id": "low-vram-node",
        "gpu_id": 0,
        "params": {"model": "some-model"},
    })

    assert resp.status_code == 503
    assert "Insufficient VRAM" in resp.json()["detail"]
    # Guard must fire before any agent HTTP call is attempted.
    supervisor.get_node_info.assert_called_once_with("low-vram-node")
|
|
||||||
|
|
@ -1,148 +0,0 @@
|
||||||
"""Tests for HeimdallAuthMiddleware — TTL cache and request gating."""
|
|
||||||
import time
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import patch, MagicMock
|
|
||||||
from fastapi import FastAPI
|
|
||||||
from fastapi.testclient import TestClient
|
|
||||||
|
|
||||||
from circuitforge_core.resources.coordinator.auth import (
|
|
||||||
HeimdallAuthMiddleware,
|
|
||||||
_ValidationCache,
|
|
||||||
CACHE_TTL_S,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ── Cache unit tests ──────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def test_cache_miss_returns_none():
    """An unknown key is a miss."""
    cache = _ValidationCache()
    assert cache.get("nonexistent") is None


def test_cache_stores_and_retrieves():
    """A freshly stored entry comes back with its fields intact."""
    cache = _ValidationCache()
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    hit = cache.get("key1")
    assert hit is not None
    assert hit.valid is True
    assert hit.tier == "paid"


def test_cache_entry_expires():
    """Entries older than the TTL read back as misses."""
    cache = _ValidationCache(ttl_s=0.05)
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    time.sleep(0.1)  # age the entry past the 50 ms TTL
    assert cache.get("key1") is None


def test_cache_evict_removes_key():
    """Explicit eviction removes a live entry."""
    cache = _ValidationCache()
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    cache.evict("key1")
    assert cache.get("key1") is None


def test_cache_prune_removes_expired():
    """prune() drops every expired entry and reports how many it removed."""
    cache = _ValidationCache(ttl_s=0.05)
    for key in ("k1", "k2"):
        cache.set(key, valid=True, tier="paid", user_id="")
    time.sleep(0.1)
    assert cache.prune() == 2
|
|
||||||
|
|
||||||
|
|
||||||
# ── Middleware integration tests ──────────────────────────────────────────────
|
|
||||||
|
|
||||||
def _make_app_with_auth(middleware: HeimdallAuthMiddleware) -> TestClient:
    """Wrap *middleware* around a two-route FastAPI app and return a test client."""
    application = FastAPI()
    application.middleware("http")(middleware)

    @application.get("/api/health")
    def _health_endpoint():
        return {"status": "ok"}

    @application.post("/api/services/vllm/allocate")
    def _allocate_endpoint():
        return {"allocation_id": "abc", "url": "http://gpu:8000"}

    # raise_server_exceptions=False so middleware error responses come back
    # as HTTP responses instead of bubbling up as exceptions.
    return TestClient(application, raise_server_exceptions=False)


def _patched_middleware(valid: bool, tier: str = "paid") -> HeimdallAuthMiddleware:
    """Return a middleware whose Heimdall validation call is pre-mocked."""
    middleware = HeimdallAuthMiddleware(
        heimdall_url="http://heimdall.test",
        min_tier="paid",
    )
    user_id = "user-1" if valid else ""
    middleware._validate_against_heimdall = MagicMock(  # type: ignore[method-assign]
        return_value=(valid, tier, user_id)
    )
    return middleware
|
|
||||||
|
|
||||||
|
|
||||||
def test_health_exempt_no_auth_required():
    """/api/health is reachable without any Authorization header."""
    tc = _make_app_with_auth(_patched_middleware(valid=True))
    assert tc.get("/api/health").status_code == 200


def test_missing_auth_header_returns_401():
    """Protected routes reject requests that carry no credentials."""
    tc = _make_app_with_auth(_patched_middleware(valid=True))
    assert tc.post("/api/services/vllm/allocate").status_code == 401


def test_invalid_key_returns_403():
    """A key Heimdall rejects yields 403, not 401."""
    tc = _make_app_with_auth(_patched_middleware(valid=False))
    response = tc.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer BAD-KEY"},
    )
    assert response.status_code == 403


def test_valid_paid_key_passes():
    """A valid key at the required tier reaches the route handler."""
    tc = _make_app_with_auth(_patched_middleware(valid=True, tier="paid"))
    response = tc.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer CFG-KIWI-GOOD-GOOD-GOOD"},
    )
    assert response.status_code == 200


def test_free_tier_key_rejected_when_min_is_paid():
    """A valid but under-tier key is refused with the required tier in the detail."""
    tc = _make_app_with_auth(_patched_middleware(valid=True, tier="free"))
    response = tc.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer CFG-KIWI-FREE-FREE-FREE"},
    )
    assert response.status_code == 403
    assert "paid" in response.json()["detail"]


def test_cache_prevents_second_heimdall_call():
    """The second request with the same key is served from the TTL cache."""
    middleware = _patched_middleware(valid=True, tier="paid")
    tc = _make_app_with_auth(middleware)
    headers = {"Authorization": "Bearer CFG-KIWI-CACHED-KEY-1"}
    for _ in range(2):
        tc.post("/api/services/vllm/allocate", headers=headers)
    # Heimdall should only have been consulted once — second hit is cached.
    assert middleware._validate_against_heimdall.call_count == 1  # type: ignore[attr-defined]


def test_from_env_returns_none_without_heimdall_url(monkeypatch):
    """from_env() is a no-op when HEIMDALL_URL is not configured."""
    monkeypatch.delenv("HEIMDALL_URL", raising=False)
    assert HeimdallAuthMiddleware.from_env() is None


def test_from_env_returns_middleware_when_set(monkeypatch):
    """from_env() builds a middleware pointed at the configured URL."""
    monkeypatch.setenv("HEIMDALL_URL", "http://heimdall.test")
    middleware = HeimdallAuthMiddleware.from_env()
    assert middleware is not None
    assert middleware._heimdall == "http://heimdall.test"
|
|
||||||
|
|
@ -1,215 +0,0 @@
|
||||||
# tests/test_resources/test_coordinator_probe.py
|
|
||||||
"""
|
|
||||||
Unit tests for _run_instance_probe_loop in coordinator/app.py.
|
|
||||||
|
|
||||||
Covers:
|
|
||||||
- healthy path: /health → 200 → state transitions starting → running
|
|
||||||
- timeout path: no healthy response within _PROBE_TIMEOUT_S → starting → stopped
|
|
||||||
- cleanup path: non-starting instance cleans up its start_times entry
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from circuitforge_core.resources.coordinator.app import (
|
|
||||||
_PROBE_TIMEOUT_S,
|
|
||||||
_run_instance_probe_loop,
|
|
||||||
)
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import ServiceInstance, ServiceRegistry
|
|
||||||
|
|
||||||
|
|
||||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def _inst(**overrides) -> ServiceInstance:
    """Build a ServiceInstance with sensible defaults, overridable per test."""
    fields = dict(
        service="vllm", node_id="node1", gpu_id=0,
        state="starting", model="qwen", url="http://localhost:8000",
    )
    fields.update(overrides)
    return ServiceInstance(**fields)


def _registry(*instances: ServiceInstance) -> MagicMock:
    """Mock ServiceRegistry whose all_instances() returns the given instances."""
    registry = MagicMock(spec=ServiceRegistry)
    registry.all_instances.return_value = list(instances)
    return registry


def _health_resp(status: int = 200) -> MagicMock:
    """Context-manager mock that simulates an HTTP response with *status*."""
    response = MagicMock()
    response.status = status
    response.__enter__ = lambda self: response
    response.__exit__ = MagicMock(return_value=False)
    return response
|
|
||||||
|
|
||||||
|
|
||||||
async def _one_tick(coro_fn, registry, *, time_val: float = 1000.0, **url_patch):
    """
    Run the probe loop for exactly one iteration then cancel it.

    asyncio.sleep is patched to return immediately on the first call
    and raise CancelledError on the second (ending the loop cleanly).

    Extra keyword arguments are forwarded to the urllib.request.urlopen
    patch (e.g. return_value=..., side_effect=...); with none given,
    urlopen is left unpatched.
    """
    calls = 0

    async def _fake_sleep(_delay):
        nonlocal calls
        calls += 1
        if calls > 1:
            raise asyncio.CancelledError()

    patches = [
        patch("asyncio.sleep", new=_fake_sleep),
        patch("time.time", return_value=time_val),
    ]
    if url_patch:
        patches.append(patch("urllib.request.urlopen", **url_patch))

    # Use patcher.start()/stop() rather than driving __enter__/__exit__ by
    # hand: the original kept an unused list of __enter__ results and always
    # passed (None, None, None) to __exit__, even with an exception in flight.
    for p in patches:
        p.start()
    try:
        await coro_fn(registry)
    except asyncio.CancelledError:
        pass
    finally:
        for p in reversed(patches):
            p.stop()
|
|
||||||
|
|
||||||
|
|
||||||
# ── tests ────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_probe_transitions_starting_to_running():
    """GET /health → 200 while in starting state → upsert_instance(state='running')."""
    reg = _registry(_inst(state="starting", url="http://localhost:8000"))

    calls = 0

    # First sleep returns immediately (one probe tick runs); second cancels the loop.
    async def fake_sleep(_delay):
        nonlocal calls
        calls += 1
        if calls > 1:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen", return_value=_health_resp(200)):
        try:
            await _run_instance_probe_loop(reg)
        except asyncio.CancelledError:
            pass

    # The healthy probe must rewrite the instance record as 'running'.
    reg.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="running", model="qwen", url="http://localhost:8000",
    )


@pytest.mark.asyncio
async def test_probe_transitions_starting_to_stopped_on_timeout():
    """No healthy response + time past _PROBE_TIMEOUT_S → upsert_instance(state='stopped').

    Tick 1: seeds start_times[key] = 1000.0
    Tick 2: time has advanced past _PROBE_TIMEOUT_S → timeout fires → stopped
    Tick 3: CancelledError exits the loop
    """
    reg = _registry(_inst(state="starting", url="http://localhost:8000"))

    tick = 0
    # Tick 1: t=1000 (seed); Tick 2: t=far_future (timeout fires)
    times = [1000.0, 1000.0 + _PROBE_TIMEOUT_S + 1.0]

    async def fake_sleep(_delay):
        nonlocal tick
        tick += 1
        if tick > 2:
            raise asyncio.CancelledError()

    # urlopen always fails, so the instance never looks healthy; the times
    # sequence is repeated because time.time may be read more than once per tick.
    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", side_effect=times * 10), \
            patch("urllib.request.urlopen", side_effect=OSError("connection refused")):
        try:
            await _run_instance_probe_loop(reg)
        except asyncio.CancelledError:
            pass

    reg.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="stopped", model="qwen", url="http://localhost:8000",
    )


@pytest.mark.asyncio
async def test_probe_cleans_up_start_times_for_non_starting():
    """
    An instance that is no longer in 'starting' state should not cause
    upsert_instance to be called, and its key should be removed from start_times.

    We verify this indirectly: run two ticks — first with state='starting' (seeds
    the key and transitions to running), second with the updated registry returning
    state='running' (should not call upsert again).
    """
    starting_inst = _inst(state="starting", url="http://localhost:8000")
    running_inst = _inst(state="running", url="http://localhost:8000")

    tick = 0

    # First tick: instance is starting → transitions to running
    # Second tick: registry now returns running → no upsert
    # Third tick: cancel
    def instances_side_effect():
        if tick <= 1:
            return [starting_inst]
        return [running_inst]

    reg = MagicMock(spec=ServiceRegistry)
    reg.all_instances.side_effect = instances_side_effect

    async def fake_sleep(_delay):
        nonlocal tick
        tick += 1
        if tick > 2:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen", return_value=_health_resp(200)):
        try:
            await _run_instance_probe_loop(reg)
        except asyncio.CancelledError:
            pass

    # upsert should have been called exactly once (the starting→running transition)
    assert reg.upsert_instance.call_count == 1
    reg.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="running", model="qwen", url="http://localhost:8000",
    )


@pytest.mark.asyncio
async def test_probe_no_url_does_not_attempt_health_check():
    """Instance with no URL stays in starting state (no health check, no timeout yet)."""
    reg = _registry(_inst(state="starting", url=None))

    tick = 0

    async def fake_sleep(_delay):
        nonlocal tick
        tick += 1
        if tick > 1:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen") as mock_urlopen:
        try:
            await _run_instance_probe_loop(reg)
        except asyncio.CancelledError:
            pass

    # No URL → nothing to probe and no state transition on the first tick.
    mock_urlopen.assert_not_called()
    reg.upsert_instance.assert_not_called()
|
|
||||||
|
|
@ -1,215 +0,0 @@
|
||||||
# tests/test_resources/test_docuvision.py
|
|
||||||
"""
|
|
||||||
Unit tests for cf-docuvision FastAPI service (circuitforge_core/resources/docuvision/app.py).
|
|
||||||
|
|
||||||
Covers:
|
|
||||||
- GET /health → status + model path
|
|
||||||
- POST /extract → image_b64, image_path, hint routing, metadata fields
|
|
||||||
- _parse_dolphin_output → JSON list path, table detection, plain-text fallback
|
|
||||||
- _image_from_request → missing both fields → 422; bad image_path → 404
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import base64
|
|
||||||
import io
|
|
||||||
import json
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
from fastapi.testclient import TestClient
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
import circuitforge_core.resources.docuvision.app as docuvision_module
|
|
||||||
from circuitforge_core.resources.docuvision.app import (
|
|
||||||
_parse_dolphin_output,
|
|
||||||
app,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ── fixtures ──────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def _make_jpeg_b64(width: int = 10, height: int = 10) -> str:
    """Return a base64-encoded white JPEG of the given dimensions."""
    canvas = Image.new("RGB", (width, height), color=(255, 255, 255))
    out = io.BytesIO()
    canvas.save(out, format="JPEG")
    return base64.b64encode(out.getvalue()).decode()


@pytest.fixture(autouse=True)
def _reset_module_state():
    """Reset module-level model state between tests."""
    docuvision_module._model = None
    docuvision_module._processor = None
    docuvision_module._model_path = "/fake/model"
    docuvision_module._device = "cpu"
    yield
    # Drop any fakes a test injected so the next test starts clean.
    docuvision_module._model = None
    docuvision_module._processor = None
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mock_model():
    """
    Inject fake model + processor into the module so _load_model() is skipped.

    The processor returns a dict-like with 'input_ids'; the model generate()
    returns a tensor-like whose decode produces a JSON string.
    """
    # Fake token-id tensor: only its shape is read (input_len = 5).
    fake_ids = MagicMock()
    fake_ids.shape = [1, 5]  # input_len = 5

    # Processor output: supports ['input_ids'] indexing and .to(device).
    fake_inputs = {"input_ids": fake_ids}
    fake_inputs_obj = MagicMock()
    fake_inputs_obj.__getitem__ = lambda self, k: fake_inputs[k]
    fake_inputs_obj.to = lambda device: fake_inputs_obj

    fake_output = MagicMock()
    fake_output.__getitem__ = lambda self, idx: MagicMock()  # output_ids[0]

    fake_model = MagicMock()
    fake_model.generate.return_value = fake_output

    # decode() yields a canned Dolphin-style payload: one heading + one table,
    # which the /extract tests assert on.
    fake_processor = MagicMock()
    fake_processor.return_value = fake_inputs_obj
    fake_processor.decode.return_value = json.dumps([
        {"type": "heading", "text": "Invoice", "bbox": [0.0, 0.0, 1.0, 0.1]},
        {"type": "table", "text": "row1", "html": "<table><tr><td>row1</td></tr></table>",
         "bbox": [0.0, 0.1, 1.0, 0.5]},
    ])

    # Install the fakes at module level; the autouse fixture clears them after.
    docuvision_module._model = fake_model
    docuvision_module._processor = fake_processor
    return fake_model, fake_processor
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def client():
    """HTTP test client bound to the docuvision FastAPI app."""
    return TestClient(app)


# ── health ────────────────────────────────────────────────────────────────────

def test_health_returns_ok(client):
    """/health reports service status plus the configured model path."""
    response = client.get("/health")
    assert response.status_code == 200
    body = response.json()
    assert body["status"] == "ok"
    assert body["model"] == "/fake/model"
|
|
||||||
|
|
||||||
|
|
||||||
# ── _parse_dolphin_output ────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def test_parse_json_list_elements():
    """A JSON list of typed elements becomes element objects in order."""
    payload = json.dumps([
        {"type": "heading", "text": "Title"},
        {"type": "paragraph", "text": "Body text"},
    ])
    elements, tables, raw_text = _parse_dolphin_output(payload)
    assert len(elements) == 2
    assert (elements[0].type, elements[0].text) == ("heading", "Title")
    assert elements[1].type == "paragraph"
    assert raw_text == "Title\nBody text"
    assert tables == []


def test_parse_json_table_extracted():
    """Table elements surface both in elements and in the tables list."""
    table_html = "<table><tr><td>A</td></tr></table>"
    payload = json.dumps([
        {"type": "table", "text": "row", "html": table_html,
         "bbox": [0.0, 0.0, 1.0, 0.5]},
    ])
    elements, tables, _ = _parse_dolphin_output(payload)
    assert len(tables) == 1
    assert tables[0].html == table_html
    assert tables[0].bbox == [0.0, 0.0, 1.0, 0.5]
    assert len(elements) == 1
    assert elements[0].type == "table"


def test_parse_plain_text_fallback():
    """Non-JSON output degrades to a single paragraph element."""
    blob = "This is not JSON at all."
    elements, tables, raw_text = _parse_dolphin_output(blob)
    assert len(elements) == 1
    assert elements[0].type == "paragraph"
    assert elements[0].text == blob
    assert tables == []
    assert raw_text == blob


def test_parse_empty_string_fallback():
    """Empty output still yields one (empty) paragraph element."""
    elements, _, _ = _parse_dolphin_output("")
    assert len(elements) == 1
    assert elements[0].type == "paragraph"
    assert elements[0].text == ""


def test_parse_json_missing_type_defaults_to_paragraph():
    """Elements without a 'type' field are treated as paragraphs."""
    elements, _, _ = _parse_dolphin_output(json.dumps([{"text": "no type field"}]))
    assert elements[0].type == "paragraph"
|
|
||||||
|
|
||||||
|
|
||||||
# ── POST /extract ─────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def test_extract_image_b64(client, mock_model):
    """Happy path: inline base64 image produces the full response shape."""
    response = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "auto"})
    assert response.status_code == 200
    body = response.json()
    for field in ("elements", "raw_text", "tables"):
        assert field in body
    meta = body["metadata"]
    assert meta["hint"] == "auto"
    assert meta["model"] == "/fake/model"
    assert meta["width"] == 10
    assert meta["height"] == 10


def test_extract_hint_table_routes_correct_prompt(client, mock_model):
    """hint='table' routes to the table-specific prompt."""
    _, fake_processor = mock_model
    response = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "table"})
    assert response.status_code == 200
    # Verify the processor was invoked with the table-specific prompt.
    processor_call = fake_processor.call_args
    assert "table" in processor_call.kwargs.get("text", "") or \
        "table" in str(processor_call)


def test_extract_hint_unknown_falls_back_to_auto(client, mock_model):
    """An unrecognised hint silently falls back to the 'auto' prompt."""
    response = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "nonsense"})
    assert response.status_code == 200


def test_extract_image_path(tmp_path, client, mock_model):
    """An on-disk image referenced by path is loaded and measured."""
    image_file = tmp_path / "doc.png"
    Image.new("RGB", (8, 8), color=(0, 0, 0)).save(image_file)
    response = client.post("/extract", json={"image_path": str(image_file)})
    assert response.status_code == 200
    assert response.json()["metadata"]["width"] == 8


def test_extract_image_path_not_found(client, mock_model):
    """A dangling image_path is reported as 404."""
    response = client.post("/extract", json={"image_path": "/nonexistent/path/img.png"})
    assert response.status_code == 404


def test_extract_no_image_raises_422(client, mock_model):
    """Omitting both image_b64 and image_path is a validation error."""
    assert client.post("/extract", json={"hint": "auto"}).status_code == 422


def test_extract_response_includes_tables(client, mock_model):
    """Verify table objects surface in response when model returns table elements."""
    response = client.post("/extract", json={"image_b64": _make_jpeg_b64()})
    assert response.status_code == 200
    body = response.json()
    assert len(body["tables"]) == 1
    assert "<table>" in body["tables"][0]["html"]


def test_extract_device_in_metadata(client, mock_model):
    """The inference device is always reported in metadata."""
    response = client.post("/extract", json={"image_b64": _make_jpeg_b64()})
    assert response.status_code == 200
    assert "device" in response.json()["metadata"]
|
|
||||||
|
|
@ -1,67 +0,0 @@
|
||||||
import asyncio
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import AsyncMock, patch
|
|
||||||
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def lease_manager():
    """LeaseManager with one 8 GB GPU registered on node 'heimdall'."""
    manager = LeaseManager()
    manager.register_gpu("heimdall", 0, 8192)
    return manager


@pytest.fixture
def engine(lease_manager):
    """EvictionEngine over the fixture manager with a short eviction timeout."""
    return EvictionEngine(lease_manager=lease_manager, eviction_timeout_s=0.1)


@pytest.mark.asyncio
async def test_request_lease_grants_when_vram_available(engine, lease_manager):
    """A request that fits in free VRAM is granted immediately."""
    granted = await engine.request_lease(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="peregrine", priority=1,
        agent_url="http://localhost:7701",
    )
    assert granted is not None
    assert granted.mb_granted == 4096
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_request_lease_evicts_and_grants(engine, lease_manager):
    """A higher-priority request evicts a lower-priority lease and is granted.

    The agent-side eviction is mocked; the actual lease release is simulated
    by scheduling it on the event loop shortly after the request starts waiting.
    """
    # Pre-fill with a low-priority lease
    big_lease = await lease_manager.try_grant(
        "heimdall", 0, 7000, "comfyui", priority=4
    )
    assert big_lease is not None

    # Mock the agent eviction call
    with patch(
        "circuitforge_core.resources.coordinator.eviction_engine.EvictionEngine._call_agent_evict",
        new_callable=AsyncMock,
    ) as mock_evict:
        mock_evict.return_value = True
        # Simulate the comfyui lease being released (as if the agent evicted it).
        # get_running_loop() is the correct call inside a coroutine —
        # get_event_loop() here has been deprecated since Python 3.10.
        asyncio.get_running_loop().call_later(
            0.05, lambda: asyncio.ensure_future(lease_manager.release(big_lease.lease_id))
        )
        lease = await engine.request_lease(
            node_id="heimdall", gpu_id=0, mb=4096,
            service="peregrine", priority=1,
            agent_url="http://localhost:7701",
        )
    assert lease is not None
    assert lease.holder_service == "peregrine"
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_request_lease_returns_none_when_no_eviction_candidates(engine):
    """With nothing lower-priority to evict, the request is refused (None)."""
    # Occupy most of the GPU with a holder at higher priority than the requester.
    await engine.lease_manager.try_grant("heimdall", 0, 6000, "vllm", priority=1)
    # Requesting 4GB but no lower-priority leases exist
    refused = await engine.request_lease(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="kiwi", priority=2,
        agent_url="http://localhost:7701",
    )
    assert refused is None
|
|
||||||
|
|
@ -1,43 +0,0 @@
|
||||||
import signal
|
|
||||||
from unittest.mock import patch, call
|
|
||||||
import pytest
|
|
||||||
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor, EvictionResult
|
|
||||||
|
|
||||||
|
|
||||||
def test_evict_by_pid_sends_sigterm_then_sigkill():
    """If the process survives the grace period, SIGKILL follows SIGTERM."""
    executor = EvictionExecutor(grace_period_s=0.01)
    # pid_exists always True → grace period expires → SIGKILL fires
    with patch("os.kill") as mock_kill, \
            patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.return_value = True
        outcome = executor.evict_pid(pid=1234, grace_period_s=0.01)

    assert outcome.success is True
    sent = mock_kill.call_args_list
    assert call(1234, signal.SIGTERM) in sent
    assert call(1234, signal.SIGKILL) in sent


def test_evict_pid_succeeds_on_sigterm_alone():
    """A process that exits within the grace period needs no SIGKILL."""
    executor = EvictionExecutor(grace_period_s=0.1)
    with patch("os.kill"), \
            patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        # Gone on the second existence check, i.e. right after SIGTERM.
        mock_psutil.pid_exists.side_effect = [True, False]
        outcome = executor.evict_pid(pid=5678, grace_period_s=0.01)
    assert outcome.success is True
    assert outcome.method == "sigterm"


def test_evict_pid_not_found_returns_failure():
    """Evicting a nonexistent pid fails with an explanatory message."""
    executor = EvictionExecutor()
    with patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.return_value = False
        outcome = executor.evict_pid(pid=9999)
    assert outcome.success is False
    assert "not found" in outcome.message.lower()


def test_eviction_result_is_immutable():
    """EvictionResult rejects attribute assignment after construction."""
    outcome = EvictionResult(success=True, method="sigterm", message="ok")
    with pytest.raises((AttributeError, TypeError)):
        outcome.success = False  # type: ignore
|
|
||||||
|
|
@ -1,60 +0,0 @@
|
||||||
from unittest.mock import patch
|
|
||||||
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
|
|
||||||
|
|
||||||
|
|
||||||
# Two-GPU sample in nvidia-smi CSV form. Columns (inferred from the parse
# assertions below — confirm against GpuMonitor's query string):
# index, name, memory total MB, memory used MB, memory free MB
SAMPLE_NVIDIA_SMI_OUTPUT = (
    "0, Quadro RTX 4000, 8192, 6843, 1349\n"
    "1, Quadro RTX 4000, 8192, 721, 7471\n"
)
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_returns_list_of_gpu_info():
|
|
||||||
monitor = GpuMonitor()
|
|
||||||
with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
|
|
||||||
mock_run.return_value.returncode = 0
|
|
||||||
mock_run.return_value.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
|
|
||||||
gpus = monitor.poll()
|
|
||||||
assert len(gpus) == 2
|
|
||||||
assert gpus[0].gpu_id == 0
|
|
||||||
assert gpus[0].name == "Quadro RTX 4000"
|
|
||||||
assert gpus[0].vram_total_mb == 8192
|
|
||||||
assert gpus[0].vram_used_mb == 6843
|
|
||||||
assert gpus[0].vram_free_mb == 1349
|
|
||||||
|
|
||||||
|
|
||||||
def test_parse_second_gpu():
|
|
||||||
monitor = GpuMonitor()
|
|
||||||
with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
|
|
||||||
mock_run.return_value.returncode = 0
|
|
||||||
mock_run.return_value.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
|
|
||||||
gpus = monitor.poll()
|
|
||||||
assert gpus[1].gpu_id == 1
|
|
||||||
assert gpus[1].vram_used_mb == 721
|
|
||||||
assert gpus[1].vram_free_mb == 7471
|
|
||||||
|
|
||||||
|
|
||||||
def test_poll_returns_empty_list_when_nvidia_smi_unavailable():
    """A missing nvidia-smi binary (FileNotFoundError) degrades to an empty list."""
    monitor = GpuMonitor()
    target = "circuitforge_core.resources.agent.gpu_monitor.subprocess.run"
    with patch(target, side_effect=FileNotFoundError):
        assert monitor.poll() == []
|
|
||||||
|
|
||||||
|
|
||||||
def test_poll_returns_empty_list_on_nonzero_exit():
    """A failing nvidia-smi invocation (exit code != 0) yields no GPUs."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as run_mock:
        run_mock.return_value.returncode = 1
        run_mock.return_value.stdout = ""
        assert monitor.poll() == []
|
|
||||||
|
|
||||||
|
|
||||||
def test_poll_skips_malformed_lines():
    """Rows with unparseable numeric fields are dropped; valid rows survive."""
    monitor = GpuMonitor()
    malformed = "0, RTX 4000, 8192, not_a_number, 1024\n1, RTX 4000, 8192, 512, 7680\n"
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as run_mock:
        run_mock.return_value.returncode = 0
        run_mock.return_value.stdout = malformed
        gpus = monitor.poll()

    assert len(gpus) == 1
    assert gpus[0].gpu_id == 1
|
|
||||||
|
|
@ -1,221 +0,0 @@
|
||||||
"""Integration test: full lease → eviction → re-grant cycle.
|
|
||||||
|
|
||||||
Runs coordinator in-process (no subprocesses, no real nvidia-smi).
|
|
||||||
Uses TestClient for HTTP, mocks AgentSupervisor to return fixed node state.
|
|
||||||
"""
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
from fastapi.testclient import TestClient
|
|
||||||
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
|
||||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
|
||||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def system():
    """Create an in-process coordinator system with 8GB GPU and mock supervisor."""
    # Real lease bookkeeping, seeded with a single local GPU of 8192 MB.
    lease_manager = LeaseManager()
    lease_manager.register_gpu("local", 0, 8192)

    # The supervisor is fully mocked: one node with one idle 8 GB GPU.
    mock_supervisor = MagicMock(spec=AgentSupervisor)
    mock_supervisor.all_nodes.return_value = [
        NodeInfo(
            node_id="local",
            agent_url="http://localhost:7701",
            gpus=[GpuInfo(
                gpu_id=0,
                name="RTX 4000",
                vram_total_mb=8192,
                vram_used_mb=0,
                vram_free_mb=8192,
            )],
            last_heartbeat=0.0,
        )
    ]
    # get_node_info deliberately returns a node with no GPUs; the lease tests
    # below rely on all_nodes(), not per-node detail.
    mock_supervisor.get_node_info.return_value = NodeInfo(
        node_id="local",
        agent_url="http://localhost:7701",
        gpus=[],
        last_heartbeat=0.0,
    )

    profile_registry = ProfileRegistry()
    app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=mock_supervisor,
        service_registry=ServiceRegistry(),
    )
    client = TestClient(app)
    # Tests receive both the HTTP client and the lease manager for direct inspection.
    return client, lease_manager
|
|
||||||
|
|
||||||
|
|
||||||
def test_full_lease_cycle(system):
    """Grant a lease, see it listed, release it, see it disappear."""
    client, _ = system

    def active_ids():
        listing = client.get("/api/leases")
        assert listing.status_code == 200
        return [entry["lease_id"] for entry in listing.json()["leases"]]

    # Grant a lease.
    grant = client.post("/api/leases", json={
        "node_id": "local",
        "gpu_id": 0,
        "mb": 4096,
        "service": "peregrine",
        "priority": 1,
    })
    assert grant.status_code == 200
    granted = grant.json()["lease"]
    lease_id = granted["lease_id"]
    assert granted["mb_granted"] == 4096
    assert granted["holder_service"] == "peregrine"

    # It appears in the active set.
    assert lease_id in active_ids()

    # Release it.
    release = client.delete(f"/api/leases/{lease_id}")
    assert release.status_code == 200
    assert release.json()["released"] is True

    # And it is gone.
    assert lease_id not in active_ids()
|
|
||||||
|
|
||||||
|
|
||||||
def test_vram_exhaustion_returns_503(system):
    """With the GPU saturated by an equal-priority lease, a new request gets 503."""
    client, _ = system

    def request_lease(service, mb):
        return client.post("/api/leases", json={
            "node_id": "local",
            "gpu_id": 0,
            "mb": mb,
            "service": service,
            "priority": 1,
        })

    # Saturate GPU 0 with a high-priority lease.
    assert request_lease("vllm", 8000).status_code == 200

    # Same priority means no eviction candidates, so this must be refused.
    denied = request_lease("kiwi", 2000)
    assert denied.status_code == 503
    assert "Insufficient VRAM" in denied.json()["detail"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_auto_detect_profile_for_8gb():
    """An 8192 MB GPU auto-detects to the single-gpu-8gb profile."""
    registry = ProfileRegistry()
    eight_gb = GpuInfo(
        gpu_id=0,
        name="RTX 4000",
        vram_total_mb=8192,
        vram_used_mb=0,
        vram_free_mb=8192,
    )
    detected = registry.auto_detect([eight_gb])
    assert detected.name == "single-gpu-8gb"
    # The detected profile carries service configuration.
    assert hasattr(detected, "services")
|
|
||||||
|
|
||||||
|
|
||||||
def test_node_endpoint_shows_nodes(system):
    """GET /api/nodes reflects the single mocked node and its GPU."""
    client, _ = system
    response = client.get("/api/nodes")
    assert response.status_code == 200

    nodes = response.json()["nodes"]
    assert len(nodes) == 1
    node = nodes[0]
    assert node["node_id"] == "local"
    assert node["agent_url"] == "http://localhost:7701"
    assert len(node["gpus"]) == 1
    assert node["gpus"][0]["name"] == "RTX 4000"
|
|
||||||
|
|
||||||
|
|
||||||
def test_profiles_endpoint_returns_public_profiles(system):
    """GET /api/profiles lists the standard public profiles."""
    client, _ = system
    response = client.get("/api/profiles")
    assert response.status_code == 200
    listed = {p["name"] for p in response.json()["profiles"]}
    # The common public single-GPU profiles must all be present.
    assert {"single-gpu-8gb", "single-gpu-6gb", "single-gpu-2gb"} <= listed
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_leases_tracked_independently(system):
    """Two concurrent leases are tracked and released independently."""
    client, _ = system

    def grant(service):
        response = client.post("/api/leases", json={
            "node_id": "local",
            "gpu_id": 0,
            "mb": 2048,
            "service": service,
            "priority": 2,
        })
        assert response.status_code == 200
        return response.json()["lease"]["lease_id"]

    def active_ids():
        return [entry["lease_id"] for entry in client.get("/api/leases").json()["leases"]]

    first = grant("peregrine")
    second = grant("kiwi")

    # Both leases are live.
    ids = active_ids()
    assert first in ids
    assert second in ids
    assert len(ids) == 2

    # Releasing the first leaves only the second.
    assert client.delete(f"/api/leases/{first}").status_code == 200
    ids = active_ids()
    assert first not in ids
    assert second in ids
    assert len(ids) == 1
|
|
||||||
|
|
||||||
|
|
||||||
def test_delete_nonexistent_lease_returns_404(system):
    """Deleting an unknown lease id yields 404 with a 'not found' detail."""
    client, _ = system
    response = client.delete("/api/leases/nonexistent-lease-id")
    assert response.status_code == 404
    assert "not found" in response.json()["detail"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_health_endpoint_returns_ok(system):
    """GET /api/health reports status ok."""
    client, _ = system
    response = client.get("/api/health")
    assert response.status_code == 200
    assert response.json()["status"] == "ok"
|
|
||||||
|
|
@ -1,85 +0,0 @@
|
||||||
import pytest
|
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def mgr():
    """LeaseManager with one registered 8192 MB GPU on node 'heimdall'."""
    manager = LeaseManager()
    manager.register_gpu(node_id="heimdall", gpu_id=0, total_mb=8192)
    return manager
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_grant_succeeds_when_vram_available(mgr):
    """A request that fits in free VRAM is granted with matching metadata."""
    granted = await mgr.try_grant(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="peregrine", priority=1,
    )
    assert granted is not None
    assert granted.mb_granted == 4096
    assert granted.node_id == "heimdall"
    assert granted.gpu_id == 0
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_grant_fails_when_vram_insufficient(mgr):
    """A second request that would overflow the GPU is denied (None)."""
    await mgr.try_grant(
        node_id="heimdall", gpu_id=0, mb=7000, service="vllm", priority=1
    )
    denied = await mgr.try_grant(
        node_id="heimdall", gpu_id=0, mb=2000, service="kiwi", priority=2
    )
    assert denied is None
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_release_frees_vram(mgr):
    """Releasing a lease returns its VRAM to the pool for later grants."""
    first = await mgr.try_grant(
        node_id="heimdall", gpu_id=0, mb=7000, service="vllm", priority=1
    )
    assert first is not None
    assert await mgr.release(first.lease_id) is True
    # The freed 7000 MB can immediately be re-granted.
    second = await mgr.try_grant(
        node_id="heimdall", gpu_id=0, mb=7000, service="comfyui", priority=4
    )
    assert second is not None
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_release_unknown_lease_returns_false(mgr):
    """Releasing an id that was never granted is a no-op returning False."""
    assert await mgr.release("nonexistent-id") is False
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_get_eviction_candidates_returns_lower_priority_leases(mgr):
    """Only leases with lower priority than the requester are evictable."""
    # priority 4 (lower) is evictable by a priority-2 requester; priority 1 is not.
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=3000,
                        service="comfyui", priority=4)
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=2000,
                        service="ollama", priority=1)
    candidates = mgr.get_eviction_candidates(
        node_id="heimdall", gpu_id=0,
        needed_mb=3000, requester_priority=2,
    )
    assert [c.holder_service for c in candidates] == ["comfyui"]
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_list_leases_for_gpu(mgr):
    """list_leases returns every active lease on the given GPU."""
    for service, mb, priority in (("peregrine", 1024, 1), ("kiwi", 512, 2)):
        await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=mb,
                            service=service, priority=priority)
    assert len(mgr.list_leases(node_id="heimdall", gpu_id=0)) == 2
|
|
||||||
|
|
||||||
|
|
||||||
def test_register_gpu_sets_total(mgr):
    """register_gpu records the GPU's total VRAM capacity."""
    assert mgr.gpu_total_mb("heimdall", 0) == 8192
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
async def test_used_mb_tracks_grants():
    """used_mb is the running sum of all granted allocations."""
    manager = LeaseManager()
    manager.register_gpu("heimdall", 0, 8192)
    await manager.try_grant("heimdall", 0, 3000, "a", 1)
    await manager.try_grant("heimdall", 0, 2000, "b", 2)
    assert manager.used_mb("heimdall", 0) == 3000 + 2000
|
|
||||||
|
|
@ -1,47 +0,0 @@
|
||||||
import time
|
|
||||||
import pytest
|
|
||||||
from circuitforge_core.resources.models import VRAMLease, GpuInfo, NodeInfo
|
|
||||||
|
|
||||||
|
|
||||||
def test_vram_lease_create_assigns_unique_ids():
    """Two otherwise-identical leases still receive distinct lease_ids."""
    kwargs = dict(gpu_id=0, node_id="heimdall", mb=4096,
                  service="peregrine", priority=1)
    lease_a = VRAMLease.create(**kwargs)
    lease_b = VRAMLease.create(**kwargs)
    assert lease_a.lease_id != lease_b.lease_id
|
|
||||||
|
|
||||||
|
|
||||||
def test_vram_lease_create_with_ttl_sets_expiry():
    """A ttl_s of 60 puts expires_at 60 seconds after creation time."""
    ttl = 60.0
    before = time.time()
    lease = VRAMLease.create(gpu_id=0, node_id="heimdall", mb=2048,
                             service="kiwi", priority=2, ttl_s=ttl)
    after = time.time()
    assert before + ttl <= lease.expires_at
    assert lease.expires_at <= after + ttl
|
|
||||||
|
|
||||||
|
|
||||||
def test_vram_lease_create_no_ttl_has_zero_expiry():
    """Without a ttl_s, expires_at stays at the 0.0 sentinel (no expiry)."""
    lease = VRAMLease.create(gpu_id=0, node_id="heimdall", mb=1024,
                             service="snipe", priority=2)
    assert lease.expires_at == 0.0
|
|
||||||
|
|
||||||
|
|
||||||
def test_vram_lease_is_immutable():
    """VRAMLease fields cannot be reassigned after creation."""
    frozen = VRAMLease.create(gpu_id=0, node_id="heimdall", mb=1024,
                              service="snipe", priority=2)
    with pytest.raises((AttributeError, TypeError)):
        frozen.mb_granted = 999  # type: ignore
|
|
||||||
|
|
||||||
|
|
||||||
def test_gpu_info_fields():
    """GpuInfo stores the free-VRAM figure as given."""
    info = GpuInfo(gpu_id=0, name="RTX 4000", vram_total_mb=8192,
                   vram_used_mb=2048, vram_free_mb=6144)
    assert info.vram_free_mb == 8192 - 2048
|
|
||||||
|
|
||||||
|
|
||||||
def test_node_info_fields():
    """NodeInfo carries its id and list of attached GPUs."""
    only_gpu = GpuInfo(gpu_id=0, name="RTX 4000", vram_total_mb=8192,
                       vram_used_mb=0, vram_free_mb=8192)
    node = NodeInfo(node_id="heimdall", agent_url="http://localhost:7701",
                    gpus=[only_gpu], last_heartbeat=time.time())
    assert node.node_id == "heimdall"
    assert len(node.gpus) == 1
|
|
||||||
|
|
@ -1,82 +0,0 @@
|
||||||
import pytest
|
|
||||||
from circuitforge_core.resources.coordinator.node_selector import select_node
|
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
|
||||||
from circuitforge_core.resources.models import GpuInfo
|
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
|
||||||
|
|
||||||
|
|
||||||
def _make_agent(node_id: str, free_mb: int, online: bool = True) -> AgentRecord:
    """Build an AgentRecord with one 8192 MB GPU exposing `free_mb` of free VRAM."""
    record = AgentRecord(node_id=node_id, agent_url=f"http://{node_id}:7701")
    total = 8192
    record.gpus = [GpuInfo(gpu_id=0, name="RTX", vram_total_mb=total,
                           vram_used_mb=total - free_mb, vram_free_mb=free_mb)]
    record.online = online
    return record
|
|
||||||
|
|
||||||
|
|
||||||
def test_selects_node_with_most_free_vram():
    """With no warm (resident) nodes, the node with more free VRAM wins."""
    agents = {name: _make_agent(name, free_mb=mb)
              for name, mb in (("a", 2000), ("b", 6000))}
    chosen = select_node(agents, "vllm", ProfileRegistry(), resident_keys=set())
    assert chosen == ("b", 0)
|
|
||||||
|
|
||||||
|
|
||||||
def test_prefers_warm_node_even_with_less_free_vram():
    """A node already hosting the service (resident key) beats a colder, roomier one."""
    agents = {
        "a": _make_agent("a", free_mb=2000),
        "b": _make_agent("b", free_mb=6000),
    }
    chosen = select_node(agents, "vllm", ProfileRegistry(), resident_keys={"a:vllm"})
    assert chosen == ("a", 0)
|
|
||||||
|
|
||||||
|
|
||||||
def test_excludes_offline_nodes():
    """Offline nodes are never selected, even with more free VRAM."""
    agents = {
        "a": _make_agent("a", free_mb=8000, online=False),
        "b": _make_agent("b", free_mb=2000, online=True),
    }
    chosen = select_node(agents, "vllm", ProfileRegistry(), resident_keys=set())
    assert chosen == ("b", 0)
|
|
||||||
|
|
||||||
|
|
||||||
def test_returns_none_when_no_node_has_profile_for_service():
    """An unknown service name has no eligible node; selection returns None."""
    agents = {"a": _make_agent("a", free_mb=8000)}
    chosen = select_node(agents, "cf-nonexistent-service", ProfileRegistry(),
                         resident_keys=set())
    assert chosen is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_returns_none_when_no_agents():
    """An empty agent map yields no selection."""
    assert select_node({}, "vllm", ProfileRegistry(), resident_keys=set()) is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_prefers_node_that_fully_fits_service_over_one_that_does_not():
    """can_fit requires free_mb >= service max_mb (full ceiling, not half).

    9500 MB guarantees above all profile ceilings (max is 9000); 1000 MB is below all.
    """
    agents = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=9500),
    }
    chosen = select_node(agents, "vllm", ProfileRegistry(), resident_keys=set())
    # "b" is the only node in the preferred (can_fit) pool.
    assert chosen == ("b", 0)
|
|
||||||
|
|
||||||
|
|
||||||
def test_falls_back_to_best_effort_when_no_node_fully_fits():
    """When nothing can_fit, select_node returns the best-VRAM node as fallback."""
    agents = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=2000),
    }
    # Neither node fully fits the service; fallback picks highest effective_free_mb.
    chosen = select_node(agents, "vllm", ProfileRegistry(), resident_keys=set())
    assert chosen == ("b", 0)
|
|
||||||
|
|
@ -1,87 +0,0 @@
|
||||||
# tests/test_resources/test_node_store.py
|
|
||||||
"""Unit tests for NodeStore — SQLite persistence layer for known agent nodes."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from circuitforge_core.resources.coordinator.node_store import NodeStore
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def store(tmp_path: Path) -> NodeStore:
    """Fresh NodeStore backed by a throwaway SQLite file."""
    return NodeStore(db_path=tmp_path / "test-nodes.db")
|
|
||||||
|
|
||||||
|
|
||||||
def test_upsert_and_all(store: NodeStore) -> None:
    """A single upsert shows up verbatim in all()."""
    store.upsert("heimdall", "http://127.0.0.1:7701")
    assert store.all() == [("heimdall", "http://127.0.0.1:7701")]
|
|
||||||
|
|
||||||
|
|
||||||
def test_upsert_updates_url(store: NodeStore) -> None:
    """Upserting the same node_id twice keeps one row with the newest URL."""
    store.upsert("navi", "http://10.1.10.10:7701")
    store.upsert("navi", "http://10.1.10.10:7702")
    rows = store.all()
    assert len(rows) == 1
    assert rows[0][1] == "http://10.1.10.10:7702"
|
|
||||||
|
|
||||||
|
|
||||||
def test_multiple_nodes(store: NodeStore) -> None:
    """Distinct node_ids each get their own row."""
    entries = (
        ("heimdall", "http://127.0.0.1:7701"),
        ("navi", "http://10.1.10.10:7701"),
        ("strahl", "http://10.1.10.20:7701"),
    )
    for node_id, url in entries:
        store.upsert(node_id, url)
    assert len(store.all()) == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_remove(store: NodeStore) -> None:
    """remove() deletes exactly the named node."""
    store.upsert("heimdall", "http://127.0.0.1:7701")
    store.upsert("navi", "http://10.1.10.10:7701")
    store.remove("navi")
    remaining = [row[0] for row in store.all()]
    assert "navi" not in remaining
    assert "heimdall" in remaining
|
|
||||||
|
|
||||||
|
|
||||||
def test_prune_stale_removes_old_entries(store: NodeStore) -> None:
    """Nodes last seen beyond max_age_days are pruned; fresh ones survive."""
    # Insert a node whose last_seen is ~40 days ago, bypassing upsert()
    # (which would stamp the current time).
    forty_days_ago = time.time() - 40 * 86400
    store._conn.execute(
        "INSERT INTO known_nodes (node_id, agent_url, last_seen) VALUES (?, ?, ?)",
        ("ghost", "http://dead:7701", forty_days_ago),
    )
    store._conn.commit()
    store.upsert("live", "http://live:7701")

    assert store.prune_stale(max_age_days=30) == 1
    survivors = [row[0] for row in store.all()]
    assert "ghost" not in survivors
    assert "live" in survivors
|
|
||||||
|
|
||||||
|
|
||||||
def test_prune_stale_keeps_recent(store: NodeStore) -> None:
    """A just-upserted node is never pruned."""
    store.upsert("recent", "http://recent:7701")
    assert store.prune_stale(max_age_days=30) == 0
    assert len(store.all()) == 1
|
|
||||||
|
|
||||||
|
|
||||||
def test_all_empty(store: NodeStore) -> None:
    """A brand-new store reports no known nodes."""
    assert store.all() == []
|
|
||||||
|
|
||||||
|
|
||||||
def test_db_persists_across_instances(tmp_path: Path) -> None:
    """Data written by one NodeStore instance is visible to a new one on the same file."""
    db = tmp_path / "shared.db"

    writer = NodeStore(db_path=db)
    writer.upsert("navi", "http://10.1.10.10:7701")
    writer.close()

    reader = NodeStore(db_path=db)
    rows = reader.all()
    assert len(rows) == 1
    assert rows[0][0] == "navi"
    reader.close()
|
|
||||||
|
|
@ -1,176 +0,0 @@
|
||||||
# tests/test_resources/test_ollama_adopt.py
|
|
||||||
"""
|
|
||||||
Tests for the Ollama adopt-if-running path:
|
|
||||||
- ProcessSpec: adopt and health_path fields parsed from YAML
|
|
||||||
- ServiceManager.start(): adopt path claims running service; falls through if not running
|
|
||||||
- ServiceManager.is_running(): adopt path uses health probe, not proc table
|
|
||||||
- ServiceInstance.health_path persists through upsert_instance
|
|
||||||
- Probe loop uses inst.health_path instead of hardcoded /health
|
|
||||||
"""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
|
||||||
from circuitforge_core.resources.profiles.schema import GpuProfile, ProcessSpec, ServiceProfile, load_profile
|
|
||||||
|
|
||||||
|
|
||||||
# ── ProcessSpec schema ────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def test_process_spec_defaults():
    """adopt defaults to False and health_path to '/health'."""
    default_spec = ProcessSpec(exec_path="/usr/local/bin/ollama")
    assert default_spec.adopt is False
    assert default_spec.health_path == "/health"
|
|
||||||
|
|
||||||
|
|
||||||
def test_process_spec_adopt_fields():
    """Explicit adopt/health_path values are preserved on the spec."""
    adopted = ProcessSpec(
        exec_path="/usr/local/bin/ollama",
        adopt=True,
        health_path="/api/tags",
        port=11434,
        host_port=11434,
    )
    assert adopted.adopt is True
    assert adopted.health_path == "/api/tags"
|
|
||||||
|
|
||||||
|
|
||||||
def test_profile_yaml_parses_adopt(tmp_path: Path):
    # Round-trip check: adopt, health_path and host_port survive YAML parsing.
    # NOTE(review): the YAML literal's indentation was lost in extraction and
    # is reconstructed here with conventional 2-space nesting — confirm.
    yaml_text = """\
schema_version: 1
name: test
services:
  ollama:
    max_mb: 4096
    priority: 1
    managed:
      type: process
      adopt: true
      exec_path: /usr/local/bin/ollama
      args_template: serve
      port: 11434
      host_port: 11434
      health_path: /api/tags
"""
    p = tmp_path / "profile.yaml"
    p.write_text(yaml_text)
    profile = load_profile(p)
    spec = profile.services["ollama"].managed
    assert isinstance(spec, ProcessSpec)
    assert spec.adopt is True
    assert spec.health_path == "/api/tags"
    assert spec.host_port == 11434
|
|
||||||
|
|
||||||
|
|
||||||
# ── ServiceManager adopt path ─────────────────────────────────────────────────
|
|
||||||
|
|
||||||
def _make_manager_with_ollama(advertise_host: str = "127.0.0.1") -> ServiceManager:
    """ServiceManager for node 'heimdall' whose profile defines an adoptable ollama."""
    ollama_spec = ProcessSpec(
        exec_path="/usr/local/bin/ollama",
        args_template="serve",
        port=11434,
        host_port=11434,
        adopt=True,
        health_path="/api/tags",
    )
    profile = GpuProfile(
        schema_version=1,
        name="test",
        services={
            "ollama": ServiceProfile(max_mb=4096, priority=1, managed=ollama_spec),
        },
    )
    return ServiceManager(node_id="heimdall", profile=profile,
                          advertise_host=advertise_host)
|
|
||||||
|
|
||||||
|
|
||||||
def test_start_adopt_claims_running_service():
    """When Ollama is already healthy, start() returns its URL without spawning a process."""
    mgr = _make_manager_with_ollama()
    with patch.object(mgr, "_probe_health", return_value=True) as probe:
        assert mgr.start("ollama", gpu_id=0, params={}) == "http://127.0.0.1:11434"
    probe.assert_called_once_with(11434, "/api/tags")
    assert "ollama" not in mgr._procs  # adopted, not spawned
|
|
||||||
|
|
||||||
|
|
||||||
def test_start_adopt_spawns_when_not_running():
    """When Ollama is not yet running, start() spawns it normally."""
    mgr = _make_manager_with_ollama()
    fake_proc = MagicMock()
    fake_proc.poll.return_value = None  # still alive

    with patch.object(mgr, "_probe_health", return_value=False), \
         patch("subprocess.Popen", return_value=fake_proc) as popen:
        started_url = mgr.start("ollama", gpu_id=0, params={})

    assert started_url == "http://127.0.0.1:11434"
    popen.assert_called_once()
    assert "ollama" in mgr._procs
|
|
||||||
|
|
||||||
|
|
||||||
def test_is_running_adopt_uses_health_probe():
    """is_running() for adopt=True services checks the health endpoint, not the proc table."""
    mgr = _make_manager_with_ollama()
    for healthy in (True, False):
        with patch.object(mgr, "_probe_health", return_value=healthy):
            assert mgr.is_running("ollama") is healthy
|
|
||||||
|
|
||||||
|
|
||||||
def test_probe_health_returns_true_on_200():
    """An HTTP 200 from the health endpoint counts as healthy."""
    mgr = _make_manager_with_ollama()
    response = MagicMock()
    response.status = 200
    # urlopen is used as a context manager; make the mock usable in `with`.
    response.__enter__ = lambda s: response
    response.__exit__ = MagicMock(return_value=False)

    with patch("urllib.request.urlopen", return_value=response):
        assert mgr._probe_health(11434, "/api/tags") is True
|
|
||||||
|
|
||||||
|
|
||||||
def test_probe_health_returns_false_on_connection_error():
    """A connection failure maps to 'not healthy' instead of raising."""
    mgr = _make_manager_with_ollama()
    with patch("urllib.request.urlopen", side_effect=OSError("refused")):
        assert mgr._probe_health(11434, "/api/tags") is False
|
|
||||||
|
|
||||||
|
|
||||||
# ── ServiceRegistry health_path ───────────────────────────────────────────────
|
|
||||||
|
|
||||||
def test_upsert_instance_stores_health_path():
    """An explicit health_path is persisted on the ServiceInstance."""
    registry = ServiceRegistry()
    instance = registry.upsert_instance(
        service="ollama", node_id="heimdall", gpu_id=0,
        state="running", model=None, url="http://127.0.0.1:11434",
        health_path="/api/tags",
    )
    assert instance.health_path == "/api/tags"
|
|
||||||
|
|
||||||
|
|
||||||
def test_upsert_instance_default_health_path():
    """Omitting health_path falls back to the '/health' default."""
    registry = ServiceRegistry()
    instance = registry.upsert_instance(
        service="vllm", node_id="heimdall", gpu_id=0,
        state="starting", model="qwen", url="http://127.0.0.1:8000",
    )
    assert instance.health_path == "/health"
|
|
||||||
|
|
||||||
|
|
||||||
def test_all_gpu_profiles_have_ollama_managed_block():
    """Sanity check: all public GPU profiles now have a managed block for ollama."""
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry

    registry = ProfileRegistry()
    for profile in registry.list_public():
        svc = profile.services.get("ollama")
        if svc is None:
            continue  # profile may not define ollama
        name = profile.name
        assert svc.managed is not None, f"{name}: ollama missing managed block"
        assert isinstance(svc.managed, ProcessSpec)
        assert svc.managed.adopt is True, f"{name}: ollama adopt should be True"
        assert svc.managed.health_path == "/api/tags", f"{name}: wrong health_path"
|
|
||||||
|
|
@ -1,101 +0,0 @@
|
||||||
# tests/test_resources/test_profile_registry.py
|
|
||||||
import pytest
|
|
||||||
from unittest.mock import MagicMock
|
|
||||||
|
|
||||||
from circuitforge_core.resources.profiles.schema import (
|
|
||||||
GpuProfile, ServiceProfile, load_profile
|
|
||||||
)
|
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
|
||||||
|
|
||||||
|
|
||||||
def test_load_8gb_profile(tmp_path):
    # Full happy-path parse of a profile with two services, one of them shared.
    # NOTE(review): the YAML literal's indentation was lost in extraction and
    # is reconstructed here with conventional 2-space nesting — confirm.
    yaml_content = """
schema_version: 1
name: single-gpu-8gb
vram_total_mb: 8192
eviction_timeout_s: 10.0
services:
  vllm:
    max_mb: 5120
    priority: 1
  cf-vision:
    max_mb: 2048
    priority: 2
    shared: true
    max_concurrent: 3
"""
    profile_file = tmp_path / "test.yaml"
    profile_file.write_text(yaml_content)
    profile = load_profile(profile_file)

    assert profile.name == "single-gpu-8gb"
    assert profile.schema_version == 1
    assert profile.vram_total_mb == 8192
    assert profile.eviction_timeout_s == 10.0
    assert "vllm" in profile.services
    assert profile.services["vllm"].max_mb == 5120
    assert profile.services["vllm"].priority == 1
    assert profile.services["cf-vision"].shared is True
    assert profile.services["cf-vision"].max_concurrent == 3
|
|
||||||
|
|
||||||
|
|
||||||
def test_load_profile_rejects_wrong_schema_version(tmp_path):
    """Unknown schema versions are rejected with a ValueError naming the field."""
    profile_file = tmp_path / "future.yaml"
    profile_file.write_text("schema_version: 99\nname: future\n")
    with pytest.raises(ValueError, match="schema_version"):
        load_profile(profile_file)
|
|
||||||
|
|
||||||
|
|
||||||
def test_service_profile_defaults():
    """ServiceProfile optional fields have sane defaults."""
    service = ServiceProfile(max_mb=1024, priority=2)
    assert service.shared is False
    assert service.max_concurrent == 1
    assert service.always_on is False
    assert service.backend is None
    assert service.consumers == []
|
|
||||||
|
|
||||||
|
|
||||||
def test_profile_registry_loads_public_profiles():
    """The registry ships with the standard public single-GPU profiles."""
    names = {p.name for p in ProfileRegistry().list_public()}
    assert {"single-gpu-8gb", "single-gpu-6gb", "single-gpu-2gb"} <= names
|
|
||||||
|
|
||||||
|
|
||||||
def test_profile_registry_auto_detect_selects_8gb():
    """A single GPU with 8 GiB of VRAM maps to the single-gpu-8gb profile."""
    gpus = [MagicMock(vram_total_mb=8192)]
    selected = ProfileRegistry().auto_detect(gpus)
    assert selected.name == "single-gpu-8gb"
|
|
||||||
|
|
||||||
|
|
||||||
def test_profile_registry_auto_detect_selects_6gb():
    """A single GPU with 6 GiB of VRAM maps to the single-gpu-6gb profile."""
    gpus = [MagicMock(vram_total_mb=6144)]
    selected = ProfileRegistry().auto_detect(gpus)
    assert selected.name == "single-gpu-6gb"
|
|
||||||
|
|
||||||
|
|
||||||
def test_profile_registry_auto_detect_selects_2gb():
    """A single GPU with 2 GiB of VRAM maps to the single-gpu-2gb profile."""
    gpus = [MagicMock(vram_total_mb=2048)]
    selected = ProfileRegistry().auto_detect(gpus)
    assert selected.name == "single-gpu-2gb"
|
|
||||||
|
|
||||||
|
|
||||||
def test_profile_registry_load_from_path(tmp_path):
    """Loading an arbitrary YAML path returns the profile it describes."""
    source = tmp_path / "custom.yaml"
    source.write_text(
        "schema_version: 1\nname: custom\n"
        "vram_total_mb: 12288\neviction_timeout_s: 5.0\n"
    )
    loaded = ProfileRegistry().load(source)
    assert loaded.name == "custom"
    assert loaded.vram_total_mb == 12288
|
|
||||||
|
|
@ -1,194 +0,0 @@
|
||||||
"""Tests for ServiceManager ProcessSpec support."""
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from unittest.mock import MagicMock, patch
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
|
||||||
from circuitforge_core.resources.profiles.schema import (
|
|
||||||
GpuProfile,
|
|
||||||
ProcessSpec,
|
|
||||||
ServiceProfile,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _make_profile(args_template: str = "--port {port} --gpu-id {gpu_id}") -> GpuProfile:
    """Build a two-service test profile: a managed vllm plus an unmanaged stub."""
    managed_spec = ProcessSpec(
        exec_path="/usr/bin/python",
        args_template=args_template,
        port=8000,
        host_port=8000,
        cwd="/tmp",
    )
    return GpuProfile(
        schema_version=1,
        name="test",
        vram_total_mb=8192,
        services={
            "vllm": ServiceProfile(max_mb=5120, priority=1, managed=managed_spec),
            "no_managed": ServiceProfile(max_mb=1024, priority=2),
        },
    )
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def manager():
    """A ServiceManager wired to the synthetic test profile on localhost."""
    return ServiceManager(
        node_id="test-node",
        profile=_make_profile(),
        advertise_host="127.0.0.1",
    )
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# is_running
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def test_is_running_returns_false_when_no_proc(manager):
    """With no tracked child process, a managed service reports not running."""
    assert manager.is_running("vllm") is False
|
|
||||||
|
|
||||||
|
|
||||||
def test_is_running_returns_false_when_proc_exited(manager):
    """A process whose poll() yields an exit code counts as stopped."""
    dead = MagicMock()
    dead.poll.return_value = 1  # non-None poll() => child has exited
    manager._procs["vllm"] = dead
    assert manager.is_running("vllm") is False
|
|
||||||
|
|
||||||
|
|
||||||
def test_is_running_returns_false_when_port_not_listening(manager):
    """A live child whose port refuses connections is not considered running."""
    alive = MagicMock()
    alive.poll.return_value = None  # poll() None => child still alive
    manager._procs["vllm"] = alive
    with patch("socket.create_connection", side_effect=OSError("refused")):
        assert manager.is_running("vllm") is False
|
|
||||||
|
|
||||||
|
|
||||||
def test_is_running_returns_true_when_proc_alive_and_port_open(manager):
    """Running means both: child alive AND its port accepts a TCP connection."""
    alive = MagicMock()
    alive.poll.return_value = None  # child still alive
    manager._procs["vllm"] = alive

    # Fake socket usable as a context manager, as create_connection returns one.
    fake_conn = MagicMock()
    fake_conn.__enter__ = MagicMock(return_value=fake_conn)
    fake_conn.__exit__ = MagicMock(return_value=False)
    with patch("socket.create_connection", return_value=fake_conn):
        assert manager.is_running("vllm") is True
|
|
||||||
|
|
||||||
|
|
||||||
def test_is_running_unknown_service_returns_false(manager):
    """Service names absent from the profile are simply not running."""
    assert manager.is_running("nonexistent") is False
|
|
||||||
|
|
||||||
|
|
||||||
def test_is_running_no_managed_spec_returns_false(manager):
    """A profile entry without a ProcessSpec can never report as running."""
    assert manager.is_running("no_managed") is False
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# start
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def test_start_launches_process_and_returns_url(manager):
    """start() spawns the templated command and reports the advertise URL."""
    with patch("subprocess.Popen") as popen, \
            patch.object(manager, "is_running", return_value=False):
        popen.return_value = MagicMock()
        url = manager.start("vllm", gpu_id=0, params={"model": "mymodel"})

    assert url == "http://127.0.0.1:8000"
    popen.assert_called_once()
    # The argv comes from the profile's args_template with values substituted.
    argv = popen.call_args[0][0]
    assert argv[0] == "/usr/bin/python"
    for token in ("--port", "8000", "--gpu-id", "0"):
        assert token in argv
|
|
||||||
|
|
||||||
|
|
||||||
def test_start_returns_url_immediately_when_already_running(manager):
    """An already-running service is not respawned; its URL is returned as-is."""
    with patch.object(manager, "is_running", return_value=True), \
            patch("subprocess.Popen") as popen:
        url = manager.start("vllm", gpu_id=0, params={})

    assert url == "http://127.0.0.1:8000"
    popen.assert_not_called()
|
|
||||||
|
|
||||||
|
|
||||||
def test_start_raises_for_unknown_service(manager):
    """Starting a service missing from the profile is a hard error."""
    with pytest.raises(ValueError, match="not in profile"):
        manager.start("nonexistent", gpu_id=0, params={})
|
|
||||||
|
|
||||||
|
|
||||||
def test_start_stores_proc_in_procs(manager):
    """The spawned Popen handle is tracked under the service name."""
    handle = MagicMock()
    with patch("subprocess.Popen", return_value=handle), \
            patch.object(manager, "is_running", return_value=False):
        manager.start("vllm", gpu_id=0, params={})

    assert manager._procs["vllm"] is handle
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# stop
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def test_stop_terminates_running_process(manager):
    """stop() terminates, waits on, and forgets a tracked process."""
    child = MagicMock()
    manager._procs["vllm"] = child

    assert manager.stop("vllm") is True
    child.terminate.assert_called_once()
    child.wait.assert_called_once()
    assert "vllm" not in manager._procs
|
|
||||||
|
|
||||||
|
|
||||||
def test_stop_kills_process_that_wont_terminate(manager):
    """If wait() fails after terminate(), stop() escalates to kill()."""
    stubborn = MagicMock()
    stubborn.wait.side_effect = Exception("timeout")
    manager._procs["vllm"] = stubborn

    assert manager.stop("vllm") is True
    stubborn.kill.assert_called_once()
|
|
||||||
|
|
||||||
|
|
||||||
def test_stop_returns_true_when_no_proc_tracked(manager):
    """Stopping a known service with nothing tracked is an idempotent no-op."""
    assert manager.stop("vllm") is True
|
|
||||||
|
|
||||||
|
|
||||||
def test_stop_returns_false_for_unknown_service(manager):
    """Service names absent from the profile cannot be stopped."""
    assert manager.stop("nonexistent") is False
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# list_running / get_url
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def test_list_running_returns_running_services(manager):
    """Only services whose is_running() check passes are listed."""
    with patch.object(manager, "is_running", side_effect=lambda svc: svc == "vllm"):
        assert manager.list_running() == ["vllm"]
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_url_returns_none_when_not_running(manager):
    """get_url() yields None while the service is down."""
    with patch.object(manager, "is_running", return_value=False):
        assert manager.get_url("vllm") is None
|
|
||||||
|
|
||||||
|
|
||||||
def test_get_url_returns_url_when_running(manager):
    """get_url() yields the advertise URL once the service is up."""
    with patch.object(manager, "is_running", return_value=True):
        assert manager.get_url("vllm") == "http://127.0.0.1:8000"
|
|
||||||
|
|
@ -1,86 +0,0 @@
|
||||||
import time
|
|
||||||
import dataclasses
|
|
||||||
import pytest
|
|
||||||
from circuitforge_core.resources.coordinator.service_registry import (
|
|
||||||
ServiceRegistry, ServiceAllocation, ServiceInstance,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def registry():
    """A fresh, empty ServiceRegistry for each test."""
    return ServiceRegistry()
|
|
||||||
|
|
||||||
|
|
||||||
def test_allocate_creates_allocation(registry):
    """allocate() returns a populated allocation carrying a generated id."""
    lease = registry.allocate(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        model="Ouro-1.4B",
        url="http://heimdall:8000",
        caller="test",
        ttl_s=300.0,
    )
    assert lease.service == "vllm"
    assert lease.node_id == "heimdall"
    assert lease.allocation_id  # non-empty UUID string
|
|
||||||
|
|
||||||
|
|
||||||
def test_active_allocations_count(registry):
    """Two live leases on the same instance are both counted."""
    for caller in ("a", "b"):
        registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", caller, 300.0)
    assert registry.active_allocations("vllm", "heimdall", 0) == 2
|
|
||||||
|
|
||||||
|
|
||||||
def test_release_decrements_count(registry):
    """Releasing the only lease drops the active count to zero."""
    lease = registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "a", 300.0)
    registry.release(lease.allocation_id)
    assert registry.active_allocations("vllm", "heimdall", 0) == 0
|
|
||||||
|
|
||||||
|
|
||||||
def test_release_nonexistent_returns_false(registry):
    """Releasing an unknown allocation id reports failure rather than raising."""
    assert registry.release("nonexistent-id") is False
|
|
||||||
|
|
||||||
|
|
||||||
def test_upsert_instance_sets_running_state(registry):
    """upsert_instance() registers exactly one instance in the given state."""
    registry.upsert_instance(
        "vllm", "heimdall", 0,
        state="running", model="Ouro-1.4B", url="http://heimdall:8000",
    )
    (instance,) = registry.all_instances()  # also asserts exactly one instance
    assert instance.state == "running"
|
|
||||||
|
|
||||||
|
|
||||||
def test_release_last_alloc_marks_instance_idle(registry):
    """Dropping the final lease transitions the instance to idle."""
    registry.upsert_instance(
        "vllm", "heimdall", 0,
        state="running", model="Ouro-1.4B", url="http://heimdall:8000",
    )
    lease = registry.allocate(
        "vllm", "heimdall", 0, "Ouro-1.4B", "http://heimdall:8000", "a", 300.0
    )
    registry.release(lease.allocation_id)

    instance = registry.all_instances()[0]
    assert instance.state == "idle"
    assert instance.idle_since is not None
|
|
||||||
|
|
||||||
|
|
||||||
def test_new_alloc_on_idle_instance_marks_it_running(registry):
    """Allocating against an idle instance flips it back to running."""
    registry.upsert_instance(
        "vllm", "heimdall", 0,
        state="idle", model="M", url="http://h:8000",
    )
    registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "x", 300.0)
    assert registry.all_instances()[0].state == "running"
|
|
||||||
|
|
||||||
|
|
||||||
def test_sweep_expired_allocations(registry):
    """An allocation past its TTL is swept and its instance transitions to idle.

    Uses a sub-second TTL so the test does not block for over a second;
    the previous version (ttl_s=1 + sleep(1.1)) dominated suite runtime.
    ttl_s accepts floats — see the 300.0 values used elsewhere in this file.
    """
    # Register a running instance so the idle-transition logic has something
    # to act on when the last allocation disappears.
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="M", url="http://h:8000")
    # Create an allocation with a very short TTL.
    alloc = registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000",
                              "caller", ttl_s=0.05)
    assert registry.active_allocations("vllm", "heimdall", 0) == 1

    # Wait (briefly) for the TTL to elapse.
    time.sleep(0.1)

    expired = registry.sweep_expired_allocations()

    # The allocation should have been swept.
    assert alloc.allocation_id in expired
    assert registry.active_allocations("vllm", "heimdall", 0) == 0

    # The instance should have transitioned to idle since no allocations remain.
    instance = registry.all_instances()[0]
    assert instance.state == "idle"
    assert instance.idle_since is not None
|
|
||||||
Loading…
Reference in a new issue