feat(orch): add ServiceRegistry — allocation tracking + idle state machine
This commit is contained in:
parent
17a24173f7
commit
9754f522d9
2 changed files with 203 additions and 0 deletions
140
circuitforge_core/resources/coordinator/service_registry.py
Normal file
140
circuitforge_core/resources/coordinator/service_registry.py
Normal file
|
|
@ -0,0 +1,140 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
|
||||
@dataclass
|
||||
class ServiceAllocation:
|
||||
allocation_id: str
|
||||
service: str
|
||||
node_id: str
|
||||
gpu_id: int
|
||||
model: str | None
|
||||
caller: str
|
||||
url: str
|
||||
created_at: float
|
||||
expires_at: float # 0 = no expiry
|
||||
|
||||
|
||||
@dataclass
|
||||
class ServiceInstance:
|
||||
service: str
|
||||
node_id: str
|
||||
gpu_id: int
|
||||
state: Literal["starting", "running", "idle", "stopped"]
|
||||
model: str | None
|
||||
url: str | None
|
||||
idle_since: float | None = None
|
||||
|
||||
|
||||
class ServiceRegistry:
|
||||
"""
|
||||
In-memory registry of service allocations and instance state.
|
||||
|
||||
Allocations: per-caller request — many per service instance.
|
||||
Instances: per (service, node_id, gpu_id) — one per running container.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._allocations: dict[str, ServiceAllocation] = {}
|
||||
self._instances: dict[str, ServiceInstance] = {} # key: "service:node_id:gpu_id"
|
||||
|
||||
# ── allocation API ────────────────────────────────────────────────
|
||||
|
||||
def allocate(
|
||||
self,
|
||||
service: str,
|
||||
node_id: str,
|
||||
gpu_id: int,
|
||||
model: str | None,
|
||||
url: str,
|
||||
caller: str,
|
||||
ttl_s: float,
|
||||
) -> ServiceAllocation:
|
||||
alloc = ServiceAllocation(
|
||||
allocation_id=str(uuid.uuid4()),
|
||||
service=service,
|
||||
node_id=node_id,
|
||||
gpu_id=gpu_id,
|
||||
model=model,
|
||||
caller=caller,
|
||||
url=url,
|
||||
created_at=time.time(),
|
||||
expires_at=time.time() + ttl_s if ttl_s > 0 else 0.0,
|
||||
)
|
||||
self._allocations[alloc.allocation_id] = alloc
|
||||
|
||||
# If an instance exists in idle/stopped state, mark it running again
|
||||
key = f"{service}:{node_id}:{gpu_id}"
|
||||
if key in self._instances:
|
||||
inst = self._instances[key]
|
||||
if inst.state in ("idle", "stopped"):
|
||||
self._instances[key] = dataclasses.replace(
|
||||
inst, state="running", idle_since=None
|
||||
)
|
||||
return alloc
|
||||
|
||||
def release(self, allocation_id: str) -> bool:
|
||||
alloc = self._allocations.pop(allocation_id, None)
|
||||
if alloc is None:
|
||||
return False
|
||||
# If no active allocations remain for this instance, mark it idle
|
||||
key = f"{alloc.service}:{alloc.node_id}:{alloc.gpu_id}"
|
||||
if self.active_allocations(alloc.service, alloc.node_id) == 0:
|
||||
if key in self._instances:
|
||||
self._instances[key] = dataclasses.replace(
|
||||
self._instances[key], state="idle", idle_since=time.time()
|
||||
)
|
||||
return True
|
||||
|
||||
def active_allocations(self, service: str, node_id: str) -> int:
|
||||
return sum(
|
||||
1 for a in self._allocations.values()
|
||||
if a.service == service and a.node_id == node_id
|
||||
)
|
||||
|
||||
# ── instance API ─────────────────────────────────────────────────
|
||||
|
||||
def upsert_instance(
|
||||
self,
|
||||
service: str,
|
||||
node_id: str,
|
||||
gpu_id: int,
|
||||
state: Literal["starting", "running", "idle", "stopped"],
|
||||
model: str | None,
|
||||
url: str | None,
|
||||
) -> ServiceInstance:
|
||||
key = f"{service}:{node_id}:{gpu_id}"
|
||||
existing = self._instances.get(key)
|
||||
idle_since: float | None = None
|
||||
if state == "idle":
|
||||
# Preserve idle_since if already idle; set now if transitioning into idle
|
||||
idle_since = existing.idle_since if (existing and existing.state == "idle") else time.time()
|
||||
inst = ServiceInstance(
|
||||
service=service, node_id=node_id, gpu_id=gpu_id,
|
||||
state=state, model=model, url=url, idle_since=idle_since,
|
||||
)
|
||||
self._instances[key] = inst
|
||||
return inst
|
||||
|
||||
def all_instances(self) -> list[ServiceInstance]:
|
||||
return list(self._instances.values())
|
||||
|
||||
def idle_past_timeout(self, idle_stop_config: dict[str, int]) -> list[ServiceInstance]:
|
||||
"""
|
||||
Return instances in 'idle' state whose idle time exceeds their configured timeout.
|
||||
idle_stop_config: {service_name: seconds} — 0 means never stop automatically.
|
||||
"""
|
||||
now = time.time()
|
||||
result = []
|
||||
for inst in self._instances.values():
|
||||
if inst.state != "idle" or inst.idle_since is None:
|
||||
continue
|
||||
timeout = idle_stop_config.get(inst.service, 0)
|
||||
if timeout > 0 and (now - inst.idle_since) >= timeout:
|
||||
result.append(inst)
|
||||
return result
|
||||
63
tests/test_resources/test_service_registry.py
Normal file
63
tests/test_resources/test_service_registry.py
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
import time
|
||||
import dataclasses
|
||||
import pytest
|
||||
from circuitforge_core.resources.coordinator.service_registry import (
|
||||
ServiceRegistry, ServiceAllocation, ServiceInstance,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def registry():
|
||||
return ServiceRegistry()
|
||||
|
||||
|
||||
def test_allocate_creates_allocation(registry):
|
||||
alloc = registry.allocate(
|
||||
service="vllm", node_id="heimdall", gpu_id=0,
|
||||
model="Ouro-1.4B", url="http://heimdall:8000",
|
||||
caller="test", ttl_s=300.0,
|
||||
)
|
||||
assert alloc.service == "vllm"
|
||||
assert alloc.node_id == "heimdall"
|
||||
assert alloc.allocation_id # non-empty UUID string
|
||||
|
||||
|
||||
def test_active_allocations_count(registry):
|
||||
registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "a", 300.0)
|
||||
registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "b", 300.0)
|
||||
assert registry.active_allocations("vllm", "heimdall") == 2
|
||||
|
||||
|
||||
def test_release_decrements_count(registry):
|
||||
alloc = registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "a", 300.0)
|
||||
registry.release(alloc.allocation_id)
|
||||
assert registry.active_allocations("vllm", "heimdall") == 0
|
||||
|
||||
|
||||
def test_release_nonexistent_returns_false(registry):
|
||||
assert registry.release("nonexistent-id") is False
|
||||
|
||||
|
||||
def test_upsert_instance_sets_running_state(registry):
|
||||
registry.upsert_instance("vllm", "heimdall", 0, state="running",
|
||||
model="Ouro-1.4B", url="http://heimdall:8000")
|
||||
instances = registry.all_instances()
|
||||
assert len(instances) == 1
|
||||
assert instances[0].state == "running"
|
||||
|
||||
|
||||
def test_release_last_alloc_marks_instance_idle(registry):
|
||||
registry.upsert_instance("vllm", "heimdall", 0, state="running",
|
||||
model="Ouro-1.4B", url="http://heimdall:8000")
|
||||
alloc = registry.allocate("vllm", "heimdall", 0, "Ouro-1.4B", "http://heimdall:8000", "a", 300.0)
|
||||
registry.release(alloc.allocation_id)
|
||||
instance = registry.all_instances()[0]
|
||||
assert instance.state == "idle"
|
||||
assert instance.idle_since is not None
|
||||
|
||||
|
||||
def test_new_alloc_on_idle_instance_marks_it_running(registry):
|
||||
registry.upsert_instance("vllm", "heimdall", 0, state="idle",
|
||||
model="M", url="http://h:8000")
|
||||
registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "x", 300.0)
|
||||
assert registry.all_instances()[0].state == "running"
|
||||
Loading…
Reference in a new issue