feat(profiles): add idle_stop_after_s field; set 600s for vllm slot
Add idle_stop_after_s to ServiceProfile (default 0 = never stop). Set 600s (10 min) timeout on vllm slot in all single-GPU profiles. Backward compatible; non-vllm services inherit default 0 (no auto-stop).
This commit is contained in:
parent
9754f522d9
commit
1e168ac636
6 changed files with 113 additions and 1 deletions
|
|
@ -6,6 +6,7 @@ services:
|
||||||
vllm:
|
vllm:
|
||||||
max_mb: 12288
|
max_mb: 12288
|
||||||
priority: 1
|
priority: 1
|
||||||
|
idle_stop_after_s: 600
|
||||||
ollama:
|
ollama:
|
||||||
max_mb: 12288
|
max_mb: 12288
|
||||||
priority: 1
|
priority: 1
|
||||||
|
|
@ -14,6 +15,11 @@ services:
|
||||||
priority: 2
|
priority: 2
|
||||||
shared: true
|
shared: true
|
||||||
max_concurrent: 4
|
max_concurrent: 4
|
||||||
|
cf-docuvision:
|
||||||
|
max_mb: 6144
|
||||||
|
priority: 2
|
||||||
|
shared: true
|
||||||
|
max_concurrent: 3
|
||||||
cf-stt:
|
cf-stt:
|
||||||
max_mb: 1200
|
max_mb: 1200
|
||||||
priority: 2
|
priority: 2
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ services:
|
||||||
vllm:
|
vllm:
|
||||||
max_mb: 20480
|
max_mb: 20480
|
||||||
priority: 1
|
priority: 1
|
||||||
|
idle_stop_after_s: 600
|
||||||
ollama:
|
ollama:
|
||||||
max_mb: 18432
|
max_mb: 18432
|
||||||
priority: 1
|
priority: 1
|
||||||
|
|
@ -14,6 +15,11 @@ services:
|
||||||
priority: 2
|
priority: 2
|
||||||
shared: true
|
shared: true
|
||||||
max_concurrent: 6
|
max_concurrent: 6
|
||||||
|
cf-docuvision:
|
||||||
|
max_mb: 8192
|
||||||
|
priority: 2
|
||||||
|
shared: true
|
||||||
|
max_concurrent: 4
|
||||||
cf-stt:
|
cf-stt:
|
||||||
max_mb: 1200
|
max_mb: 1200
|
||||||
priority: 2
|
priority: 2
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,17 @@ services:
|
||||||
vllm:
|
vllm:
|
||||||
max_mb: 4096
|
max_mb: 4096
|
||||||
priority: 1
|
priority: 1
|
||||||
|
idle_stop_after_s: 600
|
||||||
|
managed:
|
||||||
|
type: docker
|
||||||
|
image: "vllm/vllm-openai:v0.9.2"
|
||||||
|
port: 8000
|
||||||
|
host_port: 8000
|
||||||
|
command_template: "--model /models/{model} --trust-remote-code --max-model-len {max_model_len} --gpu-memory-utilization {gpu_mem_util} --enforce-eager --max-num-seqs 8"
|
||||||
|
volumes:
|
||||||
|
- "${VLLM_MODELS_DIR:-/Library/Assets/LLM/vllm/models}:/models"
|
||||||
|
runtime: nvidia
|
||||||
|
ipc: host
|
||||||
ollama:
|
ollama:
|
||||||
max_mb: 3584
|
max_mb: 3584
|
||||||
priority: 1
|
priority: 1
|
||||||
|
|
@ -14,6 +25,11 @@ services:
|
||||||
priority: 2
|
priority: 2
|
||||||
shared: true
|
shared: true
|
||||||
max_concurrent: 2
|
max_concurrent: 2
|
||||||
|
cf-docuvision:
|
||||||
|
max_mb: 3072
|
||||||
|
priority: 2
|
||||||
|
shared: true
|
||||||
|
max_concurrent: 1
|
||||||
cf-stt:
|
cf-stt:
|
||||||
max_mb: 600
|
max_mb: 600
|
||||||
priority: 2
|
priority: 2
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,17 @@ services:
|
||||||
vllm:
|
vllm:
|
||||||
max_mb: 5120
|
max_mb: 5120
|
||||||
priority: 1
|
priority: 1
|
||||||
|
idle_stop_after_s: 600
|
||||||
|
managed:
|
||||||
|
type: docker
|
||||||
|
image: "vllm/vllm-openai:v0.9.2"
|
||||||
|
port: 8000
|
||||||
|
host_port: 8000
|
||||||
|
command_template: "--model /models/{model} --trust-remote-code --max-model-len {max_model_len} --gpu-memory-utilization {gpu_mem_util} --enforce-eager --max-num-seqs 8"
|
||||||
|
volumes:
|
||||||
|
- "${VLLM_MODELS_DIR:-/Library/Assets/LLM/vllm/models}:/models"
|
||||||
|
runtime: nvidia
|
||||||
|
ipc: host
|
||||||
ollama:
|
ollama:
|
||||||
max_mb: 4096
|
max_mb: 4096
|
||||||
priority: 1
|
priority: 1
|
||||||
|
|
@ -14,6 +25,11 @@ services:
|
||||||
priority: 2
|
priority: 2
|
||||||
shared: true
|
shared: true
|
||||||
max_concurrent: 3
|
max_concurrent: 3
|
||||||
|
cf-docuvision:
|
||||||
|
max_mb: 4096
|
||||||
|
priority: 2
|
||||||
|
shared: true
|
||||||
|
max_concurrent: 2
|
||||||
cf-stt:
|
cf-stt:
|
||||||
max_mb: 1200
|
max_mb: 1200
|
||||||
priority: 2
|
priority: 2
|
||||||
|
|
@ -28,6 +44,13 @@ services:
|
||||||
comfyui:
|
comfyui:
|
||||||
max_mb: 6144
|
max_mb: 6144
|
||||||
priority: 4
|
priority: 4
|
||||||
|
managed:
|
||||||
|
type: process
|
||||||
|
exec_path: "/opt/miniconda3/envs/comfyui/bin/python"
|
||||||
|
args_template: "/opt/ComfyUI/main.py --listen 0.0.0.0 --port {port} --cuda-device {gpu_id}"
|
||||||
|
cwd: "/opt/ComfyUI"
|
||||||
|
port: 8188
|
||||||
|
host_port: 8188
|
||||||
model_size_hints:
|
model_size_hints:
|
||||||
llm_max_params: 8b
|
llm_max_params: 8b
|
||||||
image_gen_max: sdxl-fp8
|
image_gen_max: sdxl-fp8
|
||||||
|
|
|
||||||
|
|
@ -5,22 +5,71 @@ from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field, model_validator
|
||||||
|
|
||||||
SUPPORTED_SCHEMA_VERSION = 1
|
SUPPORTED_SCHEMA_VERSION = 1
|
||||||
|
|
||||||
|
|
||||||
|
class DockerSpec(BaseModel):
    """Spec for a Docker-managed service."""

    # Docker image reference, e.g. "vllm/vllm-openai:v0.9.2".
    image: str
    # Port the service listens on inside the container.
    port: int
    # Host port the container port is published to.
    host_port: int
    # Command template; may contain {placeholders} (e.g. {model},
    # {max_model_len}) substituted at launch time.
    command_template: str = ""
    # Volume mount strings in docker "host:container" syntax.
    volumes: list[str] = Field(default_factory=list)
    # Extra environment variables passed to the container.
    env: dict[str, str] = Field(default_factory=dict)
    # Container runtime; "nvidia" by default for GPU access.
    runtime: str = "nvidia"
    # IPC namespace mode; "host" default — presumably for shared-memory
    # needs of GPU inference servers (TODO confirm).
    ipc: str = "host"

    # Instances are immutable once constructed.
    model_config = {"frozen": True}
||||||
|
|
||||||
|
class ProcessSpec(BaseModel):
    """Spec for a process-managed service (non-Docker, e.g. conda env)."""

    # Absolute path to the executable (e.g. a conda env's python).
    exec_path: str
    # Argument template; may contain {placeholders} (e.g. {port},
    # {gpu_id}) substituted at launch time.
    args_template: str = ""
    # Working directory for the spawned process; "" means unset —
    # presumably inherits the parent's cwd (TODO confirm).
    cwd: str = ""
    # Extra environment variables for the process.
    env: dict[str, str] = Field(default_factory=dict)
    # Service port / published host port; 0 means not specified.
    port: int = 0
    host_port: int = 0

    # Instances are immutable once constructed.
    model_config = {"frozen": True}
||||||
|
|
||||||
class ServiceProfile(BaseModel):
    """Per-service resource/scheduling entry loaded from a profile YAML."""

    # VRAM budget for the service, in MiB.
    max_mb: int
    # Scheduling priority (lower value observed for vllm/ollama in profiles).
    priority: int
    # Whether the slot may be shared by multiple consumers.
    shared: bool = False
    # Max concurrent consumers when shared.
    max_concurrent: int = 1
    # If True the service is never stopped by the scheduler.
    always_on: bool = False
    # Seconds of idleness before auto-stop; 0 = never stop (default).
    idle_stop_after_s: int = 0
    backend: str | None = None
    consumers: list[str] = Field(default_factory=list)
    # Launch spec, parsed from the raw "managed" mapping by _parse_managed.
    managed: DockerSpec | ProcessSpec | None = None

    model_config = {"frozen": True}

    @model_validator(mode="before")
    @classmethod
    def _parse_managed(cls, values: Any) -> Any:
        """Convert a raw ``managed`` mapping into a DockerSpec/ProcessSpec.

        The ``"type"`` key selects the spec class ("docker" or "process");
        any other value raises ``ValueError``. Non-dict inputs and absent or
        non-dict ``managed`` values pass through untouched for pydantic's
        normal validation to handle.

        Raises:
            ValueError: if ``managed.type`` is missing or unrecognized.
        """
        if not isinstance(values, dict):
            return values
        raw = values.get("managed")
        if not isinstance(raw, dict):
            # None or already-constructed spec: nothing to parse.
            return values
        # Work on copies: the original popped "type" from the caller's
        # nested dict, mutating the loaded YAML config so that re-validating
        # the same data failed with "Unknown managed service type: None".
        spec = dict(raw)
        spec_type = spec.pop("type", None)
        if spec_type == "docker":
            managed: DockerSpec | ProcessSpec = DockerSpec(**spec)
        elif spec_type == "process":
            managed = ProcessSpec(**spec)
        else:
            raise ValueError(f"Unknown managed service type: {spec_type!r}")
        values = dict(values)
        values["managed"] = managed
        return values
|
|
||||||
|
|
||||||
class GpuNodeEntry(BaseModel):
|
class GpuNodeEntry(BaseModel):
|
||||||
id: int
|
id: int
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,13 @@
|
||||||
import pytest
|
import pytest
|
||||||
from unittest.mock import MagicMock
|
from unittest.mock import MagicMock
|
||||||
|
from pathlib import Path
|
||||||
from fastapi.testclient import TestClient
|
from fastapi.testclient import TestClient
|
||||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
||||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
||||||
|
from circuitforge_core.resources.profiles.schema import load_profile
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
|
@ -132,3 +134,13 @@ def test_resident_keys_returns_set_of_node_service():
|
||||||
lm.set_residents_for_node("heimdall", [("vllm", "Ouro-1.4B"), ("ollama", None)])
|
lm.set_residents_for_node("heimdall", [("vllm", "Ouro-1.4B"), ("ollama", None)])
|
||||||
keys = lm.resident_keys()
|
keys = lm.resident_keys()
|
||||||
assert keys == {"heimdall:vllm", "heimdall:ollama"}
|
assert keys == {"heimdall:vllm", "heimdall:ollama"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_single_gpu_8gb_profile_has_idle_stop_after_s():
    """The vllm slot in the 8 GB single-GPU profile carries the 600s idle timeout."""
    yaml_path = Path(
        "circuitforge_core/resources/profiles/public/single-gpu-8gb.yaml"
    )
    profile = load_profile(yaml_path)
    svc = profile.services.get("vllm")
    assert svc is not None
    assert hasattr(svc, "idle_stop_after_s")
    assert svc.idle_stop_after_s == 600
|
|
|
||||||
Loading…
Reference in a new issue