fix(resources): address code review findings from final review
- eviction_engine: replace deprecated asyncio.get_event_loop() with get_running_loop() (Python 3.12 compatibility)
- eviction_engine: remove unused httpx import
- coordinator app: return 422 for unknown node_id instead of silently falling back to hardcoded localhost URL
- eviction_executor: guard against pid <= 0 to prevent accidental SIGTERM to process group
- pyproject.toml: move pytest-asyncio to [dev] extras, not [orch]
- profile_registry: document CPU profile exclusion from list_public()
This commit is contained in:
parent
d755e9ea2c
commit
db4e3047fd
5 changed files with 23 additions and 5 deletions
|
|
@@ -31,6 +31,12 @@ class EvictionExecutor:
|
|||
) -> EvictionResult:
|
||||
grace = grace_period_s if grace_period_s is not None else self._default_grace
|
||||
|
||||
if pid <= 0:
|
||||
return EvictionResult(
|
||||
success=False, method="error",
|
||||
message=f"Refusing to signal invalid PID {pid}"
|
||||
)
|
||||
|
||||
if not psutil.pid_exists(pid):
|
||||
return EvictionResult(
|
||||
success=False, method="not_found",
|
||||
|
|
|
|||
|
|
@@ -86,7 +86,12 @@ def create_coordinator_app(
|
|||
@app.post("/api/leases")
|
||||
async def request_lease(req: LeaseRequest) -> dict[str, Any]:
|
||||
node_info = agent_supervisor.get_node_info(req.node_id)
|
||||
agent_url = node_info.agent_url if node_info else "http://localhost:7701"
|
||||
if node_info is None:
|
||||
raise HTTPException(
|
||||
status_code=422,
|
||||
detail=f"Unknown node_id {req.node_id!r} — node not registered",
|
||||
)
|
||||
agent_url = node_info.agent_url
|
||||
|
||||
lease = await eviction_engine.request_lease(
|
||||
node_id=req.node_id,
|
||||
|
|
|
|||
|
|
@@ -3,8 +3,6 @@ from __future__ import annotations
|
|||
import asyncio
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.models import VRAMLease
|
||||
|
||||
|
|
@@ -61,8 +59,9 @@ class EvictionEngine:
|
|||
await self._evict_lease(candidate, agent_url)
|
||||
|
||||
# Wait for evictions to free up VRAM (poll with timeout)
|
||||
deadline = asyncio.get_event_loop().time() + self._timeout
|
||||
while asyncio.get_event_loop().time() < deadline:
|
||||
loop = asyncio.get_running_loop()
|
||||
deadline = loop.time() + self._timeout
|
||||
while loop.time() < deadline:
|
||||
lease = await self.lease_manager.try_grant(
|
||||
node_id, gpu_id, mb, service, priority, ttl_s
|
||||
)
|
||||
|
|
|
|||
|
|
@@ -44,6 +44,9 @@ class ProfileRegistry:
|
|||
return profile
|
||||
|
||||
def list_public(self) -> list[GpuProfile]:
|
||||
# CPU profiles (cpu-*) are intentionally excluded — this endpoint
|
||||
# is used to match GPU hardware. CPU inference nodes self-select
|
||||
# their profile via the CLI and are not listed for lease matching.
|
||||
return [
|
||||
p for p in self._profiles.values()
|
||||
if p.name.startswith("single-gpu-")
|
||||
|
|
|
|||
|
|
@@ -21,7 +21,12 @@ orch = [
|
|||
"pydantic>=2.0",
|
||||
"typer[all]>=0.12",
|
||||
"psutil>=5.9",
|
||||
]
|
||||
dev = [
|
||||
"circuitforge-core[orch]",
|
||||
"pytest>=8.0",
|
||||
"pytest-asyncio>=0.23",
|
||||
"httpx>=0.27",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
|
|
|
|||
Loading…
Reference in a new issue