feat(llm): pass CF_APP_NAME as pipeline tag in cf-orch allocations
Coordinator analytics now show which product (kiwi, peregrine, etc.) is responsible for each vllm allocation, enabling per-app GPU usage breakdown.
This commit is contained in:
parent
01ed48808b
commit
27f0b67908
1 changed file with 21 additions and 2 deletions
|
|
@ -128,7 +128,16 @@ class LLMRouter:
|
||||||
service = orch_cfg.get("service", "vllm")
|
service = orch_cfg.get("service", "vllm")
|
||||||
candidates = orch_cfg.get("model_candidates", [])
|
candidates = orch_cfg.get("model_candidates", [])
|
||||||
ttl_s = float(orch_cfg.get("ttl_s", 3600.0))
|
ttl_s = float(orch_cfg.get("ttl_s", 3600.0))
|
||||||
ctx = client.allocate(service, model_candidates=candidates, ttl_s=ttl_s, caller="llm-router")
|
# CF_APP_NAME identifies the calling product (kiwi, peregrine, etc.)
|
||||||
|
# in coordinator analytics — set in each product's .env.
|
||||||
|
pipeline = os.environ.get("CF_APP_NAME") or None
|
||||||
|
ctx = client.allocate(
|
||||||
|
service,
|
||||||
|
model_candidates=candidates,
|
||||||
|
ttl_s=ttl_s,
|
||||||
|
caller="llm-router",
|
||||||
|
pipeline=pipeline,
|
||||||
|
)
|
||||||
alloc = ctx.__enter__()
|
alloc = ctx.__enter__()
|
||||||
return (ctx, alloc)
|
return (ctx, alloc)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
|
@ -179,7 +188,14 @@ class LLMRouter:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if is_vision_service:
|
if is_vision_service:
|
||||||
if not self._is_reachable(backend["base_url"]):
|
# cf_orch: try allocation first (same pattern as openai_compat).
|
||||||
|
# Allocation can start the vision service on-demand on the cluster.
|
||||||
|
orch_ctx = orch_alloc = None
|
||||||
|
orch_result = self._try_cf_orch_alloc(backend)
|
||||||
|
if orch_result is not None:
|
||||||
|
orch_ctx, orch_alloc = orch_result
|
||||||
|
backend = {**backend, "base_url": orch_alloc.url}
|
||||||
|
elif not self._is_reachable(backend["base_url"]):
|
||||||
print(f"[LLMRouter] {name}: unreachable, skipping")
|
print(f"[LLMRouter] {name}: unreachable, skipping")
|
||||||
continue
|
continue
|
||||||
try:
|
try:
|
||||||
|
|
@ -197,6 +213,9 @@ class LLMRouter:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"[LLMRouter] {name}: error — {e}, trying next")
|
print(f"[LLMRouter] {name}: error — {e}, trying next")
|
||||||
continue
|
continue
|
||||||
|
finally:
|
||||||
|
if orch_ctx is not None:
|
||||||
|
orch_ctx.__exit__(None, None, None)
|
||||||
|
|
||||||
elif backend["type"] == "openai_compat":
|
elif backend["type"] == "openai_compat":
|
||||||
# cf_orch: try allocation first — this may start the service on-demand.
|
# cf_orch: try allocation first — this may start the service on-demand.
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue