feat(llm): pass CF_APP_NAME as pipeline tag in cf-orch allocations

Coordinator analytics now show which product (kiwi, peregrine, etc.) is responsible for each vllm allocation, enabling per-app GPU usage breakdown.
2026-04-20 07:01:55 -07:00 · 2026-04-20 07:01:55 -07:00 · 27f0b67908
commit 27f0b67908
parent 01ed48808b
1 changed files with 21 additions and 2 deletions
--- a/circuitforge_core/llm/router.py
+++ b/circuitforge_core/llm/router.py
@ -128,7 +128,16 @@ class LLMRouter:
            service = orch_cfg.get("service", "vllm")
            candidates = orch_cfg.get("model_candidates", [])
            ttl_s = float(orch_cfg.get("ttl_s", 3600.0))
-            ctx = client.allocate(service, model_candidates=candidates, ttl_s=ttl_s, caller="llm-router")
+            # CF_APP_NAME identifies the calling product (kiwi, peregrine, etc.)
+            # in coordinator analytics — set in each product's .env.
+            pipeline = os.environ.get("CF_APP_NAME") or None
+            ctx = client.allocate(
+                service,
+                model_candidates=candidates,
+                ttl_s=ttl_s,
+                caller="llm-router",
+                pipeline=pipeline,
+            )
            alloc = ctx.__enter__()
            return (ctx, alloc)
        except Exception as exc:
@ -179,7 +188,14 @@ class LLMRouter:
                continue

            if is_vision_service:
-                if not self._is_reachable(backend["base_url"]):
+                # cf_orch: try allocation first (same pattern as openai_compat).
+                # Allocation can start the vision service on-demand on the cluster.
+                orch_ctx = orch_alloc = None
+                orch_result = self._try_cf_orch_alloc(backend)
+                if orch_result is not None:
+                    orch_ctx, orch_alloc = orch_result
+                    backend = {**backend, "base_url": orch_alloc.url}
+                elif not self._is_reachable(backend["base_url"]):
                    print(f"[LLMRouter] {name}: unreachable, skipping")
                    continue
                try:
@ -197,6 +213,9 @@ class LLMRouter:
                except Exception as e:
                    print(f"[LLMRouter] {name}: error — {e}, trying next")
                    continue
+                finally:
+                    if orch_ctx is not None:
+                        orch_ctx.__exit__(None, None, None)

            elif backend["type"] == "openai_compat":
                # cf_orch: try allocation first — this may start the service on-demand.