chore(cf-orch): route recipe LLM calls through vllm with model candidates + CF_APP_NAME
Switches the recipe generation service type from 'cf-text' to 'vllm' so the coordinator can route to quantized small models (Qwen2.5-3B, Phi-4-mini) rather than the full text backend. Passes CF_APP_NAME for per-product VRAM/request analytics in the coordinator dashboard.
- llm_recipe.py: sets _SERVICE_TYPE = 'vllm'; adds the _MODEL_CANDIDATES list; passes model_candidates= and pipeline= to CFOrchClient.allocate()
- compose.cloud.yml: adds the CF_APP_NAME=kiwi env var for coordinator attribution
This commit is contained in:
parent
b223325d77
commit
7a7eae4666
2 changed files with 6 additions and 1 deletion
|
|
@ -149,7 +149,8 @@ class LLMRecipeGenerator:
|
|||
|
||||
return "\n".join(lines)
|
||||
|
||||
_SERVICE_TYPE = "cf-text"
|
||||
_SERVICE_TYPE = "vllm"
|
||||
_MODEL_CANDIDATES = ["Qwen2.5-3B-Instruct", "Phi-4-mini-instruct"]
|
||||
_TTL_S = 300.0
|
||||
_CALLER = "kiwi-recipe"
|
||||
|
||||
|
|
@ -167,8 +168,10 @@ class LLMRecipeGenerator:
|
|||
client = CFOrchClient(cf_orch_url)
|
||||
return client.allocate(
|
||||
service=self._SERVICE_TYPE,
|
||||
model_candidates=self._MODEL_CANDIDATES,
|
||||
ttl_s=self._TTL_S,
|
||||
caller=self._CALLER,
|
||||
pipeline=os.environ.get("CF_APP_NAME") or None,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug("CFOrchClient init failed, falling back to direct URL: %s", exc)
|
||||
|
|
|
|||
|
|
@ -21,6 +21,8 @@ services:
|
|||
CLOUD_AUTH_BYPASS_IPS: ${CLOUD_AUTH_BYPASS_IPS:-}
|
||||
# cf-orch: route LLM calls through the coordinator for managed GPU inference
|
||||
CF_ORCH_URL: http://host.docker.internal:7700
|
||||
# Product identifier for coordinator analytics — per-product VRAM/request breakdown
|
||||
CF_APP_NAME: kiwi
|
||||
# Community PostgreSQL — shared across CF products; unset = community features unavailable (fail soft)
|
||||
COMMUNITY_DB_URL: ${COMMUNITY_DB_URL:-}
|
||||
COMMUNITY_PSEUDONYM_SALT: ${COMMUNITY_PSEUDONYM_SALT:-}
|
||||
|
|
|
|||
Loading…
Reference in a new issue