chore(cf-orch): route recipe LLM calls through vllm with model candidates + CF_APP_NAME
Switches the recipe-generation service type from 'cf-text' to 'vllm' so the coordinator can route requests to quantized small models (Qwen2.5-3B, Phi-4-mini) rather than the full text backend, and passes CF_APP_NAME for per-product VRAM/request analytics in the coordinator dashboard. - llm_recipe.py: set _SERVICE_TYPE = 'vllm'; add the _MODEL_CANDIDATES list; pass model_candidates and pipeline= to CFOrchClient.allocate(). - compose.cloud.yml: add the CF_APP_NAME=kiwi env var for coordinator attribution.
This commit is contained in:
parent
b223325d77
commit
7a7eae4666
2 changed files with 6 additions and 1 deletions
|
|
@ -149,7 +149,8 @@ class LLMRecipeGenerator:
|
||||||
|
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
_SERVICE_TYPE = "cf-text"
|
_SERVICE_TYPE = "vllm"
|
||||||
|
_MODEL_CANDIDATES = ["Qwen2.5-3B-Instruct", "Phi-4-mini-instruct"]
|
||||||
_TTL_S = 300.0
|
_TTL_S = 300.0
|
||||||
_CALLER = "kiwi-recipe"
|
_CALLER = "kiwi-recipe"
|
||||||
|
|
||||||
|
|
@ -167,8 +168,10 @@ class LLMRecipeGenerator:
|
||||||
client = CFOrchClient(cf_orch_url)
|
client = CFOrchClient(cf_orch_url)
|
||||||
return client.allocate(
|
return client.allocate(
|
||||||
service=self._SERVICE_TYPE,
|
service=self._SERVICE_TYPE,
|
||||||
|
model_candidates=self._MODEL_CANDIDATES,
|
||||||
ttl_s=self._TTL_S,
|
ttl_s=self._TTL_S,
|
||||||
caller=self._CALLER,
|
caller=self._CALLER,
|
||||||
|
pipeline=os.environ.get("CF_APP_NAME") or None,
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug("CFOrchClient init failed, falling back to direct URL: %s", exc)
|
logger.debug("CFOrchClient init failed, falling back to direct URL: %s", exc)
|
||||||
|
|
|
||||||
|
|
@ -21,6 +21,8 @@ services:
|
||||||
CLOUD_AUTH_BYPASS_IPS: ${CLOUD_AUTH_BYPASS_IPS:-}
|
CLOUD_AUTH_BYPASS_IPS: ${CLOUD_AUTH_BYPASS_IPS:-}
|
||||||
# cf-orch: route LLM calls through the coordinator for managed GPU inference
|
# cf-orch: route LLM calls through the coordinator for managed GPU inference
|
||||||
CF_ORCH_URL: http://host.docker.internal:7700
|
CF_ORCH_URL: http://host.docker.internal:7700
|
||||||
|
# Product identifier for coordinator analytics — per-product VRAM/request breakdown
|
||||||
|
CF_APP_NAME: kiwi
|
||||||
# Community PostgreSQL — shared across CF products; unset = community features unavailable (fail soft)
|
# Community PostgreSQL — shared across CF products; unset = community features unavailable (fail soft)
|
||||||
COMMUNITY_DB_URL: ${COMMUNITY_DB_URL:-}
|
COMMUNITY_DB_URL: ${COMMUNITY_DB_URL:-}
|
||||||
COMMUNITY_PSEUDONYM_SALT: ${COMMUNITY_PSEUDONYM_SALT:-}
|
COMMUNITY_PSEUDONYM_SALT: ${COMMUNITY_PSEUDONYM_SALT:-}
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue