feat: wire cf-orch allocate flow for LLM routing
- Fix cf_text base_url (was port 8006/cf-musicgen, corrected to 8008/cf-text)
- Add cf_orch blocks to cf_text, ollama, ollama_research, vllm_research backends
- Fix ollama_research base_url to host.docker.internal:11435 (was Docker service name)
- Promote cf_text to top of research_fallback_order
- Add cf_text backend to llm.cloud.yaml with cf_orch block
- Wire _RL_WIZARD rate limit to wizard_ai_interview endpoint (closes TODO from #122)

Closes: #122
This commit is contained in:
parent
b3435a8bd8
commit
80041d1dd9
3 changed files with 42 additions and 9 deletions
|
|
@ -1,4 +1,14 @@
|
||||||
backends:
|
backends:
|
||||||
|
cf_text:
|
||||||
|
api_key: any
|
||||||
|
base_url: http://host.docker.internal:8008/v1
|
||||||
|
enabled: true
|
||||||
|
model: cf-text
|
||||||
|
supports_images: false
|
||||||
|
type: openai_compat
|
||||||
|
cf_orch:
|
||||||
|
service: cf-text
|
||||||
|
ttl_s: 300
|
||||||
anthropic:
|
anthropic:
|
||||||
api_key_env: ANTHROPIC_API_KEY
|
api_key_env: ANTHROPIC_API_KEY
|
||||||
enabled: false
|
enabled: false
|
||||||
|
|
@ -26,6 +36,9 @@ backends:
|
||||||
model: llama3.1:8b # generic — no personal fine-tunes in cloud
|
model: llama3.1:8b # generic — no personal fine-tunes in cloud
|
||||||
supports_images: false
|
supports_images: false
|
||||||
type: openai_compat
|
type: openai_compat
|
||||||
|
cf_orch:
|
||||||
|
service: ollama
|
||||||
|
ttl_s: 300
|
||||||
ollama_research:
|
ollama_research:
|
||||||
api_key: ollama
|
api_key: ollama
|
||||||
base_url: http://host.docker.internal:11434/v1
|
base_url: http://host.docker.internal:11434/v1
|
||||||
|
|
@ -33,6 +46,9 @@ backends:
|
||||||
model: llama3.1:8b
|
model: llama3.1:8b
|
||||||
supports_images: false
|
supports_images: false
|
||||||
type: openai_compat
|
type: openai_compat
|
||||||
|
cf_orch:
|
||||||
|
service: ollama
|
||||||
|
ttl_s: 300
|
||||||
vision_service:
|
vision_service:
|
||||||
base_url: http://host.docker.internal:8002
|
base_url: http://host.docker.internal:8002
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
@ -63,9 +79,11 @@ backends:
|
||||||
- Qwen2.5-3B-Instruct
|
- Qwen2.5-3B-Instruct
|
||||||
ttl_s: 300
|
ttl_s: 300
|
||||||
fallback_order:
|
fallback_order:
|
||||||
|
- cf_text
|
||||||
- vllm
|
- vllm
|
||||||
- ollama
|
- ollama
|
||||||
research_fallback_order:
|
research_fallback_order:
|
||||||
|
- cf_text
|
||||||
- vllm_research
|
- vllm_research
|
||||||
- ollama_research
|
- ollama_research
|
||||||
vision_fallback_order:
|
vision_fallback_order:
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,14 @@
|
||||||
backends:
|
backends:
|
||||||
cf_text:
|
cf_text:
|
||||||
api_key: any
|
api_key: any
|
||||||
base_url: http://host.docker.internal:8006/v1
|
base_url: http://host.docker.internal:8008/v1
|
||||||
enabled: true
|
enabled: true
|
||||||
model: cf-text
|
model: cf-text
|
||||||
supports_images: false
|
supports_images: false
|
||||||
type: openai_compat
|
type: openai_compat
|
||||||
|
cf_orch:
|
||||||
|
service: cf-text
|
||||||
|
ttl_s: 300
|
||||||
anthropic:
|
anthropic:
|
||||||
api_key_env: ANTHROPIC_API_KEY
|
api_key_env: ANTHROPIC_API_KEY
|
||||||
enabled: false
|
enabled: false
|
||||||
|
|
@ -33,13 +36,19 @@ backends:
|
||||||
model: llama3.2:3b
|
model: llama3.2:3b
|
||||||
supports_images: false
|
supports_images: false
|
||||||
type: openai_compat
|
type: openai_compat
|
||||||
|
cf_orch:
|
||||||
|
service: ollama
|
||||||
|
ttl_s: 300
|
||||||
ollama_research:
|
ollama_research:
|
||||||
api_key: ollama
|
api_key: ollama
|
||||||
base_url: http://ollama_research:11434/v1
|
base_url: http://host.docker.internal:11435/v1
|
||||||
enabled: true
|
enabled: true
|
||||||
model: llama3.1:8b
|
model: llama3.1:8b
|
||||||
supports_images: false
|
supports_images: false
|
||||||
type: openai_compat
|
type: openai_compat
|
||||||
|
cf_orch:
|
||||||
|
service: ollama
|
||||||
|
ttl_s: 300
|
||||||
vision_service:
|
vision_service:
|
||||||
base_url: http://vision:8002
|
base_url: http://vision:8002
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|
@ -64,6 +73,11 @@ backends:
|
||||||
model: __auto__
|
model: __auto__
|
||||||
supports_images: false
|
supports_images: false
|
||||||
type: openai_compat
|
type: openai_compat
|
||||||
|
cf_orch:
|
||||||
|
service: vllm
|
||||||
|
model_candidates:
|
||||||
|
- Qwen2.5-3B-Instruct
|
||||||
|
ttl_s: 300
|
||||||
fallback_order:
|
fallback_order:
|
||||||
- cf_text
|
- cf_text
|
||||||
- ollama
|
- ollama
|
||||||
|
|
@ -72,10 +86,10 @@ fallback_order:
|
||||||
- github_copilot
|
- github_copilot
|
||||||
- anthropic
|
- anthropic
|
||||||
research_fallback_order:
|
research_fallback_order:
|
||||||
- claude_code
|
- cf_text
|
||||||
- vllm_research
|
- vllm_research
|
||||||
- ollama_research
|
- ollama_research
|
||||||
- cf_text
|
- claude_code
|
||||||
- github_copilot
|
- github_copilot
|
||||||
- anthropic
|
- anthropic
|
||||||
vision_fallback_order:
|
vision_fallback_order:
|
||||||
|
|
|
||||||
11
dev-api.py
11
dev-api.py
|
|
@ -80,7 +80,7 @@ _RL_COVER_LETTER = os.environ.get("LLM_RATE_COVER_LETTER", "20/hour")
|
||||||
_RL_RESEARCH = os.environ.get("LLM_RATE_RESEARCH", "10/hour")
|
_RL_RESEARCH = os.environ.get("LLM_RATE_RESEARCH", "10/hour")
|
||||||
_RL_QA_SUGGEST = os.environ.get("LLM_RATE_QA_SUGGEST", "60/hour")
|
_RL_QA_SUGGEST = os.environ.get("LLM_RATE_QA_SUGGEST", "60/hour")
|
||||||
_RL_SURVEY = os.environ.get("LLM_RATE_SURVEY", "30/hour")
|
_RL_SURVEY = os.environ.get("LLM_RATE_SURVEY", "30/hour")
|
||||||
_RL_WIZARD = os.environ.get("LLM_RATE_WIZARD", "60/hour") # TODO(#122): wire to wizard/ai/interview after feat/77 merges
|
_RL_WIZARD = os.environ.get("LLM_RATE_WIZARD", "60/hour")
|
||||||
|
|
||||||
# Resolve GPU inference server URL.
|
# Resolve GPU inference server URL.
|
||||||
# Priority: GPU_SERVER_URL → CF_ORCH_URL (backward compat) → cloud default when licensed.
|
# Priority: GPU_SERVER_URL → CF_ORCH_URL (backward compat) → cloud default when licensed.
|
||||||
|
|
@ -4654,7 +4654,8 @@ _WIZARD_ALLOWED_FIELDS: frozenset[str] = frozenset({
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/wizard/ai/interview")
|
@app.post("/api/wizard/ai/interview")
|
||||||
def wizard_ai_interview(request: WizardInterviewRequest):
|
@limiter.limit(_RL_WIZARD)
|
||||||
|
def wizard_ai_interview(request: Request, body: WizardInterviewRequest):
|
||||||
"""Conduct one turn of the AI-guided profile interview. Tier-gated (BYOK-unlockable)."""
|
"""Conduct one turn of the AI-guided profile interview. Tier-gated (BYOK-unlockable)."""
|
||||||
from app.wizard.tiers import can_use, has_configured_llm
|
from app.wizard.tiers import can_use, has_configured_llm
|
||||||
|
|
||||||
|
|
@ -4664,7 +4665,7 @@ def wizard_ai_interview(request: WizardInterviewRequest):
|
||||||
|
|
||||||
# Build conversation prompt from history
|
# Build conversation prompt from history
|
||||||
conversation_lines = []
|
conversation_lines = []
|
||||||
for msg in request.history:
|
for msg in body.history:
|
||||||
role = msg.role
|
role = msg.role
|
||||||
content = msg.content.replace("\n", " ").replace("\r", "")
|
content = msg.content.replace("\n", " ").replace("\r", "")
|
||||||
if role == "user":
|
if role == "user":
|
||||||
|
|
@ -4675,10 +4676,10 @@ def wizard_ai_interview(request: WizardInterviewRequest):
|
||||||
history_block = "\n".join(conversation_lines) if conversation_lines else "User: (starting conversation)"
|
history_block = "\n".join(conversation_lines) if conversation_lines else "User: (starting conversation)"
|
||||||
|
|
||||||
# Build profile summary to give LLM context about what's already known
|
# Build profile summary to give LLM context about what's already known
|
||||||
if request.profile_so_far:
|
if body.profile_so_far:
|
||||||
gathered = ", ".join(
|
gathered = ", ".join(
|
||||||
f"{k}={repr(v)}"
|
f"{k}={repr(v)}"
|
||||||
for k, v in request.profile_so_far.items()
|
for k, v in body.profile_so_far.items()
|
||||||
if v not in (None, "", [], {})
|
if v not in (None, "", [], {})
|
||||||
)
|
)
|
||||||
profile_context = f"\n\n[Already gathered: {gathered}]" if gathered else ""
|
profile_context = f"\n\n[Already gathered: {gathered}]" if gathered else ""
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue