diff --git a/config/llm.cloud.yaml b/config/llm.cloud.yaml index a0173f6..64aa127 100644 --- a/config/llm.cloud.yaml +++ b/config/llm.cloud.yaml @@ -1,4 +1,14 @@ backends: + cf_text: + api_key: any + base_url: http://host.docker.internal:8008/v1 + enabled: true + model: cf-text + supports_images: false + type: openai_compat + cf_orch: + service: cf-text + ttl_s: 300 anthropic: api_key_env: ANTHROPIC_API_KEY enabled: false @@ -26,6 +36,9 @@ backends: model: llama3.1:8b # generic — no personal fine-tunes in cloud supports_images: false type: openai_compat + cf_orch: + service: ollama + ttl_s: 300 ollama_research: api_key: ollama base_url: http://host.docker.internal:11434/v1 @@ -33,6 +46,9 @@ backends: model: llama3.1:8b supports_images: false type: openai_compat + cf_orch: + service: ollama + ttl_s: 300 vision_service: base_url: http://host.docker.internal:8002 enabled: true @@ -63,9 +79,11 @@ backends: - Qwen2.5-3B-Instruct ttl_s: 300 fallback_order: +- cf_text - vllm - ollama research_fallback_order: +- cf_text - vllm_research - ollama_research vision_fallback_order: diff --git a/config/llm.yaml b/config/llm.yaml index 515bf24..95f4383 100644 --- a/config/llm.yaml +++ b/config/llm.yaml @@ -1,11 +1,14 @@ backends: cf_text: api_key: any - base_url: http://host.docker.internal:8006/v1 + base_url: http://host.docker.internal:8008/v1 enabled: true model: cf-text supports_images: false type: openai_compat + cf_orch: + service: cf-text + ttl_s: 300 anthropic: api_key_env: ANTHROPIC_API_KEY enabled: false @@ -33,13 +36,19 @@ backends: model: llama3.2:3b supports_images: false type: openai_compat + cf_orch: + service: ollama + ttl_s: 300 ollama_research: api_key: ollama - base_url: http://ollama_research:11434/v1 + base_url: http://host.docker.internal:11435/v1 enabled: true model: llama3.1:8b supports_images: false type: openai_compat + cf_orch: + service: ollama + ttl_s: 300 vision_service: base_url: http://vision:8002 enabled: true @@ -64,6 +73,11 @@ backends: model: __auto__ supports_images: false type: openai_compat + cf_orch: + service: vllm + model_candidates: + - Qwen2.5-3B-Instruct + ttl_s: 300 fallback_order: - cf_text - ollama @@ -72,10 +86,10 @@ fallback_order: - github_copilot - anthropic research_fallback_order: -- claude_code +- cf_text - vllm_research - ollama_research -- cf_text +- claude_code - github_copilot - anthropic vision_fallback_order: diff --git a/dev-api.py b/dev-api.py index 615d575..52d1003 100644 --- a/dev-api.py +++ b/dev-api.py @@ -80,7 +80,7 @@ _RL_COVER_LETTER = os.environ.get("LLM_RATE_COVER_LETTER", "20/hour") _RL_RESEARCH = os.environ.get("LLM_RATE_RESEARCH", "10/hour") _RL_QA_SUGGEST = os.environ.get("LLM_RATE_QA_SUGGEST", "60/hour") _RL_SURVEY = os.environ.get("LLM_RATE_SURVEY", "30/hour") -_RL_WIZARD = os.environ.get("LLM_RATE_WIZARD", "60/hour") # TODO(#122): wire to wizard/ai/interview after feat/77 merges +_RL_WIZARD = os.environ.get("LLM_RATE_WIZARD", "60/hour") # Resolve GPU inference server URL. # Priority: GPU_SERVER_URL → CF_ORCH_URL (backward compat) → cloud default when licensed. @@ -4654,7 +4654,8 @@ _WIZARD_ALLOWED_FIELDS: frozenset[str] = frozenset({ @app.post("/api/wizard/ai/interview") -def wizard_ai_interview(request: WizardInterviewRequest): +@limiter.limit(_RL_WIZARD) +def wizard_ai_interview(request: Request, body: WizardInterviewRequest): """Conduct one turn of the AI-guided profile interview. Tier-gated (BYOK-unlockable).""" from app.wizard.tiers import can_use, has_configured_llm @@ -4664,7 +4665,7 @@ def wizard_ai_interview(request: WizardInterviewRequest): # Build conversation prompt from history conversation_lines = [] - for msg in request.history: + for msg in body.history: role = msg.role content = msg.content.replace("\n", " ").replace("\r", "") if role == "user": @@ -4675,10 +4676,10 @@ def wizard_ai_interview(request: WizardInterviewRequest): history_block = "\n".join(conversation_lines) if conversation_lines else "User: (starting conversation)" # Build profile summary to give LLM context about what's already known - if request.profile_so_far: + if body.profile_so_far: gathered = ", ".join( f"{k}={repr(v)}" - for k, v in request.profile_so_far.items() + for k, v in body.profile_so_far.items() if v not in (None, "", [], {}) ) profile_context = f"\n\n[Already gathered: {gathered}]" if gathered else ""