chore(llm): switch vllm model_candidates from Ouro to Phi-4-mini + Qwen2.5-3B

Ouro models incompatible with transformers 5.x bundled in cf env.
Phi-4-mini-instruct tried first (stronger benchmarks, 7.2GB);
Qwen2.5-3B-Instruct as VRAM-constrained fallback (5.8GB).
This commit is contained in:
pyr0ball 2026-04-02 15:34:59 -07:00
parent 7c9dcd2620
commit 11fb3a07b4

View file

@@ -28,7 +28,7 @@ backends:
     type: openai_compat
   ollama_research:
     api_key: ollama
-    base_url: http://host.docker.internal:11434/v1
+    base_url: http://ollama_research:11434/v1
     enabled: true
     model: llama3.1:8b
     supports_images: false
@@ -40,7 +40,7 @@ backends:
     type: vision_service
   vllm:
     api_key: ''
-    base_url: http://host.docker.internal:8000/v1
+    base_url: http://vllm:8000/v1
     enabled: true
     model: __auto__
     supports_images: false
@@ -48,12 +48,12 @@ backends:
   cf_orch:
     service: vllm
     model_candidates:
-      - Ouro-2.6B-Thinking
-      - Ouro-1.4B
+      - Phi-4-mini-instruct
+      - Qwen2.5-3B-Instruct
     ttl_s: 300
   vllm_research:
     api_key: ''
-    base_url: http://host.docker.internal:8000/v1
+    base_url: http://vllm:8000/v1
     enabled: true
     model: __auto__
     supports_images: false