chore(llm): switch vllm model_candidates from Ouro to Phi-4-mini + Qwen2.5-3B

Ouro models incompatible with transformers 5.x bundled in cf env.
Phi-4-mini-instruct tried first (stronger benchmarks, 7.2GB);
Qwen2.5-3B-Instruct as VRAM-constrained fallback (5.8GB).
This commit is contained in:
pyr0ball 2026-04-02 15:34:59 -07:00
parent 7c9dcd2620
commit 11fb3a07b4

View file

@@ -28,7 +28,7 @@ backends:
     type: openai_compat
   ollama_research:
     api_key: ollama
-    base_url: http://host.docker.internal:11434/v1
+    base_url: http://ollama_research:11434/v1
     enabled: true
     model: llama3.1:8b
     supports_images: false
@@ -40,7 +40,7 @@ backends:
     type: vision_service
   vllm:
     api_key: ''
-    base_url: http://host.docker.internal:8000/v1
+    base_url: http://vllm:8000/v1
     enabled: true
     model: __auto__
     supports_images: false
@@ -48,12 +48,12 @@ backends:
   cf_orch:
     service: vllm
     model_candidates:
-      - Ouro-2.6B-Thinking
-      - Ouro-1.4B
+      - Phi-4-mini-instruct
+      - Qwen2.5-3B-Instruct
     ttl_s: 300
   vllm_research:
     api_key: ''
-    base_url: http://host.docker.internal:8000/v1
+    base_url: http://vllm:8000/v1
     enabled: true
     model: __auto__
     supports_images: false