Compare commits

..

3 commits

Author SHA1 Message Date
bc80922d61 chore(llm): swap model_candidates order — Qwen2.5-3B first, Phi-4-mini fallback
Phi-4-mini's cached modeling_phi3.py imports SlidingWindowCache, which
was removed in transformers 5.x. Qwen2.5-3B uses the built-in qwen2 arch
and works cleanly. Reorder so Qwen is tried first.
2026-04-02 16:36:38 -07:00
11fb3a07b4 chore(llm): switch vllm model_candidates from Ouro to Phi-4-mini + Qwen2.5-3B
Ouro models are incompatible with the transformers 5.x bundled in the cf env.
Phi-4-mini-instruct tried first (stronger benchmarks, 7.2GB);
Qwen2.5-3B-Instruct as VRAM-constrained fallback (5.8GB).
2026-04-02 15:34:59 -07:00
7c9dcd2620 config(llm): add cf_orch block to vllm backend 2026-04-02 12:20:41 -07:00

View file

@@ -28,9 +28,9 @@ backends:
type: openai_compat type: openai_compat
ollama_research: ollama_research:
api_key: ollama api_key: ollama
base_url: http://host.docker.internal:11434/v1 base_url: http://ollama_research:11434/v1
enabled: true enabled: true
model: llama3.2:3b model: llama3.1:8b
supports_images: false supports_images: false
type: openai_compat type: openai_compat
vision_service: vision_service:
@@ -40,14 +40,20 @@ backends:
type: vision_service type: vision_service
vllm: vllm:
api_key: '' api_key: ''
base_url: http://host.docker.internal:8000/v1 base_url: http://vllm:8000/v1
enabled: true enabled: true
model: __auto__ model: __auto__
supports_images: false supports_images: false
type: openai_compat type: openai_compat
cf_orch:
service: vllm
model_candidates:
- Qwen2.5-3B-Instruct
- Phi-4-mini-instruct
ttl_s: 300
vllm_research: vllm_research:
api_key: '' api_key: ''
base_url: http://host.docker.internal:8000/v1 base_url: http://vllm:8000/v1
enabled: true enabled: true
model: __auto__ model: __auto__
supports_images: false supports_images: false