peregrine/config/llm.yaml.example
pyr0ball acc04b04eb docs(config): add cf_text and cf_voice trunk service backends to llm.yaml.example
Documents the cf-orch allocation pattern for cf-text and cf-voice as
openai_compat backends with a cf_orch block. Products enable these when
CF_ORCH_URL is set; the router allocates via the broker and calls the
managed service directly. No catalog or leaf details here — those live
in cf-orch node profiles (The Orchard trunk/leaf split).
2026-04-20 10:56:22 -07:00

backends:
  anthropic:
    api_key_env: ANTHROPIC_API_KEY
    enabled: false
    model: claude-sonnet-4-6
    type: anthropic
    supports_images: true
  claude_code:
    api_key: any
    base_url: http://localhost:3009/v1
    enabled: false
    model: claude-code-terminal
    type: openai_compat
    supports_images: true
  github_copilot:
    api_key: any
    base_url: http://localhost:3010/v1
    enabled: false
    model: gpt-4o
    type: openai_compat
    supports_images: false
  ollama:
    api_key: ollama
    base_url: http://ollama:11434/v1 # Docker service name; use localhost:11434 outside Docker
    enabled: true
    model: llama3.2:3b
    type: openai_compat
    supports_images: false
  ollama_research:
    api_key: ollama
    base_url: http://ollama:11434/v1 # Docker service name; use localhost:11434 outside Docker
    enabled: true
    model: llama3.2:3b
    type: openai_compat
    supports_images: false
  vllm:
    api_key: ''
    base_url: http://vllm:8000/v1 # Docker service name; use localhost:8000 outside Docker
    enabled: true
    model: __auto__
    type: openai_compat
    supports_images: false
  vision_service:
    base_url: http://localhost:8002
    enabled: false
    type: vision_service
    supports_images: true
  # ── cf-orch trunk services ─────────────────────────────────────────────────
  # These backends allocate via cf-orch rather than connecting to a static URL.
  # cf-orch starts the service on demand and returns its URL; the router then
  # calls it directly using the openai_compat path.
  # Set CF_ORCH_URL (env) or base_url below; leave enabled: false if cf-orch
  # is not deployed in your environment.
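  # Rough shape of the allocation flow (a sketch only; the broker request
  # format is cf-orch's concern and is an assumption here, not this file's):
  #   1. Router sees an enabled backend with a cf_orch block and CF_ORCH_URL set.
  #   2. It asks the broker for the named service, passing the model_candidates
  #      and ttl_s from that block.
  #   3. The broker starts (or reuses) the service and returns its URL.
  #   4. The router swaps that URL in for base_url and speaks the normal
  #      openai_compat protocol to it. If cf-orch is unreachable, base_url
  #      is used as-is.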
  cf_text:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1 # fallback when cf-orch is not available
    model: __auto__
    api_key: any
    supports_images: false
    cf_orch:
      service: cf-text
      # model_candidates: leave empty to use the service's default_model,
      # or specify an alias from the node's catalog (e.g. "qwen2.5-3b").
      model_candidates: []
      ttl_s: 3600
  cf_voice:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8009/v1 # fallback when cf-orch is not available
    model: __auto__
    api_key: any
    supports_images: false
    cf_orch:
      service: cf-voice
      model_candidates: []
      ttl_s: 3600
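  # Illustrative enabled configuration (a sketch, not shipped defaults): the
  # catalog alias reuses the "qwen2.5-3b" example above; the shorter ttl_s is
  # a hypothetical choice, not a recommended value.
  #
  # cf_text:
  #   type: openai_compat
  #   enabled: true
  #   base_url: http://localhost:8008/v1  # still the fallback if the broker is down
  #   model: __auto__
  #   api_key: any
  #   supports_images: false
  #   cf_orch:
  #     service: cf-text
  #     model_candidates: ["qwen2.5-3b"]  # pin a catalog alias instead of default_model
  #     ttl_s: 900                        # hypothetical: shorter lease than the 3600 default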
fallback_order:
  - ollama
  - claude_code
  - vllm
  - github_copilot
  - anthropic
research_fallback_order:
  # Note: 'ollama' (alex-cover-writer) intentionally excluded — research
  # must never use the fine-tuned writer model, and this also avoids evicting
  # the writer from GPU memory while a cover letter task is in flight.
  - claude_code
  - vllm
  - ollama_research
  - github_copilot
  - anthropic
vision_fallback_order:
  - vision_service
  - claude_code
  - anthropic
# ── Scheduler — LLM batch queue optimizer ─────────────────────────────────────
# The scheduler batches LLM tasks by model type to avoid GPU model switching.
# VRAM budgets are conservative peak estimates (GB) for each task type.
# Increase them if your models are larger; decrease them if tasks share GPU
# memory well.
scheduler:
  vram_budgets:
    cover_letter: 2.5 # alex-cover-writer:latest (~2GB GGUF + headroom)
    company_research: 5.0 # llama3.1:8b or vllm model
    wizard_generate: 2.5 # same model family as cover_letter
  max_queue_depth: 500 # max pending tasks per type before new tasks are dropped (with a logged warning)
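  # Illustrative sketch (hypothetical task type and figure): budgeting a new
  # task mirrors the entries above. Estimate the serving model's peak VRAM
  # and add headroom; the name and number below are assumptions, not part of
  # the shipped schema.
  #
  # vram_budgets:
  #   resume_tailor: 5.5 # hypothetical: an 8B-class model (~5 GB) plus headroom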