- Rename user-facing env var CF_ORCH_URL → GPU_SERVER_URL with full backward-compat alias (closes #116). Priority chain: GPU_SERVER_URL → CF_ORCH_URL → orch.circuitforge.tech when CF_LICENSE_KEY present. Write-back to os.environ[CF_ORCH_URL] keeps all downstream callers unchanged.
- Add four task-routed llm.yaml backends (cf_cover_letter, cf_ats_rewrite, cf_job_research, cf_interview_prep) using cf_orch.product + cf_orch.task. Coordinator resolves model/node from assignments.yaml (closes #115).
- Update compose.yml, compose.cloud.yml, compose.test-cfcore.yml, .env.example to use GPU_SERVER_URL as primary documented var.
160 lines
4.7 KiB
Text
160 lines
4.7 KiB
Text
# LLM backend definitions, keyed by backend name. The *_fallback_order lists
# further down in this file refer to backends by these names.
backends:
  anthropic:
    api_key_env: ANTHROPIC_API_KEY
    enabled: false
    model: claude-sonnet-4-6
    type: anthropic
    supports_images: true
  claude_code:
    api_key: any
    base_url: http://localhost:3009/v1
    enabled: false
    model: claude-code-terminal
    type: openai_compat
    supports_images: true
  github_copilot:
    api_key: any
    base_url: http://localhost:3010/v1
    enabled: false
    model: gpt-4o
    type: openai_compat
    supports_images: false
  ollama:
    api_key: ollama
    base_url: http://ollama:11434/v1  # Docker service name; use localhost:11434 outside Docker
    enabled: true
    model: llama3.2:3b
    type: openai_compat
    supports_images: false
  ollama_research:
    api_key: ollama
    base_url: http://ollama:11434/v1  # Docker service name; use localhost:11434 outside Docker
    enabled: true
    model: llama3.2:3b
    type: openai_compat
    supports_images: false
  vllm:
    api_key: ''
    base_url: http://vllm:8000/v1  # Docker service name; use localhost:8000 outside Docker
    enabled: true
    model: __auto__
    type: openai_compat
    supports_images: false
  vision_service:
    base_url: http://localhost:8002
    enabled: false
    type: vision_service
    supports_images: true

  # ── cf-orch task-routed backends (preferred for GPU inference) ────────────
  # Use these when GPU_SERVER_URL is configured. The coordinator resolves
  # product+task → model_id → node via assignments.yaml; no model IDs needed here.
  # Set enabled: true once GPU_SERVER_URL is configured.
  cf_cover_letter:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1  # fallback when cf-orch is unavailable
    model: __auto__
    api_key: any
    supports_images: false
    cf_orch:
      product: peregrine
      task: cover_letter
      ttl_s: 3600

  cf_ats_rewrite:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1
    model: __auto__
    api_key: any
    supports_images: false
    cf_orch:
      product: peregrine
      task: ats_rewrite
      ttl_s: 3600

  cf_job_research:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1
    model: __auto__
    api_key: any
    supports_images: false
    cf_orch:
      product: peregrine
      task: job_research
      ttl_s: 3600

  cf_interview_prep:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1
    model: __auto__
    api_key: any
    supports_images: false
    cf_orch:
      product: peregrine
      task: interview_prep
      ttl_s: 3600

  # ── cf-orch trunk services (service-based, legacy) ─────────────────────────
  # Generic service allocation — use the task-routed backends above when possible.
  # Set GPU_SERVER_URL (env) or url below; leave enabled: false if cf-orch is
  # not deployed in your environment.
  cf_text:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1  # fallback when cf-orch is not available
    model: __auto__
    api_key: any
    supports_images: false
    cf_orch:
      service: cf-text
      # model_candidates: leave empty to use the service's default_model,
      # or specify an alias from the node's catalog (e.g. "qwen2.5-3b").
      model_candidates: []
      ttl_s: 3600

  cf_voice:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8009/v1  # fallback when cf-orch is not available
    model: __auto__
    api_key: any
    supports_images: false
    cf_orch:
      service: cf-voice
      model_candidates: []
      ttl_s: 3600

# Default backend chain; each entry is a key under `backends`.
# Presumably tried top to bottom until one is usable — confirm against the router.
fallback_order:
  - ollama
  - claude_code
  - vllm
  - github_copilot
  - anthropic
# Note: 'ollama' (alex-cover-writer) is intentionally excluded from
# research_fallback_order — research must never use the fine-tuned writer
# model, and this also avoids evicting the writer from GPU memory while a
# cover letter task is in flight.
# NOTE(review): the `ollama` backend in this file is set to llama3.2:3b, not an
# alex-cover-writer model — confirm the parenthetical above is still accurate.
research_fallback_order:
  - claude_code
  - vllm
  - ollama_research
  - github_copilot
  - anthropic
# Backend chain for image-capable requests; every entry here has
# supports_images: true in its backend definition.
vision_fallback_order:
  - vision_service
  - claude_code
  - anthropic

# ── Scheduler — LLM batch queue optimizer ─────────────────────────────────────
# The scheduler batches LLM tasks by model type to avoid GPU model switching.
# VRAM budgets are conservative peak estimates (GB) for each task type.
# Increase if your models are larger; decrease if tasks share GPU memory well.
scheduler:
  vram_budgets:
    cover_letter: 2.5      # alex-cover-writer:latest (~2GB GGUF + headroom)
    company_research: 5.0  # llama3.1:8b or vllm model
    wizard_generate: 2.5   # same model family as cover_letter
  # NOTE(review): assumed to be a scheduler-level setting (sibling of
  # vram_budgets) — confirm against the code that loads this section.
  max_queue_depth: 500  # max pending tasks per type before drops (with logged warning)