# peregrine/config/llm.yaml.example

backends:
  # Anthropic backend (type: anthropic); ships disabled in this example.
  anthropic:
    type: anthropic
    enabled: false
    model: claude-sonnet-4-6
    # presumably the name of the environment variable that holds the key,
    # not the key itself — confirm against the config loader
    api_key_env: ANTHROPIC_API_KEY
    supports_images: true
  # OpenAI-compatible endpoint on localhost:3009; ships disabled.
  claude_code:
    type: openai_compat
    enabled: false
    base_url: http://localhost:3009/v1
    model: claude-code-terminal
    # placeholder value — presumably the local endpoint does not check it; confirm
    api_key: any
    supports_images: true
  # OpenAI-compatible endpoint on localhost:3010; ships disabled.
  github_copilot:
    type: openai_compat
    enabled: false
    base_url: http://localhost:3010/v1
    model: gpt-4o
    # placeholder value — presumably the local endpoint does not check it; confirm
    api_key: any
    supports_images: false
  # Ollama reached through its /v1 OpenAI-compatible endpoint; enabled by
  # default and listed first in fallback_order.
  ollama:
    type: openai_compat
    enabled: true
    # Docker service name; use localhost:11434 outside Docker
    base_url: http://ollama:11434/v1
    model: llama3.2:3b
    api_key: ollama
    supports_images: false
  # Separate Ollama entry referenced by research_fallback_order in place of
  # the plain 'ollama' backend; enabled by default.
  ollama_research:
    type: openai_compat
    enabled: true
    # Docker service name; use localhost:11434 outside Docker
    base_url: http://ollama:11434/v1
    model: llama3.2:3b
    api_key: ollama
    supports_images: false
  # vLLM server reached through its /v1 OpenAI-compatible endpoint; enabled
  # by default.
  vllm:
    type: openai_compat
    enabled: true
    # Docker service name; use localhost:8000 outside Docker
    base_url: http://vllm:8000/v1
    # NOTE(review): __auto__ presumably means "use whatever model the server
    # reports" — confirm against the backend client
    model: __auto__
    api_key: ''
    supports_images: false
  # Image-capable backend with its own type; no model or api_key is set here.
  # Ships disabled and is listed first in vision_fallback_order.
  vision_service:
    type: vision_service
    enabled: false
    base_url: http://localhost:8002
    supports_images: true

  # ── cf-orch task-routed backends (preferred for GPU inference) ────────────
  # Use these when GPU_SERVER_URL is configured. The coordinator resolves
  # product+task → model_id → node via assignments.yaml; no model IDs needed here.
  # Set enabled: true once GPU_SERVER_URL is configured.
  # Task-routed backend for the peregrine "cover_letter" task; ships disabled.
  cf_cover_letter:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1   # fallback when cf-orch is unavailable
    model: __auto__
    api_key: any
    supports_images: false
    # product + task are the routing inputs the coordinator resolves to a
    # model and node (per the section note above).
    cf_orch:
      product: peregrine
      task: cover_letter
      ttl_s: 3600   # presumably a time-to-live in seconds — confirm

  # Task-routed backend for the peregrine "ats_rewrite" task; ships disabled.
  cf_ats_rewrite:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1   # fallback when cf-orch is unavailable (as for cf_cover_letter)
    model: __auto__
    api_key: any
    supports_images: false
    # product + task are the routing inputs the coordinator resolves to a
    # model and node (per the section note above).
    cf_orch:
      product: peregrine
      task: ats_rewrite
      ttl_s: 3600   # presumably a time-to-live in seconds — confirm

  # Task-routed backend for the peregrine "job_research" task; ships disabled.
  cf_job_research:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1   # fallback when cf-orch is unavailable (as for cf_cover_letter)
    model: __auto__
    api_key: any
    supports_images: false
    # product + task are the routing inputs the coordinator resolves to a
    # model and node (per the section note above).
    cf_orch:
      product: peregrine
      task: job_research
      ttl_s: 3600   # presumably a time-to-live in seconds — confirm

  # Task-routed backend for the peregrine "interview_prep" task; ships disabled.
  cf_interview_prep:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1   # fallback when cf-orch is unavailable (as for cf_cover_letter)
    model: __auto__
    api_key: any
    supports_images: false
    # product + task are the routing inputs the coordinator resolves to a
    # model and node (per the section note above).
    cf_orch:
      product: peregrine
      task: interview_prep
      ttl_s: 3600   # presumably a time-to-live in seconds — confirm

  # ── cf-orch trunk services (service-based, legacy) ─────────────────────────
  # Generic service allocation — use the task-routed backends above when possible.
  # Set GPU_SERVER_URL (env) or url below; leave enabled: false if cf-orch is
  # not deployed in your environment.
  # Service-based (legacy) allocation against the "cf-text" service; ships
  # disabled.
  cf_text:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8008/v1   # fallback when cf-orch is not available
    model: __auto__
    api_key: any
    supports_images: false
    # Selects by service name rather than by product + task.
    cf_orch:
      service: cf-text
      # model_candidates: leave empty to use the service's default_model,
      # or specify an alias from the node's catalog (e.g. "qwen2.5-3b").
      model_candidates: []
      ttl_s: 3600   # presumably a time-to-live in seconds — confirm

  # Service-based (legacy) allocation against the "cf-voice" service; ships
  # disabled. Note the fallback port differs from cf_text (8009 vs 8008).
  cf_voice:
    type: openai_compat
    enabled: false
    base_url: http://localhost:8009/v1   # fallback when cf-orch is not available
    model: __auto__
    api_key: any
    supports_images: false
    # Selects by service name rather than by product + task.
    cf_orch:
      service: cf-voice
      # empty list: per the cf_text note, this uses the service's default_model
      model_candidates: []
      ttl_s: 3600   # presumably a time-to-live in seconds — confirm

# Backend preference lists. Every entry names a key under `backends`.
fallback_order:
- ollama
- claude_code
- vllm
- github_copilot
- anthropic

# Note: 'ollama' (alex-cover-writer) intentionally excluded — research
# must never use the fine-tuned writer model, and this also avoids evicting
# the writer from GPU memory while a cover letter task is in flight.
# 'ollama_research' is listed in its place.
research_fallback_order:
- claude_code
- vllm
- ollama_research
- github_copilot
- anthropic

# Only backends declared with supports_images: true appear here.
vision_fallback_order:
- vision_service
- claude_code
- anthropic

# ── Scheduler — LLM batch queue optimizer ─────────────────────────────────────
# Queued LLM tasks are batched by model type so the GPU is not forced to keep
# switching models. Each vram_budgets entry is a conservative peak estimate in
# GB for that task type: raise it if your models are larger, lower it if tasks
# share GPU memory well.
scheduler:
  # max pending tasks per type before drops (with logged warning)
  max_queue_depth: 500
  vram_budgets:
    # alex-cover-writer:latest (~2GB GGUF + headroom)
    cover_letter: 2.5
    # llama3.1:8b or vllm model
    company_research: 5.0
    # same model family as cover_letter
    wizard_generate: 2.5