From e3547cd998378b1c8f22d2ed97f9c20b8efd3624 Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Sun, 15 Mar 2026 03:28:26 -0700
Subject: [PATCH] docs(config): add scheduler VRAM budget config to llm.yaml.example

---
 config/llm.yaml.example | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/config/llm.yaml.example b/config/llm.yaml.example
index 5b006ef..a42a25a 100644
--- a/config/llm.yaml.example
+++ b/config/llm.yaml.example
@@ -64,3 +64,14 @@ vision_fallback_order:
 # Note: 'ollama' (alex-cover-writer) intentionally excluded — research
 # must never use the fine-tuned writer model, and this also avoids evicting
 # the writer from GPU memory while a cover letter task is in flight.
+
+# ── Scheduler — LLM batch queue optimizer ─────────────────────────────────────
+# Groups queued LLM tasks by model type so the GPU avoids switching models.
+# Each VRAM budget is a conservative peak estimate, in GB, for one task type.
+# Raise a budget for larger models; lower it if tasks share GPU memory well.
+scheduler:
+  vram_budgets:
+    cover_letter: 2.5      # alex-cover-writer:latest (~2 GB GGUF plus headroom)
+    company_research: 5.0  # llama3.1:8b or a vLLM model
+    wizard_generate: 2.5   # same model family as cover_letter
+  max_queue_depth: 500     # per-type cap on pending tasks; excess is dropped with a logged warning