From af1ffa1d9436f7a39f31ed996d4c3feddec2ac8c Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Tue, 14 Apr 2026 13:23:44 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20wire=20Search=20with=20AI=20to=20cf-orc?= =?UTF-8?q?h=20=E2=86=92=20Ollama=20(llama3.1:8b)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add app/llm/router.py shim — tri-level config lookup: repo config/llm.yaml → ~/.config/circuitforge/llm.yaml → env vars - Add config/llm.cloud.yaml — ollama via cf-orch, llama3.1:8b - Add config/llm.yaml.example — self-hosted reference config - compose.cloud.yml: mount llm.cloud.yaml, set CF_ORCH_URL, add host.docker.internal:host-gateway (required on Linux Docker) - api/main.py: use app.llm.router.LLMRouter (shim) not core directly - .env.example: update LLM section to reference config/llm.yaml.example - .gitignore: exclude config/llm.yaml (keep example + cloud yaml) End-to-end tested: 3.2s for "used RTX 3080 under $400, no mining cards" via cloud container → host.docker.internal:11434 → Ollama llama3.1:8b --- .env.example | 12 +++++++---- .gitignore | 1 + api/main.py | 2 +- app/llm/router.py | 36 +++++++++++++++++++++++++++++++++ compose.cloud.yml | 6 ++++++ config/llm.cloud.yaml | 38 ++++++++++++++++++++++++++++++++++ config/llm.yaml.example | 45 +++++++++++++++++++++++++++++++++++++++++ 7 files changed, 135 insertions(+), 5 deletions(-) create mode 100644 app/llm/router.py create mode 100644 config/llm.cloud.yaml create mode 100644 config/llm.yaml.example diff --git a/.env.example b/.env.example index e1c674d..9ee65a4 100644 --- a/.env.example +++ b/.env.example @@ -54,13 +54,17 @@ SNIPE_DB=data/snipe.db # own ID; the CF cloud instance uses CF's campaign ID (disclosed in the UI). # EBAY_AFFILIATE_CAMPAIGN_ID= -# ── LLM inference (vision / photo analysis) ────────────────────────────────── -# circuitforge-core LLMRouter auto-detects backends from these env vars -# (no llm.yaml required). 
# app/llm/router.py
# BSL 1.1 License
"""
Snipe LLMRouter shim — tri-level config path priority.

Config lookup order:
  1. <repo_root>/config/llm.yaml      — per-install local override
  2. ~/.config/circuitforge/llm.yaml  — user-level config (circuitforge-core default)
  3. env-var auto-config (ANTHROPIC_API_KEY, OPENAI_API_KEY, OLLAMA_HOST, CF_ORCH_URL)
"""
from pathlib import Path

from circuitforge_core.llm import LLMRouter as _CoreLLMRouter

# app/llm/router.py → three parents up is the repo root, so this resolves
# <repo_root>/config/llm.yaml regardless of the process working directory.
_REPO_CONFIG = Path(__file__).parent.parent.parent / "config" / "llm.yaml"
_USER_CONFIG = Path.home() / ".config" / "circuitforge" / "llm.yaml"


class LLMRouter(_CoreLLMRouter):
    """Snipe-specific LLMRouter with tri-level config resolution.

    Explicit ``config_path`` bypasses the lookup (useful in tests).
    """

    def __init__(self, config_path: Path | None = None) -> None:
        # An explicit path always wins — pass it straight through, even if it
        # does not exist, so circuitforge-core surfaces the error rather than
        # this shim silently falling back to another config.
        if config_path is not None:
            super().__init__(config_path)
            return

        # First existing config wins: repo-local override, then user-level.
        for candidate in (_REPO_CONFIG, _USER_CONFIG):
            if candidate.exists():
                super().__init__(candidate)
                return

        # No yaml anywhere — let circuitforge-core env-var auto-config handle it.
        super().__init__()
Personal fine-tunes and local-only backends +# (claude_code, copilot) are intentionally excluded here. +# +# CF Orchestrator routes both ollama and vllm allocations for VRAM-aware +# scheduling. CF_ORCH_URL must be set in .env for allocations to resolve; +# if cf-orch is unreachable the backend falls back to its static base_url. +# +# Model choice for query builder: llama3.1:8b +# - Reliable instruction following and JSON output +# - No creative fine-tuning drift (unlike writer models in the pool) +# - Fits comfortably in 8 GB VRAM alongside other services + +backends: + ollama: + type: openai_compat + base_url: http://host.docker.internal:11434/v1 + api_key: ollama + model: llama3.1:8b + enabled: true + supports_images: false + cf_orch: + service: ollama + ttl_s: 300 + + anthropic: + type: anthropic + api_key_env: ANTHROPIC_API_KEY + model: claude-haiku-4-5-20251001 + enabled: false + supports_images: false + +fallback_order: + - ollama + - anthropic diff --git a/config/llm.yaml.example b/config/llm.yaml.example new file mode 100644 index 0000000..23a0972 --- /dev/null +++ b/config/llm.yaml.example @@ -0,0 +1,45 @@ +# config/llm.yaml.example +# Snipe — LLM backend configuration +# +# Copy to config/llm.yaml and edit for your setup. +# The query builder ("Search with AI") uses the text fallback_order. +# +# Backends are tried in fallback_order until one succeeds. +# Set enabled: false to skip a backend without removing it. +# +# CF Orchestrator (cf-orch): when CF_ORCH_URL is set in the environment and a +# backend has a cf_orch block, allocations are routed through cf-orch for +# VRAM-aware scheduling. Omit cf_orch to hit the backend directly. 
+ +backends: + anthropic: + type: anthropic + api_key_env: ANTHROPIC_API_KEY + model: claude-haiku-4-5-20251001 + enabled: false + supports_images: false + + openai: + type: openai_compat + base_url: https://api.openai.com/v1 + api_key_env: OPENAI_API_KEY + model: gpt-4o-mini + enabled: false + supports_images: false + + ollama: + type: openai_compat + base_url: http://localhost:11434/v1 + api_key: ollama + model: llama3.1:8b + enabled: true + supports_images: false + # Uncomment to route through cf-orch for VRAM-aware scheduling: + # cf_orch: + # service: ollama + # ttl_s: 300 + +fallback_order: + - anthropic + - openai + - ollama