From ad27467026265bbba7dd428687d739b1fb449bd8 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Sun, 14 Jun 2026 12:15:16 -0700
Subject: [PATCH] chore(infra): add mnemo service stub to compose.yml
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

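Pre-existing local development addition — mnemo vector memory service
placeholder for future integration work (opt-in via the compose `memory`
profile).

This patch also carries two backend changes:
- cloud_session.py: remove the USER_WRITING_MODELS env-var override; the
  per-user writing model now comes only from custom_writing_model in
  Heimdall's resolve response.
- llm.py: add cf-orch task-based routing (POST /api/inference/task) through
  a new optional task_name argument to complete(), falling back to
  model_candidates routing when task routing fails.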
---
 compose.yml                                   | 27 ++++++
 .../apps/backend/app/cloud_session.py         | 38 ++------
 resume_matcher/apps/backend/app/llm.py        | 88 ++++++++++++++++++-
 3 files changed, 119 insertions(+), 34 deletions(-)

diff --git a/compose.yml b/compose.yml
index 18c8860..0cc6c39 100644
--- a/compose.yml
+++ b/compose.yml
@@ -23,6 +23,8 @@ services:
       - GPU_SERVER_URL=${GPU_SERVER_URL:-${CF_ORCH_URL:-http://host.docker.internal:7700}}
       - CF_ORCH_URL=${CF_ORCH_URL:-${GPU_SERVER_URL:-http://host.docker.internal:7700}}
       - CF_APP_NAME=peregrine
+      - MNEMO_HOST=${MNEMO_HOST:-mnemo}
+      - MNEMO_PORT=${MNEMO_PORT:-8080}
       - PYTHONUNBUFFERED=1
     extra_hosts:
       - "host.docker.internal:host-gateway"
@@ -116,6 +118,28 @@ services:
     profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
     restart: unless-stopped
 
+  mnemo:  # vector memory service stub; only started with the "memory" profile
+    image: ghcr.io/zaydmulani09/mnemo:latest
+    ports:
+      - "${MNEMO_PUBLISH_PORT:-8080}:8080"  # host-side port only; in-network clients (MNEMO_HOST=mnemo) always use 8080
+    volumes:
+      - mnemo-data:/data
+    environment:
+      - MNEMO_DB_PATH=/data/mnemo.db
+      - MNEMO_LLM_PROVIDER=${MNEMO_LLM_PROVIDER:-ollama}
+      - MNEMO_LLM_BASE_URL=${MNEMO_LLM_BASE_URL:-http://ollama:11434/v1}
+      - MNEMO_LLM_API_KEY=${MNEMO_LLM_API_KEY:-ollama}
+      - MNEMO_LLM_MODEL=${MNEMO_LLM_MODEL:-llama3.2:3b}
+    depends_on:
+      - ollama  # NOTE(review): confirm ollama's profile is active whenever "memory" is enabled
+    healthcheck:
+      test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"]
+      interval: 15s
+      timeout: 5s
+      retries: 3
+    profiles: [memory]
+    restart: unless-stopped
+
   finetune:
     build:
       context: .
@@ -131,3 +155,6 @@ services:
       - OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama
     profiles: [finetune]
     restart: "no"
+
+volumes:
+  mnemo-data:
diff --git a/resume_matcher/apps/backend/app/cloud_session.py b/resume_matcher/apps/backend/app/cloud_session.py
index 8a39240..1e8dfc8 100644
--- a/resume_matcher/apps/backend/app/cloud_session.py
+++ b/resume_matcher/apps/backend/app/cloud_session.py
@@ -10,23 +10,15 @@ Usage — add to main.py once:
     from app.cloud_session import session_middleware_dep
     app = FastAPI(..., dependencies=[Depends(session_middleware_dep)])
 
-From that point, any route (and every service/llm function it calls)
-has access to the current user context via llm.get_request_*() helpers.
-
-Writing model resolution order (first match wins):
-  1. USER_WRITING_MODELS env var  — JSON dict mapping Directus UUID → model name
-     e.g. USER_WRITING_MODELS={"5b99ca9f-...": "meghan-letter-writer:latest"}
-     Use this for Monday; no Heimdall changes required.
-  2. session.meta["custom_writing_model"]  — returned by Heimdall resolve endpoint
-     once Heimdall is updated to expose user_preferences fields.
+Writing model is resolved from Heimdall's resolve response (user_preferences
+JSON column, projected as custom_writing_model in the response).  Assign models
+via the admin UI at /account/admin/model-assignments.
 """
 from __future__ import annotations
 
-import json
 import logging
-import os
 
-from fastapi import Depends, Request, Response
+from fastapi import Request, Response
 
 from circuitforge_core.cloud_session import CloudSessionFactory, CloudUser, detect_byok
 
@@ -34,21 +26,6 @@ log = logging.getLogger(__name__)
 
 __all__ = ["CloudUser", "get_session", "require_tier", "session_middleware_dep"]
 
-# JSON dict mapping Directus user UUID → custom writing model name.
-# Used until Heimdall's resolve endpoint exposes user_preferences.
-def _load_user_writing_models() -> dict[str, str]:
-    raw = os.environ.get("USER_WRITING_MODELS", "").strip()
-    if not raw:
-        return {}
-    try:
-        return json.loads(raw)
-    except json.JSONDecodeError:
-        log.warning("USER_WRITING_MODELS is not valid JSON — ignoring")
-        return {}
-
-_USER_WRITING_MODELS: dict[str, str] = _load_user_writing_models()
-
-
 _factory = CloudSessionFactory(
     product="peregrine",
     byok_detector=detect_byok,
@@ -81,9 +58,4 @@ def session_middleware_dep(request: Request, response: Response) -> None:
 
     set_request_user_id(user_id)
     set_request_tier(session.tier)
-    # Resolution order: env-var map (Monday path) → Heimdall meta (future path)
-    writing_model = (
-        _USER_WRITING_MODELS.get(session.user_id)
-        or session.meta.get("custom_writing_model")
-    )
-    set_request_writing_model(writing_model)
+    set_request_writing_model(session.meta.get("custom_writing_model") or None)
diff --git a/resume_matcher/apps/backend/app/llm.py b/resume_matcher/apps/backend/app/llm.py
index 13b3cff..ca7ebac 100644
--- a/resume_matcher/apps/backend/app/llm.py
+++ b/resume_matcher/apps/backend/app/llm.py
@@ -152,6 +152,62 @@ async def _allocate_orch_async(
                 logging.debug("cf-orch release failed (non-fatal): %s", exc)
 
 
+@asynccontextmanager
+async def _allocate_by_task(
+    coordinator_url: str,
+    product: str,
+    task: str,
+    ttl_s: float,
+    caller: str,
+):
+    """Allocate via the task-model assignment layer (POST /api/inference/task).
+
+    Resolves product+task → model_id → service+node automatically and yields
+    the allocation; it is released with a best-effort DELETE on exit.  Every
+    allocation failure (404 = no assignment, other non-2xx, transport error,
+    malformed body) raises RuntimeError so the caller can fall back.
+    """
+    base = coordinator_url.rstrip("/")
+    inner: dict[str, Any] = {"ttl_s": ttl_s, "caller": caller}
+    uid = get_request_user_id()
+    if uid:
+        inner["user_id"] = uid
+    payload = {"product": product, "task": task, "payload": inner}
+    async with httpx.AsyncClient(timeout=120.0) as client:
+        try:
+            resp = await client.post(f"{base}/api/inference/task", json=payload)
+        except httpx.HTTPError as exc:  # connect/timeout errors are not RuntimeError
+            raise RuntimeError(
+                f"cf-orch task allocation failed for {product}/{task}: {exc!r}"
+            ) from exc
+        if resp.status_code == 404:
+            raise RuntimeError(
+                f"No task assignment for product={product!r} task={task!r}; "
+                "falling back to model_candidates routing"
+            )
+        if not resp.is_success:
+            raise RuntimeError(
+                f"cf-orch task allocation failed for {product}/{task}: "
+                f"HTTP {resp.status_code} — {resp.text[:200]}"
+            )
+        try:
+            data = resp.json()
+            service = data.get("service_type", "vllm")
+            alloc = _OrchAllocation(
+                allocation_id=data["allocation_id"], url=data["url"], service=service
+            )
+        except (ValueError, KeyError, TypeError, AttributeError) as exc:
+            raise RuntimeError(f"malformed cf-orch allocation for {product}/{task}") from exc
+        release_url = f"{base}/api/services/{service}/allocations/{alloc.allocation_id}"
+        try:
+            yield alloc
+        finally:
+            try:
+                await client.delete(release_url, timeout=10.0)
+            except Exception as exc:
+                logging.debug("cf-orch task release failed (non-fatal): %s", exc)
+
+
 def _normalize_api_base(provider: str, api_base: str | None) -> str | None:
     """Normalize api_base for LiteLLM provider-specific expectations.
 
@@ -497,11 +553,41 @@ async def complete(
     config: LLMConfig | None = None,
     max_tokens: int = 4096,
     temperature: float = 0.7,
+    task_name: str | None = None,
 ) -> str:
-    """Make a completion request to the LLM."""
+    """Make a completion request to the LLM.
+
+    When task_name is provided and CF_ORCH_URL is set, routing is resolved via
+    the task-model assignment layer (POST /api/inference/task) instead of using
+    hardcoded model_candidates.  Falls back to model_candidates routing if the
+    assignment is missing, then to the default config if cf-orch is unavailable.
+    """
     if config is None:
         cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip()
         if cf_orch_url:
+            # Task-routing path: preferred when a task name is known.
+            if task_name:
+                try:
+                    async with _allocate_by_task(
+                        cf_orch_url,
+                        product="peregrine",
+                        task=task_name,
+                        ttl_s=300.0,
+                        caller="peregrine-resume-matcher",
+                    ) as alloc:
+                        orch_config = LLMConfig(
+                            provider="openai",
+                            model="__auto__",
+                            api_key="any",
+                            api_base=alloc.url.rstrip("/") + "/v1",
+                        )
+                        return await complete(prompt, system_prompt, orch_config, max_tokens, temperature)
+                except RuntimeError as exc:
+                    logging.warning(
+                        "cf-orch task routing failed for %r, falling back to model_candidates: %s",
+                        task_name, exc,
+                    )
+            # Model-candidates path: legacy routing or task fallback.
             try:
                 # Premium/ultra users get their personal fine-tuned writing model as the
                 # first candidate; the base model is the fallback so cf-orch can