chore(infra): add mnemo service stub to compose.yml

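Pre-existing local development addition: adds a placeholder for the mnemo vector memory service to compose.yml, for future integration work. This commit also touches cloud_session.py (writing-model resolution) and llm.py (task-based cf-orch routing).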
2026-06-14 12:15:16 -07:00 · 2026-06-14 12:15:16 -07:00 · ad27467026
commit ad27467026
parent 3048d8e2f4
3 changed files with 119 additions and 34 deletions
--- a/compose.yml
+++ b/compose.yml
@ -23,6 +23,8 @@ services:
      - GPU_SERVER_URL=${GPU_SERVER_URL:-${CF_ORCH_URL:-http://host.docker.internal:7700}}
      - CF_ORCH_URL=${CF_ORCH_URL:-${GPU_SERVER_URL:-http://host.docker.internal:7700}}
      - CF_APP_NAME=peregrine
      - MNEMO_HOST=${MNEMO_HOST:-mnemo}
      - MNEMO_PORT=${MNEMO_PORT:-8080}
      - PYTHONUNBUFFERED=1
    extra_hosts:
      - "host.docker.internal:host-gateway"
@ -116,6 +118,28 @@ services:
    profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
    restart: unless-stopped
  # mnemo service definition. Only started when the `memory` profile is enabled.
  mnemo:
    image: ghcr.io/zaydmulani09/mnemo:latest
    ports:
      # Host port is configurable via MNEMO_PORT; the container port is fixed at 8080.
      # NOTE(review): MNEMO_PORT is also passed to another service's environment in
      # this file (alongside MNEMO_HOST). Overriding it changes only the host-side
      # port here, so a client reaching this service by name would still need
      # 8080 — confirm intent.
      - "${MNEMO_PORT:-8080}:8080"
    volumes:
      # Named volume mounted at /data, where MNEMO_DB_PATH points.
      - mnemo-data:/data
    environment:
      - MNEMO_DB_PATH=/data/mnemo.db
      # LLM backend settings; the defaults point at the `ollama` service on port 11434.
      - MNEMO_LLM_PROVIDER=${MNEMO_LLM_PROVIDER:-ollama}
      - MNEMO_LLM_BASE_URL=${MNEMO_LLM_BASE_URL:-http://ollama:11434/v1}
      - MNEMO_LLM_API_KEY=${MNEMO_LLM_API_KEY:-ollama}
      - MNEMO_LLM_MODEL=${MNEMO_LLM_MODEL:-llama3.2:3b}
    # NOTE(review): `ollama` must be enabled in the same run for this dependency
    # to resolve; check which profiles it belongs to — confirm.
    depends_on:
      - ollama
    healthcheck:
      # Probes /health on the container port.
      # NOTE(review): assumes `wget` is available inside the image — confirm.
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"]
      interval: 15s
      timeout: 5s
      retries: 3
    profiles: [memory]
    restart: unless-stopped
  finetune:
    build:
      context: .
@ -131,3 +155,6 @@ services:
      - OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama
    profiles: [finetune]
    restart: "no"
 volumes:
  mnemo-data:
--- a/resume_matcher/apps/backend/app/cloud_session.py
+++ b/resume_matcher/apps/backend/app/cloud_session.py
@ -10,23 +10,15 @@ Usage — add to main.py once:
    from app.cloud_session import session_middleware_dep
    app = FastAPI(..., dependencies=[Depends(session_middleware_dep)])
-From that point, any route (and every service/llm function it calls)
+Writing model is resolved from Heimdall's resolve response (user_preferences
-has access to the current user context via llm.get_request_*() helpers.
+JSON column, projected as custom_writing_model in the response).  Assign models
-
+via the admin UI at /account/admin/model-assignments.
 Writing model resolution order (first match wins):
  1. USER_WRITING_MODELS env var  — JSON dict mapping Directus UUID → model name
     e.g. USER_WRITING_MODELS={"5b99ca9f-...": "meghan-letter-writer:latest"}
     Use this for Monday; no Heimdall changes required.
  2. session.meta["custom_writing_model"]  — returned by Heimdall resolve endpoint
     once Heimdall is updated to expose user_preferences fields.
 """
 from __future__ import annotations
 import json
 import logging
 import os
-from fastapi import Depends, Request, Response
+from fastapi import Request, Response
 from circuitforge_core.cloud_session import CloudSessionFactory, CloudUser, detect_byok
@ -34,21 +26,6 @@ log = logging.getLogger(__name__)
 __all__ = ["CloudUser", "get_session", "require_tier", "session_middleware_dep"]
 # JSON dict mapping Directus user UUID → custom writing model name.
 # Used until Heimdall's resolve endpoint exposes user_preferences.
 def _load_user_writing_models() -> dict[str, str]:
     """Parse the USER_WRITING_MODELS env var into a user-id -> model-name map.

     The variable is expected to hold a JSON object whose keys are user UUIDs
     and whose values are writing model names.

     Returns:
         The parsed mapping. An empty dict is returned when the variable is
         unset, blank, not valid JSON, or valid JSON that is not an object.
         Entries whose value is not a non-empty string are dropped.
     """
     raw = os.environ.get("USER_WRITING_MODELS", "").strip()
     if not raw:
         return {}

     try:
         parsed = json.loads(raw)
     except json.JSONDecodeError:
         log.warning("USER_WRITING_MODELS is not valid JSON — ignoring")
         return {}

     # Valid JSON is not necessarily an object: a list, string or number would
     # break the later ``.get(user_id)`` lookup, so reject it up front.
     if not isinstance(parsed, dict):
         log.warning("USER_WRITING_MODELS must be a JSON object — ignoring")
         return {}

     # Keep only usable model names; a null/number/empty value can never be a
     # valid model, so it is treated the same as a missing entry.
     models = {
         user_id: model
         for user_id, model in parsed.items()
         if isinstance(model, str) and model
     }
     if len(models) != len(parsed):
         log.warning(
             "USER_WRITING_MODELS contains entries without a string model name — skipping them"
         )
     return models
 _USER_WRITING_MODELS: dict[str, str] = _load_user_writing_models()
 _factory = CloudSessionFactory(
    product="peregrine",
    byok_detector=detect_byok,
@ -81,9 +58,4 @@ def session_middleware_dep(request: Request, response: Response) -> None:
    set_request_user_id(user_id)
    set_request_tier(session.tier)
-    # Resolution order: env-var map (Monday path) → Heimdall meta (future path)
+    set_request_writing_model(session.meta.get("custom_writing_model") or None)
    writing_model = (
        _USER_WRITING_MODELS.get(session.user_id)
        or session.meta.get("custom_writing_model")
    )
    set_request_writing_model(writing_model)
--- a/resume_matcher/apps/backend/app/llm.py
+++ b/resume_matcher/apps/backend/app/llm.py
@ -152,6 +152,62 @@ async def _allocate_orch_async(
                logging.debug("cf-orch release failed (non-fatal): %s", exc)
@asynccontextmanager
async def _allocate_by_task(
    coordinator_url: str,
    product: str,
    task: str,
    ttl_s: float,
    caller: str,
):
    """Allocate via the task-model assignment layer (POST /api/inference/task).

    The coordinator resolves product+task -> model_id -> service+node. The
    allocation is released (best-effort) when the context exits.

    Args:
        coordinator_url: Base URL of the cf-orch coordinator; a trailing
            slash is tolerated.
        product: Product name sent in the request body.
        task: Task name sent in the request body.
        ttl_s: Allocation time-to-live in seconds, forwarded in the payload.
        caller: Caller identifier, forwarded in the payload.

    Yields:
        An ``_OrchAllocation`` built from the coordinator's response
        (``allocation_id``, ``url`` and ``service_type``, defaulting to
        ``"vllm"``).

    Raises:
        RuntimeError: If the coordinator cannot be reached, returns 404 (no
            assignment), returns any other non-success status, or returns a
            body that is not JSON or lacks the expected fields. Every
            allocation failure is reported as RuntimeError so the caller can
            fall back to model_candidates routing with a single ``except``.
    """
    base_url = coordinator_url.rstrip("/")
    payload: dict[str, Any] = {
        "product": product,
        "task": task,
        "payload": {"ttl_s": ttl_s, "caller": caller},
    }
    uid = get_request_user_id()
    if uid:
        payload["payload"]["user_id"] = uid

    async with httpx.AsyncClient(timeout=120.0) as client:
        # Transport failures (connect errors, timeouts, ...) are converted to
        # RuntimeError; otherwise they would bypass the caller's fallback.
        try:
            resp = await client.post(f"{base_url}/api/inference/task", json=payload)
        except httpx.HTTPError as exc:
            raise RuntimeError(
                f"cf-orch task allocation request failed for {product}/{task}: {exc}"
            ) from exc

        if resp.status_code == 404:
            raise RuntimeError(
                f"No task assignment for product={product!r} task={task!r}; "
                "falling back to model_candidates routing"
            )
        if not resp.is_success:
            raise RuntimeError(
                f"cf-orch task allocation failed for {product}/{task}: "
                f"HTTP {resp.status_code} — {resp.text[:200]}"
            )

        # A success status with an unusable body is still an allocation failure.
        try:
            data = resp.json()
            allocation_id = data["allocation_id"]
            url = data["url"]
            service = data.get("service_type", "vllm")
        except (ValueError, KeyError, TypeError) as exc:
            raise RuntimeError(
                f"cf-orch task allocation for {product}/{task} returned a "
                f"malformed response: {exc!r}"
            ) from exc

        alloc = _OrchAllocation(
            allocation_id=allocation_id,
            url=url,
            service=service,
        )
        try:
            yield alloc
        finally:
            # Release is best-effort: a failed release must not mask the
            # result (or exception) of the caller's body.
            try:
                await client.delete(
                    f"{base_url}/api/services/{service}/allocations/{alloc.allocation_id}",
                    timeout=10.0,
                )
            except Exception as exc:
                logging.debug("cf-orch task release failed (non-fatal): %s", exc)
 def _normalize_api_base(provider: str, api_base: str | None) -> str | None:
    """Normalize api_base for LiteLLM provider-specific expectations.
@ -497,11 +553,41 @@ async def complete(
    config: LLMConfig | None = None,
    max_tokens: int = 4096,
    temperature: float = 0.7,
    task_name: str | None = None,
 ) -> str:
-    """Make a completion request to the LLM."""
+    """Make a completion request to the LLM.
    When task_name is provided and CF_ORCH_URL is set, routing is resolved via
    the task-model assignment layer (POST /api/inference/task) instead of using
    hardcoded model_candidates.  Falls back to model_candidates routing if the
    assignment is missing, then to the default config if cf-orch is unavailable.
    """
    if config is None:
        cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip()
        if cf_orch_url:
            # Task-routing path: preferred when a task name is known.
            if task_name:
                try:
                    async with _allocate_by_task(
                        cf_orch_url,
                        product="peregrine",
                        task=task_name,
                        ttl_s=300.0,
                        caller="peregrine-resume-matcher",
                    ) as alloc:
                        orch_config = LLMConfig(
                            provider="openai",
                            model="__auto__",
                            api_key="any",
                            api_base=alloc.url.rstrip("/") + "/v1",
                        )
                        return await complete(prompt, system_prompt, orch_config, max_tokens, temperature)
                except RuntimeError as exc:
                    logging.warning(
                        "cf-orch task routing failed for %r, falling back to model_candidates: %s",
                        task_name, exc,
                    )
            # Model-candidates path: legacy routing or task fallback.
            try:
                # Premium/ultra users get their personal fine-tuned writing model as the
                # first candidate; the base model is the fallback so cf-orch can