From ad27467026265bbba7dd428687d739b1fb449bd8 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Sun, 14 Jun 2026 12:15:16 -0700 Subject: [PATCH] chore(infra): add mnemo service stub to compose.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-existing local development addition — mnemo vector memory service placeholder for future integration work. Note that this patch also removes the USER_WRITING_MODELS env-var override from cloud_session.py (the writing model is now taken only from Heimdall's custom_writing_model) and adds task-based cf-orch routing (_allocate_by_task and the task_name parameter of complete) to llm.py. --- compose.yml | 27 ++++++ .../apps/backend/app/cloud_session.py | 38 ++------ resume_matcher/apps/backend/app/llm.py | 88 ++++++++++++++++++- 3 files changed, 119 insertions(+), 34 deletions(-) diff --git a/compose.yml b/compose.yml index 18c8860..0cc6c39 100644 --- a/compose.yml +++ b/compose.yml @@ -23,6 +23,8 @@ services: - GPU_SERVER_URL=${GPU_SERVER_URL:-${CF_ORCH_URL:-http://host.docker.internal:7700}} - CF_ORCH_URL=${CF_ORCH_URL:-${GPU_SERVER_URL:-http://host.docker.internal:7700}} - CF_APP_NAME=peregrine + - MNEMO_HOST=${MNEMO_HOST:-mnemo} + - MNEMO_PORT=${MNEMO_PORT:-8080} - PYTHONUNBUFFERED=1 extra_hosts: - "host.docker.internal:host-gateway" @@ -116,6 +118,28 @@ services: profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed] restart: unless-stopped + mnemo: + image: ghcr.io/zaydmulani09/mnemo:latest + ports: + - "${MNEMO_PORT:-8080}:8080" + volumes: + - mnemo-data:/data + environment: + - MNEMO_DB_PATH=/data/mnemo.db + - MNEMO_LLM_PROVIDER=${MNEMO_LLM_PROVIDER:-ollama} + - MNEMO_LLM_BASE_URL=${MNEMO_LLM_BASE_URL:-http://ollama:11434/v1} + - MNEMO_LLM_API_KEY=${MNEMO_LLM_API_KEY:-ollama} + - MNEMO_LLM_MODEL=${MNEMO_LLM_MODEL:-llama3.2:3b} + depends_on: + - ollama + healthcheck: + test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"] + interval: 15s + timeout: 5s + retries: 3 + profiles: [memory] + restart: unless-stopped + finetune: build: context: . 
@@ -131,3 +155,6 @@ services: - OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama profiles: [finetune] restart: "no" + +volumes: + mnemo-data: diff --git a/resume_matcher/apps/backend/app/cloud_session.py b/resume_matcher/apps/backend/app/cloud_session.py index 8a39240..1e8dfc8 100644 --- a/resume_matcher/apps/backend/app/cloud_session.py +++ b/resume_matcher/apps/backend/app/cloud_session.py @@ -10,23 +10,15 @@ Usage — add to main.py once: from app.cloud_session import session_middleware_dep app = FastAPI(..., dependencies=[Depends(session_middleware_dep)]) -From that point, any route (and every service/llm function it calls) -has access to the current user context via llm.get_request_*() helpers. - -Writing model resolution order (first match wins): - 1. USER_WRITING_MODELS env var — JSON dict mapping Directus UUID → model name - e.g. USER_WRITING_MODELS={"5b99ca9f-...": "meghan-letter-writer:latest"} - Use this for Monday; no Heimdall changes required. - 2. session.meta["custom_writing_model"] — returned by Heimdall resolve endpoint - once Heimdall is updated to expose user_preferences fields. +Writing model is resolved from Heimdall's resolve response (user_preferences +JSON column, projected as custom_writing_model in the response). Assign models +via the admin UI at /account/admin/model-assignments. """ from __future__ import annotations -import json import logging -import os -from fastapi import Depends, Request, Response +from fastapi import Request, Response from circuitforge_core.cloud_session import CloudSessionFactory, CloudUser, detect_byok @@ -34,21 +26,6 @@ log = logging.getLogger(__name__) __all__ = ["CloudUser", "get_session", "require_tier", "session_middleware_dep"] -# JSON dict mapping Directus user UUID → custom writing model name. -# Used until Heimdall's resolve endpoint exposes user_preferences. 
-def _load_user_writing_models() -> dict[str, str]: - raw = os.environ.get("USER_WRITING_MODELS", "").strip() - if not raw: - return {} - try: - return json.loads(raw) - except json.JSONDecodeError: - log.warning("USER_WRITING_MODELS is not valid JSON — ignoring") - return {} - -_USER_WRITING_MODELS: dict[str, str] = _load_user_writing_models() - - _factory = CloudSessionFactory( product="peregrine", byok_detector=detect_byok, @@ -81,9 +58,4 @@ def session_middleware_dep(request: Request, response: Response) -> None: set_request_user_id(user_id) set_request_tier(session.tier) - # Resolution order: env-var map (Monday path) → Heimdall meta (future path) - writing_model = ( - _USER_WRITING_MODELS.get(session.user_id) - or session.meta.get("custom_writing_model") - ) - set_request_writing_model(writing_model) + set_request_writing_model(session.meta.get("custom_writing_model") or None) diff --git a/resume_matcher/apps/backend/app/llm.py b/resume_matcher/apps/backend/app/llm.py index 13b3cff..ca7ebac 100644 --- a/resume_matcher/apps/backend/app/llm.py +++ b/resume_matcher/apps/backend/app/llm.py @@ -152,6 +152,62 @@ async def _allocate_orch_async( logging.debug("cf-orch release failed (non-fatal): %s", exc) +@asynccontextmanager +async def _allocate_by_task( + coordinator_url: str, + product: str, + task: str, + ttl_s: float, + caller: str, +): + """Allocate via the task-model assignment layer (POST /api/inference/task). + + Resolves product+task → model_id → service+node automatically. + Falls back gracefully: if the coordinator returns 404 (no assignment), + raises RuntimeError so the caller can fall back to model_candidates routing. 
+ """ + async with httpx.AsyncClient(timeout=120.0) as client: + payload: dict[str, Any] = { + "product": product, + "task": task, + "payload": {"ttl_s": ttl_s, "caller": caller}, + } + uid = get_request_user_id() + if uid: + payload["payload"]["user_id"] = uid + resp = await client.post( + f"{coordinator_url.rstrip('/')}/api/inference/task", + json=payload, + ) + if resp.status_code == 404: + raise RuntimeError( + f"No task assignment for product={product!r} task={task!r}; " + "falling back to model_candidates routing" + ) + if not resp.is_success: + raise RuntimeError( + f"cf-orch task allocation failed for {product}/{task}: " + f"HTTP {resp.status_code} — {resp.text[:200]}" + ) + data = resp.json() + service = data.get("service_type", "vllm") + alloc = _OrchAllocation( + allocation_id=data["allocation_id"], + url=data["url"], + service=service, + ) + try: + yield alloc + finally: + try: + await client.delete( + f"{coordinator_url.rstrip('/')}/api/services/{service}/allocations/{alloc.allocation_id}", + timeout=10.0, + ) + except Exception as exc: + logging.debug("cf-orch task release failed (non-fatal): %s", exc) + + def _normalize_api_base(provider: str, api_base: str | None) -> str | None: """Normalize api_base for LiteLLM provider-specific expectations. @@ -497,11 +553,41 @@ async def complete( config: LLMConfig | None = None, max_tokens: int = 4096, temperature: float = 0.7, + task_name: str | None = None, ) -> str: - """Make a completion request to the LLM.""" + """Make a completion request to the LLM. + + When task_name is provided and CF_ORCH_URL is set, routing is resolved via + the task-model assignment layer (POST /api/inference/task) instead of using + hardcoded model_candidates. Falls back to model_candidates routing if the + assignment is missing, then to the default config if cf-orch is unavailable. 
+ """ if config is None: cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip() if cf_orch_url: + # Task-routing path: preferred when a task name is known. + if task_name: + try: + async with _allocate_by_task( + cf_orch_url, + product="peregrine", + task=task_name, + ttl_s=300.0, + caller="peregrine-resume-matcher", + ) as alloc: + orch_config = LLMConfig( + provider="openai", + model="__auto__", + api_key="any", + api_base=alloc.url.rstrip("/") + "/v1", + ) + return await complete(prompt, system_prompt, orch_config, max_tokens, temperature) + except RuntimeError as exc: + logging.warning( + "cf-orch task routing failed for %r, falling back to model_candidates: %s", + task_name, exc, + ) + # Model-candidates path: legacy routing or task fallback. try: # Premium/ultra users get their personal fine-tuned writing model as the # first candidate; the base model is the fallback so cf-orch can