chore(infra): add mnemo service stub to compose.yml

Pre-existing local development addition — mnemo vector memory service
placeholder for future integration work.
This commit is contained in:
pyr0ball 2026-06-14 12:15:16 -07:00
parent 3048d8e2f4
commit ad27467026
3 changed files with 119 additions and 34 deletions

View file

@ -23,6 +23,8 @@ services:
- GPU_SERVER_URL=${GPU_SERVER_URL:-${CF_ORCH_URL:-http://host.docker.internal:7700}} - GPU_SERVER_URL=${GPU_SERVER_URL:-${CF_ORCH_URL:-http://host.docker.internal:7700}}
- CF_ORCH_URL=${CF_ORCH_URL:-${GPU_SERVER_URL:-http://host.docker.internal:7700}} - CF_ORCH_URL=${CF_ORCH_URL:-${GPU_SERVER_URL:-http://host.docker.internal:7700}}
- CF_APP_NAME=peregrine - CF_APP_NAME=peregrine
- MNEMO_HOST=${MNEMO_HOST:-mnemo}
- MNEMO_PORT=${MNEMO_PORT:-8080}
- PYTHONUNBUFFERED=1 - PYTHONUNBUFFERED=1
extra_hosts: extra_hosts:
- "host.docker.internal:host-gateway" - "host.docker.internal:host-gateway"
@ -116,6 +118,28 @@ services:
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed] profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
restart: unless-stopped restart: unless-stopped
# mnemo service: opt-in, only part of the stack when the `memory` profile is active.
mnemo:
# NOTE(review): `latest` is an unpinned tag; combined with `restart: unless-stopped`
# a re-pull can change the running version. Consider pinning a version or digest.
image: ghcr.io/zaydmulani09/mnemo:latest
ports:
# Host-side port is overridable via MNEMO_PORT (default 8080); the container side is fixed at 8080.
# NOTE(review): the app service above is given MNEMO_HOST=mnemo together with the same MNEMO_PORT.
# Service-to-service traffic reaches the container port (8080), not the published host port,
# so overriding MNEMO_PORT looks like it would break that connection. Confirm.
- "${MNEMO_PORT:-8080}:8080"
volumes:
# Named volume (declared under top-level `volumes:`) mounted at /data, where MNEMO_DB_PATH points.
- mnemo-data:/data
environment:
- MNEMO_DB_PATH=/data/mnemo.db
# LLM backend settings; every default targets the `ollama` service on port 11434.
- MNEMO_LLM_PROVIDER=${MNEMO_LLM_PROVIDER:-ollama}
- MNEMO_LLM_BASE_URL=${MNEMO_LLM_BASE_URL:-http://ollama:11434/v1}
- MNEMO_LLM_API_KEY=${MNEMO_LLM_API_KEY:-ollama}
- MNEMO_LLM_MODEL=${MNEMO_LLM_MODEL:-llama3.2:3b}
depends_on:
# NOTE(review): `ollama` must also be enabled by an active profile whenever `memory` is used,
# otherwise this dependency cannot be satisfied. Confirm which profiles ollama belongs to.
- ollama
healthcheck:
# Probes /health on the container port from inside the container.
# NOTE(review): assumes `wget` is present in the image. Confirm.
test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"]
interval: 15s
timeout: 5s
retries: 3
profiles: [memory]
restart: unless-stopped
finetune: finetune:
build: build:
context: . context: .
@ -131,3 +155,6 @@ services:
- OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama - OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama
profiles: [finetune] profiles: [finetune]
restart: "no" restart: "no"
volumes:
mnemo-data:

View file

@ -10,23 +10,15 @@ Usage — add to main.py once:
from app.cloud_session import session_middleware_dep from app.cloud_session import session_middleware_dep
app = FastAPI(..., dependencies=[Depends(session_middleware_dep)]) app = FastAPI(..., dependencies=[Depends(session_middleware_dep)])
From that point, any route (and every service/llm function it calls) Writing model is resolved from Heimdall's resolve response (user_preferences
has access to the current user context via llm.get_request_*() helpers. JSON column, projected as custom_writing_model in the response). Assign models
via the admin UI at /account/admin/model-assignments.
Writing model resolution order (first match wins):
1. USER_WRITING_MODELS env var — JSON dict mapping Directus UUID → model name
e.g. USER_WRITING_MODELS={"5b99ca9f-...": "meghan-letter-writer:latest"}
Use this for Monday; no Heimdall changes required.
2. session.meta["custom_writing_model"] — returned by Heimdall resolve endpoint
once Heimdall is updated to expose user_preferences fields.
""" """
from __future__ import annotations from __future__ import annotations
import json
import logging import logging
import os
from fastapi import Depends, Request, Response from fastapi import Request, Response
from circuitforge_core.cloud_session import CloudSessionFactory, CloudUser, detect_byok from circuitforge_core.cloud_session import CloudSessionFactory, CloudUser, detect_byok
@ -34,21 +26,6 @@ log = logging.getLogger(__name__)
__all__ = ["CloudUser", "get_session", "require_tier", "session_middleware_dep"] __all__ = ["CloudUser", "get_session", "require_tier", "session_middleware_dep"]
# JSON dict mapping Directus user UUID → custom writing model name.
# Used until Heimdall's resolve endpoint exposes user_preferences.
def _load_user_writing_models() -> dict[str, str]:
    """Parse the USER_WRITING_MODELS env var into a user-id -> model-name map.

    The variable is expected to hold a JSON object whose keys are user UUIDs
    and whose values are model names.

    Returns:
        The parsed mapping, restricted to entries whose value is a non-empty
        string. An empty dict is returned when the variable is unset or blank,
        is not valid JSON, or is valid JSON but not an object.
    """
    raw = os.environ.get("USER_WRITING_MODELS", "").strip()
    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        log.warning("USER_WRITING_MODELS is not valid JSON — ignoring")
        return {}
    # json.loads accepts any JSON value (list, string, number, ...); callers
    # use .get() on the result, so anything but an object must be rejected here.
    if not isinstance(parsed, dict):
        log.warning(
            "USER_WRITING_MODELS must be a JSON object, got %s — ignoring",
            type(parsed).__name__,
        )
        return {}
    # Keep only usable model names so callers never receive null/number values.
    models = {
        uid: model
        for uid, model in parsed.items()
        if isinstance(model, str) and model
    }
    dropped = len(parsed) - len(models)
    if dropped:
        log.warning(
            "USER_WRITING_MODELS: ignoring %d entries with non-string or empty model names",
            dropped,
        )
    return models


# Loaded once at import time; changing the env var requires a process restart.
_USER_WRITING_MODELS: dict[str, str] = _load_user_writing_models()
_factory = CloudSessionFactory( _factory = CloudSessionFactory(
product="peregrine", product="peregrine",
byok_detector=detect_byok, byok_detector=detect_byok,
@ -81,9 +58,4 @@ def session_middleware_dep(request: Request, response: Response) -> None:
set_request_user_id(user_id) set_request_user_id(user_id)
set_request_tier(session.tier) set_request_tier(session.tier)
# Resolution order: env-var map (Monday path) → Heimdall meta (future path) set_request_writing_model(session.meta.get("custom_writing_model") or None)
writing_model = (
_USER_WRITING_MODELS.get(session.user_id)
or session.meta.get("custom_writing_model")
)
set_request_writing_model(writing_model)

View file

@ -152,6 +152,62 @@ async def _allocate_orch_async(
logging.debug("cf-orch release failed (non-fatal): %s", exc) logging.debug("cf-orch release failed (non-fatal): %s", exc)
@asynccontextmanager
async def _allocate_by_task(
    coordinator_url: str,
    product: str,
    task: str,
    ttl_s: float,
    caller: str,
):
    """Allocate via the task-model assignment layer (POST /api/inference/task).

    The coordinator resolves product+task -> model_id -> service+node. The
    allocation is released with a best-effort DELETE when the context exits.

    Args:
        coordinator_url: Base URL of the cf-orch coordinator; a trailing
            slash is tolerated.
        product: Product name sent in the request body.
        task: Task name sent in the request body.
        ttl_s: Requested allocation time-to-live, forwarded in the payload.
        caller: Caller identifier, forwarded in the payload.

    Yields:
        An ``_OrchAllocation`` built from the response's ``allocation_id``,
        ``url`` and ``service_type`` (defaulting to ``"vllm"``).

    Raises:
        RuntimeError: If the coordinator cannot be reached, returns 404 (no
            assignment for this product/task), returns any other non-success
            status, or returns a body that is not the expected JSON object.
            Every allocation failure is surfaced as RuntimeError so callers
            can fall back to model_candidates routing with a single except.
    """
    base_url = coordinator_url.rstrip("/")
    payload: dict[str, Any] = {
        "product": product,
        "task": task,
        "payload": {"ttl_s": ttl_s, "caller": caller},
    }
    uid = get_request_user_id()
    if uid:
        payload["payload"]["user_id"] = uid

    async with httpx.AsyncClient(timeout=120.0) as client:
        try:
            resp = await client.post(f"{base_url}/api/inference/task", json=payload)
        except httpx.HTTPError as exc:
            # Connection/timeout errors are not RuntimeError subclasses; convert
            # them so the caller's fallback path is reached.
            raise RuntimeError(
                f"cf-orch task allocation failed for {product}/{task}: {exc!r}"
            ) from exc

        if resp.status_code == 404:
            raise RuntimeError(
                f"No task assignment for product={product!r} task={task!r}; "
                "falling back to model_candidates routing"
            )
        if not resp.is_success:
            raise RuntimeError(
                f"cf-orch task allocation failed for {product}/{task}: "
                f"HTTP {resp.status_code} — {resp.text[:200]}"
            )

        try:
            data = resp.json()
            service = data.get("service_type", "vllm")
            allocation_id = data["allocation_id"]
            url = data["url"]
        except (ValueError, KeyError, AttributeError) as exc:
            # Non-JSON body, non-object JSON, or missing required fields.
            raise RuntimeError(
                f"cf-orch task allocation for {product}/{task} returned a "
                f"malformed response: {exc!r}"
            ) from exc

        alloc = _OrchAllocation(
            allocation_id=allocation_id,
            url=url,
            service=service,
        )
        try:
            yield alloc
        finally:
            # Release is best-effort: a failed DELETE must never mask the
            # result (or exception) of the body that used the allocation.
            try:
                await client.delete(
                    f"{base_url}/api/services/{service}/allocations/{alloc.allocation_id}",
                    timeout=10.0,
                )
            except Exception as exc:
                logging.debug("cf-orch task release failed (non-fatal): %s", exc)
def _normalize_api_base(provider: str, api_base: str | None) -> str | None: def _normalize_api_base(provider: str, api_base: str | None) -> str | None:
"""Normalize api_base for LiteLLM provider-specific expectations. """Normalize api_base for LiteLLM provider-specific expectations.
@ -497,11 +553,41 @@ async def complete(
config: LLMConfig | None = None, config: LLMConfig | None = None,
max_tokens: int = 4096, max_tokens: int = 4096,
temperature: float = 0.7, temperature: float = 0.7,
task_name: str | None = None,
) -> str: ) -> str:
"""Make a completion request to the LLM.""" """Make a completion request to the LLM.
When task_name is provided and CF_ORCH_URL is set, routing is resolved via
the task-model assignment layer (POST /api/inference/task) instead of using
hardcoded model_candidates. Falls back to model_candidates routing if the
assignment is missing, then to the default config if cf-orch is unavailable.
"""
if config is None: if config is None:
cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip() cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip()
if cf_orch_url: if cf_orch_url:
# Task-routing path: preferred when a task name is known.
if task_name:
try:
async with _allocate_by_task(
cf_orch_url,
product="peregrine",
task=task_name,
ttl_s=300.0,
caller="peregrine-resume-matcher",
) as alloc:
orch_config = LLMConfig(
provider="openai",
model="__auto__",
api_key="any",
api_base=alloc.url.rstrip("/") + "/v1",
)
return await complete(prompt, system_prompt, orch_config, max_tokens, temperature)
except RuntimeError as exc:
logging.warning(
"cf-orch task routing failed for %r, falling back to model_candidates: %s",
task_name, exc,
)
# Model-candidates path: legacy routing or task fallback.
try: try:
# Premium/ultra users get their personal fine-tuned writing model as the # Premium/ultra users get their personal fine-tuned writing model as the
# first candidate; the base model is the fallback so cf-orch can # first candidate; the base model is the fallback so cf-orch can