chore(infra): add mnemo service stub to compose.yml
Pre-existing local development addition — mnemo vector memory service placeholder for future integration work.
This commit is contained in:
parent
3048d8e2f4
commit
ad27467026
3 changed files with 119 additions and 34 deletions
27
compose.yml
27
compose.yml
|
|
@ -23,6 +23,8 @@ services:
|
|||
- GPU_SERVER_URL=${GPU_SERVER_URL:-${CF_ORCH_URL:-http://host.docker.internal:7700}}
|
||||
- CF_ORCH_URL=${CF_ORCH_URL:-${GPU_SERVER_URL:-http://host.docker.internal:7700}}
|
||||
- CF_APP_NAME=peregrine
|
||||
- MNEMO_HOST=${MNEMO_HOST:-mnemo}
|
||||
- MNEMO_PORT=${MNEMO_PORT:-8080}
|
||||
- PYTHONUNBUFFERED=1
|
||||
extra_hosts:
|
||||
- "host.docker.internal:host-gateway"
|
||||
|
|
@ -116,6 +118,28 @@ services:
|
|||
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||
restart: unless-stopped
|
||||
|
||||
mnemo:
|
||||
image: ghcr.io/zaydmulani09/mnemo:latest
|
||||
ports:
|
||||
- "${MNEMO_PORT:-8080}:8080"
|
||||
volumes:
|
||||
- mnemo-data:/data
|
||||
environment:
|
||||
- MNEMO_DB_PATH=/data/mnemo.db
|
||||
- MNEMO_LLM_PROVIDER=${MNEMO_LLM_PROVIDER:-ollama}
|
||||
- MNEMO_LLM_BASE_URL=${MNEMO_LLM_BASE_URL:-http://ollama:11434/v1}
|
||||
- MNEMO_LLM_API_KEY=${MNEMO_LLM_API_KEY:-ollama}
|
||||
- MNEMO_LLM_MODEL=${MNEMO_LLM_MODEL:-llama3.2:3b}
|
||||
depends_on:
|
||||
- ollama
|
||||
healthcheck:
|
||||
test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"]
|
||||
interval: 15s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
profiles: [memory]
|
||||
restart: unless-stopped
|
||||
|
||||
finetune:
|
||||
build:
|
||||
context: .
|
||||
|
|
@ -131,3 +155,6 @@ services:
|
|||
- OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama
|
||||
profiles: [finetune]
|
||||
restart: "no"
|
||||
|
||||
volumes:
|
||||
mnemo-data:
|
||||
|
|
|
|||
|
|
@ -10,23 +10,15 @@ Usage — add to main.py once:
|
|||
from app.cloud_session import session_middleware_dep
|
||||
app = FastAPI(..., dependencies=[Depends(session_middleware_dep)])
|
||||
|
||||
From that point, any route (and every service/llm function it calls)
|
||||
has access to the current user context via llm.get_request_*() helpers.
|
||||
|
||||
Writing model resolution order (first match wins):
|
||||
1. USER_WRITING_MODELS env var — JSON dict mapping Directus UUID → model name
|
||||
e.g. USER_WRITING_MODELS={"5b99ca9f-...": "meghan-letter-writer:latest"}
|
||||
Use this for Monday; no Heimdall changes required.
|
||||
2. session.meta["custom_writing_model"] — returned by Heimdall resolve endpoint
|
||||
once Heimdall is updated to expose user_preferences fields.
|
||||
Writing model is resolved from Heimdall's resolve response (user_preferences
|
||||
JSON column, projected as custom_writing_model in the response). Assign models
|
||||
via the admin UI at /account/admin/model-assignments.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
from fastapi import Depends, Request, Response
|
||||
from fastapi import Request, Response
|
||||
|
||||
from circuitforge_core.cloud_session import CloudSessionFactory, CloudUser, detect_byok
|
||||
|
||||
|
|
@ -34,21 +26,6 @@ log = logging.getLogger(__name__)
|
|||
|
||||
__all__ = ["CloudUser", "get_session", "require_tier", "session_middleware_dep"]
|
||||
|
||||
# JSON dict mapping Directus user UUID → custom writing model name.
|
||||
# Used until Heimdall's resolve endpoint exposes user_preferences.
|
||||
def _load_user_writing_models() -> dict[str, str]:
    """Parse the USER_WRITING_MODELS env var into a user-id → model-name map.

    The variable is expected to hold a JSON object whose keys are user IDs
    and whose values are model names. Configuration mistakes never raise:
    they are logged as warnings and the offending input is ignored.

    Returns:
        The parsed mapping, or an empty dict when the variable is unset,
        blank, not valid JSON, or not a JSON object. Entries whose value is
        not a non-empty string are dropped.
    """
    raw = os.environ.get("USER_WRITING_MODELS", "").strip()
    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        log.warning("USER_WRITING_MODELS is not valid JSON — ignoring")
        return {}
    # json.loads accepts any JSON value; a list/str/number here would break
    # the later `.get(user_id)` lookups, so reject it up front.
    if not isinstance(parsed, dict):
        log.warning("USER_WRITING_MODELS must be a JSON object — ignoring")
        return {}
    models = {
        user_id: model
        for user_id, model in parsed.items()
        if isinstance(model, str) and model
    }
    if len(models) != len(parsed):
        log.warning(
            "USER_WRITING_MODELS has %d entries with non-string or empty "
            "model names — skipping them",
            len(parsed) - len(models),
        )
    return models
|
||||
|
||||
_USER_WRITING_MODELS: dict[str, str] = _load_user_writing_models()
|
||||
|
||||
|
||||
_factory = CloudSessionFactory(
|
||||
product="peregrine",
|
||||
byok_detector=detect_byok,
|
||||
|
|
@ -81,9 +58,4 @@ def session_middleware_dep(request: Request, response: Response) -> None:
|
|||
|
||||
set_request_user_id(user_id)
|
||||
set_request_tier(session.tier)
|
||||
# Resolution order: env-var map (Monday path) → Heimdall meta (future path)
|
||||
writing_model = (
|
||||
_USER_WRITING_MODELS.get(session.user_id)
|
||||
or session.meta.get("custom_writing_model")
|
||||
)
|
||||
set_request_writing_model(writing_model)
|
||||
set_request_writing_model(session.meta.get("custom_writing_model") or None)
|
||||
|
|
|
|||
|
|
@ -152,6 +152,62 @@ async def _allocate_orch_async(
|
|||
logging.debug("cf-orch release failed (non-fatal): %s", exc)
|
||||
|
||||
|
||||
@asynccontextmanager
async def _allocate_by_task(
    coordinator_url: str,
    product: str,
    task: str,
    ttl_s: float,
    caller: str,
):
    """Allocate via the task-model assignment layer (POST /api/inference/task).

    Resolves product+task → model_id → service+node automatically.

    Yields an ``_OrchAllocation`` built from the coordinator's response and
    releases it with a best-effort DELETE when the ``async with`` body exits.

    Args:
        coordinator_url: Base URL of the cf-orch coordinator (a trailing
            slash is tolerated).
        product: Product name sent in the allocation request.
        task: Task name sent in the allocation request.
        ttl_s: Value sent as ``ttl_s`` in the request payload.
        caller: Value sent as ``caller`` in the request payload.

    Raises:
        RuntimeError: For every allocation failure — no assignment (HTTP 404),
            any other non-success status, a transport error, or a malformed
            response body — so the caller can fall back to model_candidates
            routing with a single ``except RuntimeError``.
    """
    base_url = coordinator_url.rstrip("/")
    async with httpx.AsyncClient(timeout=120.0) as client:
        payload: dict[str, Any] = {
            "product": product,
            "task": task,
            "payload": {"ttl_s": ttl_s, "caller": caller},
        }
        uid = get_request_user_id()
        if uid:
            payload["payload"]["user_id"] = uid

        # Transport failures (connection refused, timeout, ...) are not
        # RuntimeError; convert them so the caller's fallback still triggers.
        try:
            resp = await client.post(
                f"{base_url}/api/inference/task",
                json=payload,
            )
        except httpx.HTTPError as exc:
            raise RuntimeError(
                f"cf-orch task allocation failed for {product}/{task}: {exc!r}"
            ) from exc

        if resp.status_code == 404:
            raise RuntimeError(
                f"No task assignment for product={product!r} task={task!r}; "
                "falling back to model_candidates routing"
            )
        if not resp.is_success:
            raise RuntimeError(
                f"cf-orch task allocation failed for {product}/{task}: "
                f"HTTP {resp.status_code} — {resp.text[:200]}"
            )

        # A 2xx reply with an unparsable or incomplete body is still an
        # allocation failure from the caller's point of view.
        try:
            data = resp.json()
            service = data.get("service_type", "vllm")
            alloc = _OrchAllocation(
                allocation_id=data["allocation_id"],
                url=data["url"],
                service=service,
            )
        except (ValueError, KeyError, TypeError, AttributeError) as exc:
            raise RuntimeError(
                f"cf-orch task allocation failed for {product}/{task}: "
                f"malformed response ({exc!r})"
            ) from exc

        # The yield is deliberately outside the conversions above: errors
        # raised by the caller's body must propagate unchanged.
        try:
            yield alloc
        finally:
            try:
                await client.delete(
                    f"{base_url}/api/services/{service}/allocations/{alloc.allocation_id}",
                    timeout=10.0,
                )
            except Exception as exc:
                # Release is best-effort; a failed DELETE must not mask the
                # outcome of the caller's body.
                logging.debug("cf-orch task release failed (non-fatal): %s", exc)
|
||||
|
||||
|
||||
def _normalize_api_base(provider: str, api_base: str | None) -> str | None:
|
||||
"""Normalize api_base for LiteLLM provider-specific expectations.
|
||||
|
||||
|
|
@ -497,11 +553,41 @@ async def complete(
|
|||
config: LLMConfig | None = None,
|
||||
max_tokens: int = 4096,
|
||||
temperature: float = 0.7,
|
||||
task_name: str | None = None,
|
||||
) -> str:
|
||||
"""Make a completion request to the LLM."""
|
||||
"""Make a completion request to the LLM.
|
||||
|
||||
When task_name is provided and CF_ORCH_URL is set, routing is resolved via
|
||||
the task-model assignment layer (POST /api/inference/task) instead of using
|
||||
hardcoded model_candidates. Falls back to model_candidates routing if the
|
||||
assignment is missing, then to the default config if cf-orch is unavailable.
|
||||
"""
|
||||
if config is None:
|
||||
cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip()
|
||||
if cf_orch_url:
|
||||
# Task-routing path: preferred when a task name is known.
|
||||
if task_name:
|
||||
try:
|
||||
async with _allocate_by_task(
|
||||
cf_orch_url,
|
||||
product="peregrine",
|
||||
task=task_name,
|
||||
ttl_s=300.0,
|
||||
caller="peregrine-resume-matcher",
|
||||
) as alloc:
|
||||
orch_config = LLMConfig(
|
||||
provider="openai",
|
||||
model="__auto__",
|
||||
api_key="any",
|
||||
api_base=alloc.url.rstrip("/") + "/v1",
|
||||
)
|
||||
return await complete(prompt, system_prompt, orch_config, max_tokens, temperature)
|
||||
except RuntimeError as exc:
|
||||
logging.warning(
|
||||
"cf-orch task routing failed for %r, falling back to model_candidates: %s",
|
||||
task_name, exc,
|
||||
)
|
||||
# Model-candidates path: legacy routing or task fallback.
|
||||
try:
|
||||
# Premium/ultra users get their personal fine-tuned writing model as the
|
||||
# first candidate; the base model is the fallback so cf-orch can
|
||||
|
|
|
|||
Loading…
Reference in a new issue