chore(infra): add mnemo service stub to compose.yml
Pre-existing local development addition — mnemo vector memory service placeholder for future integration work.
This commit is contained in:
parent
3048d8e2f4
commit
ad27467026
3 changed files with 119 additions and 34 deletions
27
compose.yml
27
compose.yml
|
|
@ -23,6 +23,8 @@ services:
|
||||||
- GPU_SERVER_URL=${GPU_SERVER_URL:-${CF_ORCH_URL:-http://host.docker.internal:7700}}
|
- GPU_SERVER_URL=${GPU_SERVER_URL:-${CF_ORCH_URL:-http://host.docker.internal:7700}}
|
||||||
- CF_ORCH_URL=${CF_ORCH_URL:-${GPU_SERVER_URL:-http://host.docker.internal:7700}}
|
- CF_ORCH_URL=${CF_ORCH_URL:-${GPU_SERVER_URL:-http://host.docker.internal:7700}}
|
||||||
- CF_APP_NAME=peregrine
|
- CF_APP_NAME=peregrine
|
||||||
|
- MNEMO_HOST=${MNEMO_HOST:-mnemo}
|
||||||
|
- MNEMO_PORT=${MNEMO_PORT:-8080}
|
||||||
- PYTHONUNBUFFERED=1
|
- PYTHONUNBUFFERED=1
|
||||||
extra_hosts:
|
extra_hosts:
|
||||||
- "host.docker.internal:host-gateway"
|
- "host.docker.internal:host-gateway"
|
||||||
|
|
@ -116,6 +118,28 @@ services:
|
||||||
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
|
mnemo:
|
||||||
|
image: ghcr.io/zaydmulani09/mnemo:latest
|
||||||
|
ports:
|
||||||
|
- "${MNEMO_PORT:-8080}:8080"
|
||||||
|
volumes:
|
||||||
|
- mnemo-data:/data
|
||||||
|
environment:
|
||||||
|
- MNEMO_DB_PATH=/data/mnemo.db
|
||||||
|
- MNEMO_LLM_PROVIDER=${MNEMO_LLM_PROVIDER:-ollama}
|
||||||
|
- MNEMO_LLM_BASE_URL=${MNEMO_LLM_BASE_URL:-http://ollama:11434/v1}
|
||||||
|
- MNEMO_LLM_API_KEY=${MNEMO_LLM_API_KEY:-ollama}
|
||||||
|
- MNEMO_LLM_MODEL=${MNEMO_LLM_MODEL:-llama3.2:3b}
|
||||||
|
depends_on:
|
||||||
|
- ollama
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "wget", "-q", "--spider", "http://localhost:8080/health"]
|
||||||
|
interval: 15s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
profiles: [memory]
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
finetune:
|
finetune:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
|
|
@ -131,3 +155,6 @@ services:
|
||||||
- OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama
|
- OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama
|
||||||
profiles: [finetune]
|
profiles: [finetune]
|
||||||
restart: "no"
|
restart: "no"
|
||||||
|
|
||||||
|
volumes:
|
||||||
|
mnemo-data:
|
||||||
|
|
|
||||||
|
|
@ -10,23 +10,15 @@ Usage — add to main.py once:
|
||||||
from app.cloud_session import session_middleware_dep
|
from app.cloud_session import session_middleware_dep
|
||||||
app = FastAPI(..., dependencies=[Depends(session_middleware_dep)])
|
app = FastAPI(..., dependencies=[Depends(session_middleware_dep)])
|
||||||
|
|
||||||
From that point, any route (and every service/llm function it calls)
|
Writing model is resolved from Heimdall's resolve response (user_preferences
|
||||||
has access to the current user context via llm.get_request_*() helpers.
|
JSON column, projected as custom_writing_model in the response). Assign models
|
||||||
|
via the admin UI at /account/admin/model-assignments.
|
||||||
Writing model resolution order (first match wins):
|
|
||||||
1. USER_WRITING_MODELS env var — JSON dict mapping Directus UUID → model name
|
|
||||||
e.g. USER_WRITING_MODELS={"5b99ca9f-...": "meghan-letter-writer:latest"}
|
|
||||||
Use this for Monday; no Heimdall changes required.
|
|
||||||
2. session.meta["custom_writing_model"] — returned by Heimdall resolve endpoint
|
|
||||||
once Heimdall is updated to expose user_preferences fields.
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
|
|
||||||
from fastapi import Depends, Request, Response
|
from fastapi import Request, Response
|
||||||
|
|
||||||
from circuitforge_core.cloud_session import CloudSessionFactory, CloudUser, detect_byok
|
from circuitforge_core.cloud_session import CloudSessionFactory, CloudUser, detect_byok
|
||||||
|
|
||||||
|
|
@ -34,21 +26,6 @@ log = logging.getLogger(__name__)
|
||||||
|
|
||||||
__all__ = ["CloudUser", "get_session", "require_tier", "session_middleware_dep"]
|
__all__ = ["CloudUser", "get_session", "require_tier", "session_middleware_dep"]
|
||||||
|
|
||||||
# JSON dict mapping Directus user UUID → custom writing model name.
|
|
||||||
# Used until Heimdall's resolve endpoint exposes user_preferences.
|
|
||||||
def _load_user_writing_models() -> dict[str, str]:
    """Parse the USER_WRITING_MODELS env var into a user-UUID → model-name map.

    Returns:
        The parsed mapping, or an empty dict when the variable is unset,
        blank, not valid JSON, or valid JSON that is not an object.
        Entries whose value is not a string are dropped, because the
        result is later used as a model name.
    """
    raw = os.environ.get("USER_WRITING_MODELS", "").strip()
    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        log.warning("USER_WRITING_MODELS is not valid JSON — ignoring")
        return {}
    # json.loads happily returns lists, strings or numbers; callers do
    # `.get(user_id)` on the result, so anything but an object is unusable.
    if not isinstance(parsed, dict):
        log.warning(
            "USER_WRITING_MODELS must be a JSON object, got %s — ignoring",
            type(parsed).__name__,
        )
        return {}
    models = {
        user_id: model
        for user_id, model in parsed.items()
        if isinstance(model, str)
    }
    dropped = len(parsed) - len(models)
    if dropped:
        log.warning(
            "USER_WRITING_MODELS: ignoring %d entries with non-string model names",
            dropped,
        )
    return models
|
|
||||||
|
|
||||||
_USER_WRITING_MODELS: dict[str, str] = _load_user_writing_models()
|
|
||||||
|
|
||||||
|
|
||||||
_factory = CloudSessionFactory(
|
_factory = CloudSessionFactory(
|
||||||
product="peregrine",
|
product="peregrine",
|
||||||
byok_detector=detect_byok,
|
byok_detector=detect_byok,
|
||||||
|
|
@ -81,9 +58,4 @@ def session_middleware_dep(request: Request, response: Response) -> None:
|
||||||
|
|
||||||
set_request_user_id(user_id)
|
set_request_user_id(user_id)
|
||||||
set_request_tier(session.tier)
|
set_request_tier(session.tier)
|
||||||
# Resolution order: env-var map (Monday path) → Heimdall meta (future path)
|
set_request_writing_model(session.meta.get("custom_writing_model") or None)
|
||||||
writing_model = (
|
|
||||||
_USER_WRITING_MODELS.get(session.user_id)
|
|
||||||
or session.meta.get("custom_writing_model")
|
|
||||||
)
|
|
||||||
set_request_writing_model(writing_model)
|
|
||||||
|
|
|
||||||
|
|
@ -152,6 +152,62 @@ async def _allocate_orch_async(
|
||||||
logging.debug("cf-orch release failed (non-fatal): %s", exc)
|
logging.debug("cf-orch release failed (non-fatal): %s", exc)
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def _allocate_by_task(
    coordinator_url: str,
    product: str,
    task: str,
    ttl_s: float,
    caller: str,
):
    """Allocate via the task-model assignment layer (POST /api/inference/task).

    Resolves product+task → model_id → service+node automatically.

    Yields an ``_OrchAllocation`` built from the coordinator's response and
    releases it with a best-effort DELETE when the ``async with`` body exits.

    Raises:
        RuntimeError: if the coordinator cannot be reached, returns 404 (no
            assignment), returns any other non-success status, or sends a
            body that is not a JSON object containing ``allocation_id`` and
            ``url``. Every allocation failure surfaces as RuntimeError so
            the caller can fall back to model_candidates routing with a
            single ``except RuntimeError``.
    """
    base_url = coordinator_url.rstrip("/")
    async with httpx.AsyncClient(timeout=120.0) as client:
        request_meta: dict[str, Any] = {"ttl_s": ttl_s, "caller": caller}
        uid = get_request_user_id()
        if uid:
            request_meta["user_id"] = uid
        payload: dict[str, Any] = {
            "product": product,
            "task": task,
            "payload": request_meta,
        }
        try:
            resp = await client.post(
                f"{base_url}/api/inference/task",
                json=payload,
            )
        except httpx.HTTPError as exc:
            # Connection/timeout errors must look like any other allocation
            # failure, otherwise the caller's fallback path is skipped.
            raise RuntimeError(
                f"cf-orch task allocation failed for {product}/{task}: {exc!r}"
            ) from exc
        if resp.status_code == 404:
            raise RuntimeError(
                f"No task assignment for product={product!r} task={task!r}; "
                "falling back to model_candidates routing"
            )
        if not resp.is_success:
            raise RuntimeError(
                f"cf-orch task allocation failed for {product}/{task}: "
                f"HTTP {resp.status_code} — {resp.text[:200]}"
            )
        try:
            data = resp.json()
        except ValueError as exc:
            raise RuntimeError(
                f"cf-orch task allocation for {product}/{task} returned "
                f"invalid JSON: {resp.text[:200]}"
            ) from exc
        if (
            not isinstance(data, dict)
            or "allocation_id" not in data
            or "url" not in data
        ):
            raise RuntimeError(
                f"cf-orch task allocation for {product}/{task} returned an "
                f"unexpected body: {resp.text[:200]}"
            )
        service = data.get("service_type", "vllm")
        alloc = _OrchAllocation(
            allocation_id=data["allocation_id"],
            url=data["url"],
            service=service,
        )
        try:
            yield alloc
        finally:
            # Release is deliberately best-effort: a failed DELETE must not
            # mask the result (or exception) of the caller's own work.
            try:
                await client.delete(
                    f"{base_url}/api/services/{service}/allocations/{alloc.allocation_id}",
                    timeout=10.0,
                )
            except Exception as exc:
                logging.debug("cf-orch task release failed (non-fatal): %s", exc)
|
||||||
|
|
||||||
|
|
||||||
def _normalize_api_base(provider: str, api_base: str | None) -> str | None:
|
def _normalize_api_base(provider: str, api_base: str | None) -> str | None:
|
||||||
"""Normalize api_base for LiteLLM provider-specific expectations.
|
"""Normalize api_base for LiteLLM provider-specific expectations.
|
||||||
|
|
||||||
|
|
@ -497,11 +553,41 @@ async def complete(
|
||||||
config: LLMConfig | None = None,
|
config: LLMConfig | None = None,
|
||||||
max_tokens: int = 4096,
|
max_tokens: int = 4096,
|
||||||
temperature: float = 0.7,
|
temperature: float = 0.7,
|
||||||
|
task_name: str | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Make a completion request to the LLM."""
|
"""Make a completion request to the LLM.
|
||||||
|
|
||||||
|
When task_name is provided and CF_ORCH_URL is set, routing is resolved via
|
||||||
|
the task-model assignment layer (POST /api/inference/task) instead of using
|
||||||
|
hardcoded model_candidates. Falls back to model_candidates routing if the
|
||||||
|
assignment is missing, then to the default config if cf-orch is unavailable.
|
||||||
|
"""
|
||||||
if config is None:
|
if config is None:
|
||||||
cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip()
|
cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip()
|
||||||
if cf_orch_url:
|
if cf_orch_url:
|
||||||
|
# Task-routing path: preferred when a task name is known.
|
||||||
|
if task_name:
|
||||||
|
try:
|
||||||
|
async with _allocate_by_task(
|
||||||
|
cf_orch_url,
|
||||||
|
product="peregrine",
|
||||||
|
task=task_name,
|
||||||
|
ttl_s=300.0,
|
||||||
|
caller="peregrine-resume-matcher",
|
||||||
|
) as alloc:
|
||||||
|
orch_config = LLMConfig(
|
||||||
|
provider="openai",
|
||||||
|
model="__auto__",
|
||||||
|
api_key="any",
|
||||||
|
api_base=alloc.url.rstrip("/") + "/v1",
|
||||||
|
)
|
||||||
|
return await complete(prompt, system_prompt, orch_config, max_tokens, temperature)
|
||||||
|
except RuntimeError as exc:
|
||||||
|
logging.warning(
|
||||||
|
"cf-orch task routing failed for %r, falling back to model_candidates: %s",
|
||||||
|
task_name, exc,
|
||||||
|
)
|
||||||
|
# Model-candidates path: legacy routing or task fallback.
|
||||||
try:
|
try:
|
||||||
# Premium/ultra users get their personal fine-tuned writing model as the
|
# Premium/ultra users get their personal fine-tuned writing model as the
|
||||||
# first candidate; the base model is the fallback so cf-orch can
|
# first candidate; the base model is the fallback so cf-orch can
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue