chore: wire anomaly scoring pipeline into deployment config

Add TURNSTONE_ANOMALY_* env vars to docker-compose.yml, docker-standalone.sh, and .env.example. Mount shared HF model cache (/Library/Assets/LLM on Heimdall) as read-only bind in both compose and standalone — avoids re-downloading models that are already cached by the diagnose pipeline. Heimdall: byviz/bylastic_classification_logs already cached, threshold 0.80, glean-triggered only (TURNSTONE_ANOMALY_INTERVAL=0).
2026-06-09 23:01:48 -07:00 · 2026-06-09 23:01:48 -07:00 · 26a413b093
commit 26a413b093
parent 01f0e45222
3 changed files with 49 additions and 1 deletion
--- a/.env.example
+++ b/.env.example
@ -42,6 +42,23 @@
 # TURNSTONE_EMBED_MODEL=BAAI/bge-small-en-v1.5
 # TURNSTONE_EMBED_DEVICE=cpu

+# --- Anomaly scoring pipeline (IDS / watchdog) ---
+# Batch-scores every ingested log entry after each glean cycle.
+# Any HuggingFace text-classification model works; the byviz classifier (already
+# required by the diagnose pipeline) is the recommended starting point.
+# Detections above the threshold are inserted into the detections table and
+# surfaced in the Security Alerts tab.
+#
+# Set TURNSTONE_ANOMALY_MODEL to enable; leave unset to disable (safe default).
+# TURNSTONE_ANOMALY_MODEL=byviz/bylastic_classification_logs
+# TURNSTONE_ANOMALY_DEVICE=cpu          # or "cuda" / "mps" for GPU inference
+# TURNSTONE_ANOMALY_THRESHOLD=0.80      # confidence floor for detection insertion (default 0.75)
+# TURNSTONE_ANOMALY_INTERVAL=0          # standalone scoring-loop interval (0 = glean-triggered only)
+#
+# HuggingFace model cache — bind-mount a host cache (read-only) to avoid re-downloading models.
+# HF_HOME=/hf_cache                     # path inside the container (docker-compose.yml sets this to /hf_cache)
+# HF_CACHE_PATH=/Library/Assets/LLM     # host bind-mount source (docker-compose only; docker-standalone.sh reads HF_CACHE_DIR)
+
 # --- Air-gapped / offline deployment ---
 # Set to 1 to block all HuggingFace hub network access at runtime.
 # Pre-download models to ~/.cache/huggingface/ before deploying — see docs/air-gapped-deployment.md.
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -41,9 +41,23 @@ services:
      TURNSTONE_GLEAN_INTERVAL: ${TURNSTONE_GLEAN_INTERVAL:-900}
      TURNSTONE_SOURCE_HOST: ${TURNSTONE_SOURCE_HOST:-}
      TURNSTONE_SUBMIT_ENDPOINT: ${TURNSTONE_SUBMIT_ENDPOINT:-}
+      # --- Multi-agent diagnose pipeline ---
+      TURNSTONE_MULTI_AGENT_DIAGNOSE: ${TURNSTONE_MULTI_AGENT_DIAGNOSE:-false}
+      TURNSTONE_CLASSIFIER_MODEL: ${TURNSTONE_CLASSIFIER_MODEL:-}
+      TURNSTONE_EMBED_BACKEND: ${TURNSTONE_EMBED_BACKEND:-}
+      TURNSTONE_EMBED_MODEL: ${TURNSTONE_EMBED_MODEL:-}
+      TURNSTONE_EMBED_DEVICE: ${TURNSTONE_EMBED_DEVICE:-cpu}
+      # --- Anomaly scoring pipeline ---
+      TURNSTONE_ANOMALY_MODEL: ${TURNSTONE_ANOMALY_MODEL:-}
+      TURNSTONE_ANOMALY_DEVICE: ${TURNSTONE_ANOMALY_DEVICE:-cpu}
+      TURNSTONE_ANOMALY_THRESHOLD: ${TURNSTONE_ANOMALY_THRESHOLD:-0.75}
+      TURNSTONE_ANOMALY_INTERVAL: ${TURNSTONE_ANOMALY_INTERVAL:-0}
+      # --- HuggingFace model cache ---
+      HF_HOME: /hf_cache
    volumes:
      - ./patterns:/app/patterns:ro
      - ./data:/app/data  # optional: persists SQLite files if DATABASE_URL unset
+      - ${HF_CACHE_PATH:-/Library/Assets/LLM}:/hf_cache:ro  # shared model cache

 volumes:
  turnstone_pgdata:
--- a/docker-standalone.sh
+++ b/docker-standalone.sh
@ -62,7 +62,10 @@ set -euo pipefail
 REPO_DIR="${HOME}/turnstone"
 DATA_DIR="${REPO_DIR}/data"
 PATTERNS_DIR="${REPO_DIR}/patterns"
-HF_CACHE_DIR="${REPO_DIR}/hf-cache"   # persists downloaded ML models across restarts
+# HF_CACHE_DIR: override to a shared cache directory to avoid re-downloading models.
+# Example (Heimdall, where byviz/bylastic_classification_logs is already cached):
+#   export HF_CACHE_DIR=/Library/Assets/LLM
+HF_CACHE_DIR="${HF_CACHE_DIR:-${REPO_DIR}/hf-cache}"

 TZ="${TZ:-America/Los_Angeles}"

@ -83,6 +86,16 @@ TZ="${TZ:-America/Los_Angeles}"
 #   bash ~/turnstone/docker-standalone.sh
 #

+# ── Anomaly scoring pipeline (IDS / watchdog) ────────────────────────────────
+# Set TURNSTONE_ANOMALY_MODEL to enable automatic anomaly scoring after each
+# glean run.  The byviz classifier (already used by the diagnose pipeline) is
+# a good default — it's cached alongside the other models.
+#
+#   export TURNSTONE_ANOMALY_MODEL=byviz/bylastic_classification_logs
+#   export TURNSTONE_ANOMALY_THRESHOLD=0.80   # confidence floor (default 0.75)
+#   bash ~/turnstone/docker-standalone.sh
+#
+
 # ── Multi-agent diagnose pipeline ────────────────────────────────────────────
 # Enable the 5-stage ML pipeline to get smarter diagnose results.
 #
@ -134,6 +147,10 @@ docker run -d \
  -e TURNSTONE_EMBED_BACKEND="${TURNSTONE_EMBED_BACKEND:-sentence_transformers}" \
  -e TURNSTONE_EMBED_MODEL="${TURNSTONE_EMBED_MODEL:-sentence-transformers/all-MiniLM-L6-v2}" \
  -e TURNSTONE_EMBED_DEVICE="${TURNSTONE_EMBED_DEVICE:-cpu}" \
+  -e TURNSTONE_ANOMALY_MODEL="${TURNSTONE_ANOMALY_MODEL:-}" \
+  -e TURNSTONE_ANOMALY_DEVICE="${TURNSTONE_ANOMALY_DEVICE:-cpu}" \
+  -e TURNSTONE_ANOMALY_THRESHOLD="${TURNSTONE_ANOMALY_THRESHOLD:-0.75}" \
+  -e TURNSTONE_ANOMALY_INTERVAL="${TURNSTONE_ANOMALY_INTERVAL:-0}" \
  localhost/turnstone:latest

 echo ""