chore: wire anomaly scoring pipeline into deployment config

Add TURNSTONE_ANOMALY_* env vars to docker-compose.yml, docker-standalone.sh, and .env.example. Mount shared HF model cache (/Library/Assets/LLM on Heimdall) as read-only bind in both compose and standalone — avoids re-downloading models that are already cached by the diagnose pipeline. Heimdall: byviz/bylastic_classification_logs already cached, threshold 0.80, glean-triggered only (TURNSTONE_ANOMALY_INTERVAL=0).
2026-06-09 23:01:48 -07:00 · 2026-06-09 23:01:48 -07:00 · ae13322648
commit ae13322648
parent 6e00bf03d3
3 changed files with 49 additions and 1 deletions
--- a/.env.example
+++ b/.env.example
@ -42,6 +42,23 @@
 # TURNSTONE_EMBED_MODEL=BAAI/bge-small-en-v1.5
 # TURNSTONE_EMBED_DEVICE=cpu
 # --- Anomaly scoring pipeline (IDS / watchdog) ---
 # Batch-scores every ingested log entry after each glean cycle.
 # Any HuggingFace text-classification model works; the byviz classifier (already
 # required by the diagnose pipeline) is the recommended starting point.
 # Detections above the threshold are inserted into the detections table and
 # surfaced in the Security Alerts tab.
 #
 # Set TURNSTONE_ANOMALY_MODEL to enable; leave unset to disable (safe default).
 # TURNSTONE_ANOMALY_MODEL=byviz/bylastic_classification_logs
 # TURNSTONE_ANOMALY_DEVICE=cpu          # or "cuda" / "mps" for GPU inference
 # TURNSTONE_ANOMALY_THRESHOLD=0.80      # confidence floor for detection insertion (compose/standalone fallback: 0.75)
 # TURNSTONE_ANOMALY_INTERVAL=0          # interval of the standalone scoring loop (0 = glean-triggered only)
 #
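 # HuggingFace model cache — share the host's cache with the container to avoid
 # re-downloading models. docker-compose.yml bind-mounts it read-only, so the
 # container cannot add models to it; pre-populate the cache on the host.
 # HF_HOME=/hf_cache                     # cache path inside the container (fixed in docker-compose.yml)
 # HF_CACHE_PATH=/Library/Assets/LLM     # host bind-mount source (docker-compose only;
 #                                       # docker-standalone.sh reads HF_CACHE_DIR instead)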
 # --- Air-gapped / offline deployment ---
 # Set to 1 to block all HuggingFace hub network access at runtime.
 # Pre-download models to ~/.cache/huggingface/ before deploying — see docs/air-gapped-deployment.md.
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -41,9 +41,23 @@ services:
      # Values of the form ${NAME:-fallback} are interpolated from the host
      # environment / .env file; the fallback applies when NAME is unset or empty.
      TURNSTONE_GLEAN_INTERVAL: ${TURNSTONE_GLEAN_INTERVAL:-900}
      TURNSTONE_SOURCE_HOST: ${TURNSTONE_SOURCE_HOST:-}
      TURNSTONE_SUBMIT_ENDPOINT: ${TURNSTONE_SUBMIT_ENDPOINT:-}
      # --- Multi-agent diagnose pipeline ---
      TURNSTONE_MULTI_AGENT_DIAGNOSE: ${TURNSTONE_MULTI_AGENT_DIAGNOSE:-false}
      TURNSTONE_CLASSIFIER_MODEL: ${TURNSTONE_CLASSIFIER_MODEL:-}
      TURNSTONE_EMBED_BACKEND: ${TURNSTONE_EMBED_BACKEND:-}
      TURNSTONE_EMBED_MODEL: ${TURNSTONE_EMBED_MODEL:-}
      TURNSTONE_EMBED_DEVICE: ${TURNSTONE_EMBED_DEVICE:-cpu}
      # --- Anomaly scoring pipeline ---
      # The model falls back to an empty value; .env.example documents leaving
      # TURNSTONE_ANOMALY_MODEL unset as the way to keep anomaly scoring disabled.
      TURNSTONE_ANOMALY_MODEL: ${TURNSTONE_ANOMALY_MODEL:-}
      TURNSTONE_ANOMALY_DEVICE: ${TURNSTONE_ANOMALY_DEVICE:-cpu}
      # Confidence floor for inserting a detection; fallback is 0.75.
      TURNSTONE_ANOMALY_THRESHOLD: ${TURNSTONE_ANOMALY_THRESHOLD:-0.75}
      # 0 = no standalone scoring loop (glean-triggered only, per .env.example).
      TURNSTONE_ANOMALY_INTERVAL: ${TURNSTONE_ANOMALY_INTERVAL:-0}
      # --- HuggingFace model cache ---
      # Hard-coded rather than interpolated: it must match the container-side
      # path of the model-cache bind mount in this service's volumes list.
      HF_HOME: /hf_cache
    # Bind mounts for the service: pattern files (read-only), the data
    # directory, and the shared HuggingFace model cache (read-only).
    volumes:
      - ./patterns:/app/patterns:ro
      - ./data:/app/data  # optional: persists SQLite files if DATABASE_URL unset
      # Host model cache mounted at /hf_cache, the path HF_HOME is set to.
      # The fallback source path is specific to one host (Heimdall); set
      # HF_CACHE_PATH in .env when deploying anywhere else.
      # NOTE(review): the mount is read-only, so the container cannot write
      # newly downloaded models into it — confirm every configured model is
      # already present in the host cache.
      - ${HF_CACHE_PATH:-/Library/Assets/LLM}:/hf_cache:ro  # shared model cache
 volumes:
  turnstone_pgdata:
--- a/docker-standalone.sh
+++ b/docker-standalone.sh
@ -62,7 +62,10 @@ set -euo pipefail
 REPO_DIR="${HOME}/turnstone"
 DATA_DIR="${REPO_DIR}/data"
 PATTERNS_DIR="${REPO_DIR}/patterns"
-HF_CACHE_DIR="${REPO_DIR}/hf-cache"   # persists downloaded ML models across restarts
+# HF_CACHE_DIR: override to a shared cache directory to avoid re-downloading models.
 # Example (Heimdall, where byviz/bylastic_classification_logs is already cached):
 #   export HF_CACHE_DIR=/Library/Assets/LLM
 HF_CACHE_DIR="${HF_CACHE_DIR:-${REPO_DIR}/hf-cache}"
 TZ="${TZ:-America/Los_Angeles}"
@ -83,6 +86,16 @@ TZ="${TZ:-America/Los_Angeles}"
 #   bash ~/turnstone/docker-standalone.sh
 #
 # ── Anomaly scoring pipeline (IDS / watchdog) ────────────────────────────────
 # Set TURNSTONE_ANOMALY_MODEL to enable automatic anomaly scoring after each
 # glean run.  The byviz classifier (already used by the diagnose pipeline) is
 # a good default — it's cached alongside the other models.
 #
 #   export TURNSTONE_ANOMALY_MODEL=byviz/bylastic_classification_logs
 #   export TURNSTONE_ANOMALY_THRESHOLD=0.80   # confidence floor (default 0.75)
 #   bash ~/turnstone/docker-standalone.sh
 #
 # ── Multi-agent diagnose pipeline ────────────────────────────────────────────
 # Enable the 5-stage ML pipeline to get smarter diagnose results.
 #
@ -134,6 +147,10 @@ docker run -d \
  -e TURNSTONE_EMBED_BACKEND="${TURNSTONE_EMBED_BACKEND:-sentence_transformers}" \
  -e TURNSTONE_EMBED_MODEL="${TURNSTONE_EMBED_MODEL:-sentence-transformers/all-MiniLM-L6-v2}" \
  -e TURNSTONE_EMBED_DEVICE="${TURNSTONE_EMBED_DEVICE:-cpu}" \
  -e TURNSTONE_ANOMALY_MODEL="${TURNSTONE_ANOMALY_MODEL:-}" \
  -e TURNSTONE_ANOMALY_DEVICE="${TURNSTONE_ANOMALY_DEVICE:-cpu}" \
  -e TURNSTONE_ANOMALY_THRESHOLD="${TURNSTONE_ANOMALY_THRESHOLD:-0.75}" \
  -e TURNSTONE_ANOMALY_INTERVAL="${TURNSTONE_ANOMALY_INTERVAL:-0}" \
  localhost/turnstone:latest
 echo ""