feat(pipeline): add TURNSTONE_CLASSIFIER_MODEL env var for Stage 2 ML config

Makes the HuggingFace classifier model used by Stage 2 configurable via
TURNSTONE_CLASSIFIER_MODEL. When the variable is unset or empty (the default),
Stage 2 falls back to pattern_tags and then regex, so no model download is
required on first run.

Also documents TURNSTONE_MULTI_AGENT_DIAGNOSE, TURNSTONE_CLASSIFIER_MODEL,
TURNSTONE_EMBED_BACKEND/MODEL/DEVICE in .env.example.
This commit is contained in:
pyr0ball 2026-05-25 19:11:32 -07:00
parent 85e7a70536
commit 2375e073ba
2 changed files with 23 additions and 1 deletions

View file

@ -26,3 +26,18 @@
# --- Periodic batch glean --- # --- Periodic batch glean ---
# Seconds between automatic glean runs from sources.yaml. Set to 0 to disable. # Seconds between automatic glean runs from sources.yaml. Set to 0 to disable.
# TURNSTONE_GLEAN_INTERVAL=900 # TURNSTONE_GLEAN_INTERVAL=900
# --- Multi-agent diagnose pipeline (experimental) ---
# Enable the 5-stage ML pipeline instead of the single-LLM summarize() call.
# TURNSTONE_MULTI_AGENT_DIAGNOSE=true
# Stage 2 — ML severity classifier (optional; when unset, Stage 2 falls back to pattern_tags, then regex).
# Recommended: byviz/bylastic_classification_logs (~300MB, downloaded from HuggingFace)
# TURNSTONE_CLASSIFIER_MODEL=byviz/bylastic_classification_logs
# Stage 4 — Embedding backend for false-positive suppression.
# sentence_transformers: in-process local model (downloads on first use)
# ollama: uses a running Ollama instance (no download needed if model is already pulled)
# TURNSTONE_EMBED_BACKEND=sentence_transformers
# TURNSTONE_EMBED_MODEL=BAAI/bge-small-en-v1.5
# TURNSTONE_EMBED_DEVICE=cpu

View file

@ -5,10 +5,17 @@ from __future__ import annotations
import asyncio import asyncio
import dataclasses import dataclasses
import logging import logging
import os
from collections.abc import AsyncGenerator from collections.abc import AsyncGenerator
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
# Optional ML classifier model for Stage 2.
# Read once at import time from TURNSTONE_CLASSIFIER_MODEL; surrounding whitespace
# is stripped so a padded or whitespace-only value (easy to produce in a .env file)
# is not passed on as a model ID.
# When empty (default), Stage 2 falls back to pattern_tags then regex.
# Set TURNSTONE_CLASSIFIER_MODEL to a HuggingFace model ID to enable ML classification.
# Recommended: byviz/bylastic_classification_logs (DistilBERT, ~300MB)
_CLASSIFIER_MODEL: str = os.environ.get("TURNSTONE_CLASSIFIER_MODEL", "").strip()
from app.context.retriever import RetrievedContext from app.context.retriever import RetrievedContext
from app.services.diagnose.classifier import SeverityClassifier from app.services.diagnose.classifier import SeverityClassifier
from app.services.diagnose.hypothesizer import RootCauseHypothesizer from app.services.diagnose.hypothesizer import RootCauseHypothesizer
@ -74,7 +81,7 @@ async def run_pipeline(
# Stage 2: Severity classification # Stage 2: Severity classification
try: try:
classified = await asyncio.to_thread( classified = await asyncio.to_thread(
SeverityClassifier().classify, timeline SeverityClassifier(model_id=_CLASSIFIER_MODEL).classify, timeline
) )
except Exception as exc: except Exception as exc:
logger.exception("Stage 2 (classifier) failed: %s", exc) logger.exception("Stage 2 (classifier) failed: %s", exc)