- #33: Wrap ClassifiedTimeline.cluster_severities in MappingProxyType for true immutability (frozen=True only blocks field reassignment, not dict mutation). - #34: Remove dead suppression branch in synthesizer._build_hypothesis_block. active[] is already filtered to not rh.suppress, so the 'Yes — suppressed' branch was unreachable. Now shows novelty score only. - #35: Extract shared _llm_client.py with call_llm() + extract_content() + strip_json_fences(). Both RootCauseHypothesizer and SummarySynthesizer now import from one source. Also strips JSON fences from LLM output before parsing in hypothesizer._parse_response. - #36: Add per-stage try/except in pipeline.run_pipeline(). Unhandled stage exceptions now emit {type: 'error'} + {type: 'done'} SSE events instead of silently closing the stream. - #37: Move format_context_block() call inside the legacy LLM branch in diagnose/__init__.py — it was being computed unconditionally but only used in the non-pipeline path. - #38: Coerce supporting_cluster_ids items to str() in hypothesizer _parse_response to guard against LLMs returning integers instead of string cluster IDs.
251 lines
8.4 KiB
Python
251 lines
8.4 KiB
Python
"""Stage 2: Severity Classifier — ML with two fallback levels.
|
|
|
|
Classification strategy (in priority order):
|
|
|
|
Path A — ML: Hugging Face text-classification pipeline, loaded lazily.
|
|
Path B — pattern_tags: Map cluster.pattern_tags through the loaded pattern
|
|
severity dict; pick the highest severity across matching tags.
|
|
Path C — regex: Call detect_severity() from app.glean.base on the cluster's
|
|
representative_text.
|
|
|
|
Each cluster is classified independently. The ``classifier_used`` field on the
|
|
returned ``ClassifiedTimeline`` reflects the primary path (the one that governed
|
|
the overall classification session, not individual cluster fallbacks).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from types import MappingProxyType
|
|
|
|
from app.services.diagnose.models import (
|
|
ClassifiedTimeline,
|
|
EventCluster,
|
|
SeverityLabel,
|
|
TimelineResult,
|
|
)
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Module-level ML singleton — reset to None between tests via the fixture
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_ml_classifier: Any | None = None
|
|
|
|
|
|
def _get_ml_classifier(model_id: str, device: str) -> Any:
|
|
"""Return the cached HF pipeline, loading it on first call."""
|
|
global _ml_classifier # noqa: PLW0603
|
|
if _ml_classifier is None:
|
|
from transformers import pipeline as hf_pipeline # type: ignore[import-untyped]
|
|
|
|
_ml_classifier = hf_pipeline(
|
|
"text-classification", model=model_id, device=device
|
|
)
|
|
return _ml_classifier
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Label mapping
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_LABEL_MAP: dict[str, SeverityLabel] = {
|
|
"ERROR": "ERROR",
|
|
"WARNING": "WARN",
|
|
"WARN": "WARN",
|
|
"INFO": "INFO",
|
|
"DEBUG": "DEBUG",
|
|
"CRITICAL": "CRITICAL",
|
|
}
|
|
|
|
_CRITICAL_KEYWORDS: frozenset[str] = frozenset(
|
|
{
|
|
"panic",
|
|
"oom",
|
|
"fatal",
|
|
"critical",
|
|
"kernel panic",
|
|
"out of memory",
|
|
"segfault",
|
|
"segmentation fault",
|
|
}
|
|
)
|
|
|
|
_SEVERITY_ORDER: dict[str | None, int] = {
|
|
"CRITICAL": 5,
|
|
"ERROR": 4,
|
|
"WARN": 3,
|
|
"WARNING": 3,
|
|
"INFO": 2,
|
|
"DEBUG": 1,
|
|
None: 0,
|
|
}
|
|
|
|
|
|
def _map_label(label: str, score: float, text: str) -> SeverityLabel:
|
|
"""Apply the severity shim: promote to CRITICAL or demote to DEBUG where warranted."""
|
|
upper = label.upper()
|
|
if upper == "ERROR" and score > 0.95 and any(
|
|
k in text.lower() for k in _CRITICAL_KEYWORDS
|
|
):
|
|
return "CRITICAL"
|
|
if upper == "INFO" and score < 0.4:
|
|
return "DEBUG"
|
|
return _LABEL_MAP.get(upper, "UNKNOWN") # type: ignore[return-value]
|
|
|
|
|
|
def _highest_from_tags(
|
|
tags: tuple[str, ...], severity_map: dict[str, str]
|
|
) -> SeverityLabel | None:
|
|
"""Return the highest severity from the pattern_tags that appear in severity_map."""
|
|
best: str | None = None
|
|
best_rank = -1
|
|
for tag in tags:
|
|
sev = severity_map.get(tag)
|
|
rank = _SEVERITY_ORDER.get(sev, 0)
|
|
if rank > best_rank:
|
|
best_rank = rank
|
|
best = sev
|
|
if best is None:
|
|
return None
|
|
normalised = "WARN" if best.upper() == "WARNING" else best.upper()
|
|
return normalised # type: ignore[return-value]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SeverityClassifier
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class SeverityClassifier:
|
|
"""Classify each EventCluster's severity using ML, patterns, or regex fallback.
|
|
|
|
Parameters
|
|
----------
|
|
model_id:
|
|
Hugging Face model identifier. When empty (default), ML is skipped.
|
|
device:
|
|
Torch device string passed to the HF pipeline (e.g. ``"cpu"`` or ``"cuda:0"``).
|
|
pattern_file:
|
|
Path to the YAML pattern file. When ``None`` the classifier reads
|
|
``TURNSTONE_PATTERNS`` env var (same logic as ``app/rest.py``).
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
model_id: str = "",
|
|
device: str = "cpu",
|
|
pattern_file: Path | None = None,
|
|
) -> None:
|
|
self._model_id = model_id
|
|
self._device = device
|
|
self._pattern_file: Path | None = pattern_file
|
|
self._pattern_severity: dict[str, str] = {}
|
|
self._patterns_loaded = False
|
|
|
|
# ------------------------------------------------------------------
|
|
# Lazy loaders
|
|
# ------------------------------------------------------------------
|
|
|
|
def _resolve_pattern_file(self) -> Path | None:
|
|
"""Resolve pattern file from constructor arg or env var."""
|
|
if self._pattern_file is not None:
|
|
return self._pattern_file
|
|
env_dir = os.environ.get("TURNSTONE_PATTERNS")
|
|
if env_dir:
|
|
return Path(env_dir) / "default.yaml"
|
|
return None
|
|
|
|
def _ensure_patterns_loaded(self) -> None:
|
|
"""Populate _pattern_severity from the pattern YAML file (once)."""
|
|
if self._patterns_loaded:
|
|
return
|
|
self._patterns_loaded = True
|
|
path = self._resolve_pattern_file()
|
|
if path is None:
|
|
return
|
|
from app.glean.base import load_patterns
|
|
|
|
patterns = load_patterns(path)
|
|
self._pattern_severity = {p.name: p.severity for p in patterns}
|
|
|
|
# ------------------------------------------------------------------
|
|
# Per-cluster classification helpers
|
|
# ------------------------------------------------------------------
|
|
|
|
def _classify_cluster_ml(self, cluster: EventCluster) -> SeverityLabel | None:
|
|
"""Attempt ML classification. Returns None on any inference failure."""
|
|
try:
|
|
pipe = _get_ml_classifier(self._model_id, self._device)
|
|
results = pipe(cluster.representative_text)
|
|
if not results:
|
|
return None
|
|
hit = results[0]
|
|
return _map_label(hit["label"], hit["score"], cluster.representative_text)
|
|
except Exception: # noqa: BLE001
|
|
logger.warning(
|
|
"ML inference failed for cluster %s — falling back",
|
|
cluster.cluster_id,
|
|
)
|
|
return None
|
|
|
|
def _classify_cluster_pattern_tags(
|
|
self, cluster: EventCluster
|
|
) -> SeverityLabel | None:
|
|
"""Derive severity from the cluster's pattern_tags. Returns None if no match."""
|
|
return _highest_from_tags(cluster.pattern_tags, self._pattern_severity)
|
|
|
|
def _classify_cluster_regex(self, cluster: EventCluster) -> SeverityLabel:
|
|
"""Classify by scanning representative_text with the severity regex."""
|
|
from app.glean.base import detect_severity
|
|
|
|
raw = detect_severity(cluster.representative_text)
|
|
if raw is None:
|
|
return "INFO"
|
|
return _LABEL_MAP.get(raw.upper(), "INFO") # type: ignore[return-value]
|
|
|
|
# ------------------------------------------------------------------
|
|
# Public API
|
|
# ------------------------------------------------------------------
|
|
|
|
def classify(self, timeline: TimelineResult) -> ClassifiedTimeline:
|
|
"""Classify every cluster in *timeline* and return a ``ClassifiedTimeline``."""
|
|
self._ensure_patterns_loaded()
|
|
|
|
# Determine which primary path governs this session
|
|
ml_available = bool(self._model_id)
|
|
patterns_available = bool(self._pattern_severity)
|
|
|
|
if ml_available:
|
|
classifier_used: str = "ml"
|
|
elif patterns_available:
|
|
classifier_used = "pattern_tags"
|
|
else:
|
|
classifier_used = "regex"
|
|
|
|
cluster_severities: dict[str, SeverityLabel] = {}
|
|
|
|
for cluster in timeline.clusters:
|
|
severity: SeverityLabel | None = None
|
|
|
|
if ml_available:
|
|
severity = self._classify_cluster_ml(cluster)
|
|
|
|
if severity is None and patterns_available:
|
|
severity = self._classify_cluster_pattern_tags(cluster)
|
|
|
|
if severity is None:
|
|
severity = self._classify_cluster_regex(cluster)
|
|
|
|
cluster_severities[cluster.cluster_id] = severity
|
|
|
|
return ClassifiedTimeline(
|
|
timeline=timeline,
|
|
cluster_severities=MappingProxyType(cluster_severities),
|
|
classifier_used=classifier_used, # type: ignore[arg-type]
|
|
model_id=self._model_id if ml_available else None,
|
|
)
|