"""Tests for app/services/diagnose/hypothesizer.py — RootCauseHypothesizer. All tests use mocking; no real LLM calls are made. """ from __future__ import annotations import json import re from typing import Any from unittest.mock import MagicMock, patch import pytest from app.context.retriever import RetrievedContext from app.services.diagnose.hypothesizer import RootCauseHypothesizer from app.services.diagnose.models import ( ClassifiedTimeline, EventCluster, Hypothesis, TimelineResult, ) # --------------------------------------------------------------------------- # Fixture helpers # --------------------------------------------------------------------------- def _make_cluster( cluster_id: str = "c1", representative_text: str = "kernel: oom-killer invoked", severity: str = "ERROR", source_ids: tuple[str, ...] = ("syslog",), pattern_tags: tuple[str, ...] = ("oom",), start_iso: str | None = "2024-01-01T00:00:00+00:00", ) -> EventCluster: return EventCluster( cluster_id=cluster_id, entries=("e1",), start_iso=start_iso, end_iso=None, duration_seconds=1.0, source_ids=source_ids, pattern_tags=pattern_tags, severity=severity, # type: ignore[arg-type] burst=False, gap_before_seconds=0.0, representative_text=representative_text, ) def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult: return TimelineResult( clusters=clusters, total_entries=len(clusters), window_start=None, window_end=None, gap_count=0, burst_count=0, dominant_sources=(), ) def _make_classified( clusters: tuple[EventCluster, ...] = (), cluster_severities: dict | None = None, ) -> ClassifiedTimeline: if cluster_severities is None: cluster_severities = {c.cluster_id: c.severity for c in clusters} return ClassifiedTimeline( timeline=_make_timeline(clusters), cluster_severities=cluster_severities, classifier_used="pattern_tags", model_id=None, ) def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext: return RetrievedContext( facts=[], chunks=chunks or [{"text": "Memory pressure runbook.", "filename": "runbook.md"}], ) def _llm_json_response(items: list[dict[str, Any]]) -> MagicMock: """Build a mock httpx.Response that returns the given list as JSON.""" mock_resp = MagicMock() mock_resp.status_code = 200 mock_resp.json.return_value = { "choices": [{"message": {"content": json.dumps(items)}}] } return mock_resp _SAMPLE_HYPOTHESES = [ { "title": "OOM killer terminated critical process", "description": "The kernel invoked the OOM killer due to memory exhaustion. A process was terminated unexpectedly. This caused service disruption.", "confidence": 0.85, "severity": "CRITICAL", "supporting_clusters": ["c1"], }, { "title": "Disk I/O saturation", "description": "High disk I/O latency was detected. Write operations stalled causing log backpressure. Check iostat for device utilisation.", "confidence": 0.6, "severity": "ERROR", "supporting_clusters": ["c2"], }, ] # --------------------------------------------------------------------------- # Test 1: Valid JSON response returns correct Hypothesis objects # --------------------------------------------------------------------------- def test_valid_json_response_returns_hypotheses(): """Valid LLM JSON array produces a list of Hypothesis objects with correct fields.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() mock_resp = _llm_json_response(_SAMPLE_HYPOTHESES) with patch("httpx.post", return_value=mock_resp): results = hypothesizer.hypothesize( classified, ctx, query="why is memory failing?", llm_url="http://localhost:11434", llm_model="llama3", ) assert len(results) == 2 assert isinstance(results[0], Hypothesis) assert results[0].title == "OOM killer terminated critical process" assert results[0].confidence == pytest.approx(0.85) assert results[0].severity == "CRITICAL" assert results[0].supporting_cluster_ids == ("c1",) assert results[1].title == "Disk I/O saturation" assert results[1].severity == "ERROR" # --------------------------------------------------------------------------- # Test 2: hypothesis_id is a non-empty UUID string on each result # --------------------------------------------------------------------------- _UUID_RE = re.compile( r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$" ) def test_hypothesis_id_is_uuid(): """Each returned Hypothesis carries a distinct UUID v4 hypothesis_id.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() mock_resp = _llm_json_response(_SAMPLE_HYPOTHESES) with patch("httpx.post", return_value=mock_resp): results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert len(results) == 2 for h in results: assert h.hypothesis_id, "hypothesis_id must not be empty" assert _UUID_RE.match(h.hypothesis_id), ( f"hypothesis_id {h.hypothesis_id!r} is not a UUID v4" ) # Each ID must be distinct ids = [h.hypothesis_id for h in results] assert len(set(ids)) == len(ids), "hypothesis_ids must be unique" # --------------------------------------------------------------------------- # Test 3: Malformed JSON response returns [] with a logged warning # --------------------------------------------------------------------------- def test_malformed_json_returns_empty_and_warns(caplog): """When the LLM returns non-JSON text, hypothesize() returns [] and logs a warning.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() bad_resp = MagicMock() bad_resp.status_code = 200 bad_resp.json.return_value = { "choices": [{"message": {"content": "not valid json"}}] } import logging with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=bad_resp): results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert results == [] assert any("invalid JSON" in r.message or "JSON" in r.message for r in caplog.records) # --------------------------------------------------------------------------- # Test 4: Non-list JSON (dict) returns [] # --------------------------------------------------------------------------- def test_non_list_json_returns_empty(caplog): """When the LLM returns a JSON object instead of an array, hypothesize() returns [].""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() dict_resp = MagicMock() dict_resp.status_code = 200 dict_resp.json.return_value = { "choices": [{"message": {"content": '{"error": "oops"}'}}] } import logging with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=dict_resp): results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert results == [] assert any("array" in r.message.lower() or "list" in r.message.lower() for r in caplog.records) # --------------------------------------------------------------------------- # Test 5: Empty clusters returns [] without any LLM call # --------------------------------------------------------------------------- def test_empty_clusters_returns_empty_no_llm_call(): """ClassifiedTimeline with no clusters returns [] and never calls the LLM.""" classified = _make_classified(clusters=()) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() with patch("httpx.post") as mock_post: results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert results == [] mock_post.assert_not_called() # --------------------------------------------------------------------------- # Test 6: No LLM URL returns [] without any HTTP call # --------------------------------------------------------------------------- def test_no_llm_url_returns_empty_no_http_call(): """When llm_url is None, hypothesize() returns [] immediately with no HTTP requests.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() with patch("httpx.post") as mock_post: results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url=None, llm_model="llama3", ) assert results == [] mock_post.assert_not_called() def test_empty_llm_url_returns_empty_no_http_call(): """When llm_url is empty string, hypothesize() returns [] immediately.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() with patch("httpx.post") as mock_post: results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="", llm_model="llama3", ) assert results == [] mock_post.assert_not_called() def test_no_llm_model_returns_empty_no_http_call(): """When llm_model is None, hypothesize() returns [] immediately.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() with patch("httpx.post") as mock_post: results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model=None, ) assert results == [] mock_post.assert_not_called() # --------------------------------------------------------------------------- # Test 7: max_hypotheses is respected # --------------------------------------------------------------------------- def test_max_hypotheses_respected(): """When LLM returns more items than max_hypotheses, only max_hypotheses are returned.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer(max_hypotheses=3) six_items = [ { "title": f"Hypothesis {i}", "description": "Some description. A second sentence. Third sentence here.", "confidence": 0.5, "severity": "ERROR", "supporting_clusters": ["c1"], } for i in range(6) ] mock_resp = _llm_json_response(six_items) with patch("httpx.post", return_value=mock_resp): results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert len(results) == 3 # --------------------------------------------------------------------------- # Test 8: Severity validation — WARNING → WARN, garbage → ERROR # --------------------------------------------------------------------------- def test_severity_warning_maps_to_warn(): """'WARNING' from the LLM is normalised to 'WARN'.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() items = [ { "title": "A warning severity hypothesis", "description": "Test description. Second sentence. Third.", "confidence": 0.7, "severity": "WARNING", "supporting_clusters": ["c1"], } ] mock_resp = _llm_json_response(items) with patch("httpx.post", return_value=mock_resp): results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert len(results) == 1 assert results[0].severity == "WARN" def test_severity_garbage_maps_to_error(): """An unrecognised severity string from the LLM defaults to 'ERROR'.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() items = [ { "title": "A garbage severity hypothesis", "description": "Test description. Second sentence. Third.", "confidence": 0.4, "severity": "GARBAGE", "supporting_clusters": ["c1"], } ] mock_resp = _llm_json_response(items) with patch("httpx.post", return_value=mock_resp): results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert len(results) == 1 assert results[0].severity == "ERROR" # --------------------------------------------------------------------------- # Test 9: Confidence field works with string floats from the LLM # --------------------------------------------------------------------------- def test_confidence_string_float_coercion(): """A confidence value returned as a string by the LLM is coerced to float via float().""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() items = [ { "title": "String confidence test", "description": "Some description. Second sentence. Third.", "confidence": "0.8", # LLM returned a string, not a float "severity": "INFO", "supporting_clusters": ["c1"], } ] mock_resp = _llm_json_response(items) with patch("httpx.post", return_value=mock_resp): results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert len(results) == 1 assert isinstance(results[0].confidence, float) assert results[0].confidence == pytest.approx(0.8) # --------------------------------------------------------------------------- # Test 10: Non-numeric confidence string falls back to default 0.5 # --------------------------------------------------------------------------- def test_non_numeric_confidence_uses_default(): """LLM returning 'high' for confidence should not raise and defaults to 0.5.""" cluster = _make_cluster() classified = _make_classified(clusters=(cluster,)) ctx = _make_ctx() hypothesizer = RootCauseHypothesizer() items = [ { "title": "t", "description": "d", "confidence": "high", "severity": "ERROR", "supporting_clusters": [], } ] mock_resp = _llm_json_response(items) with patch("httpx.post", return_value=mock_resp): results = hypothesizer.hypothesize( classified, ctx, query="test", llm_url="http://localhost:11434", llm_model="llama3", ) assert len(results) == 1 assert isinstance(results[0].confidence, float) assert results[0].confidence == pytest.approx(0.5)