# turnstone/tests/test_diagnose_hypothesizer.py

"""Tests for app/services/diagnose/hypothesizer.py — RootCauseHypothesizer.

All tests use mocking; no real LLM calls are made.
"""
from __future__ import annotations

import json
import re
from typing import Any
from unittest.mock import MagicMock, patch

import pytest

from app.context.retriever import RetrievedContext
from app.services.diagnose.hypothesizer import RootCauseHypothesizer
from app.services.diagnose.models import (
    ClassifiedTimeline,
    EventCluster,
    Hypothesis,
    TimelineResult,
)


# ---------------------------------------------------------------------------
# Fixture helpers
# ---------------------------------------------------------------------------


def _make_cluster(
    cluster_id: str = "c1",
    representative_text: str = "kernel: oom-killer invoked",
    severity: str = "ERROR",
    source_ids: tuple[str, ...] = ("syslog",),
    pattern_tags: tuple[str, ...] = ("oom",),
    start_iso: str | None = "2024-01-01T00:00:00+00:00",
) -> EventCluster:
    """Construct an EventCluster for tests, defaulting to a single OOM-style event.

    The caller may override the identity, text, severity, sources, tags and
    start timestamp; every other field is pinned to a fixed value below.
    """
    return EventCluster(
        # Caller-controlled fields.
        cluster_id=cluster_id,
        representative_text=representative_text,
        severity=severity,  # type: ignore[arg-type]
        source_ids=source_ids,
        pattern_tags=pattern_tags,
        start_iso=start_iso,
        # Fixed fields: one entry, open-ended, one second long, no burst/gap.
        entries=("e1",),
        end_iso=None,
        duration_seconds=1.0,
        gap_before_seconds=0.0,
        burst=False,
    )


def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
    """Wrap *clusters* in a TimelineResult with no window, gaps, bursts or sources.

    ``total_entries`` is set to the number of clusters passed in.
    """
    entry_count = len(clusters)
    return TimelineResult(
        clusters=clusters,
        total_entries=entry_count,
        dominant_sources=(),
        window_start=None,
        window_end=None,
        burst_count=0,
        gap_count=0,
    )


def _make_classified(
    clusters: tuple[EventCluster, ...] = (),
    cluster_severities: dict | None = None,
) -> ClassifiedTimeline:
    """Build a ClassifiedTimeline around *clusters*.

    When *cluster_severities* is ``None`` the mapping is derived from the
    clusters themselves (``cluster_id`` -> ``severity``); an explicitly
    supplied mapping, including an empty one, is used as given.
    """
    severities = (
        {cluster.cluster_id: cluster.severity for cluster in clusters}
        if cluster_severities is None
        else cluster_severities
    )
    return ClassifiedTimeline(
        timeline=_make_timeline(clusters),
        cluster_severities=severities,
        classifier_used="pattern_tags",
        model_id=None,
    )


def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext:
    """Return a RetrievedContext with no facts and the given (or a default) chunk list.

    Any falsy *chunks* value (``None`` or an empty list) is replaced by a
    single runbook chunk.
    """
    fallback_chunks = [{"text": "Memory pressure runbook.", "filename": "runbook.md"}]
    selected_chunks = chunks if chunks else fallback_chunks
    return RetrievedContext(facts=[], chunks=selected_chunks)


def _llm_json_response(items: list[dict[str, Any]]) -> MagicMock:
    """Return a mock HTTP response whose ``.json()`` yields a chat-completion payload.

    *items* is serialised with ``json.dumps`` and placed at
    ``choices[0].message.content``; ``status_code`` is 200.
    """
    payload = {"choices": [{"message": {"content": json.dumps(items)}}]}
    response = MagicMock(status_code=200)
    response.json.return_value = payload
    return response


# Canned LLM output used by several tests: two hypothesis dicts with the keys
# title / description / confidence / severity / supporting_clusters.
# The first entry cites cluster "c1" (the default id from _make_cluster); the
# second cites "c2", which the tests in this file never create.
_SAMPLE_HYPOTHESES = [
    {
        "title": "OOM killer terminated critical process",
        "description": "The kernel invoked the OOM killer due to memory exhaustion. A process was terminated unexpectedly. This caused service disruption.",
        "confidence": 0.85,
        "severity": "CRITICAL",
        "supporting_clusters": ["c1"],
    },
    {
        "title": "Disk I/O saturation",
        "description": "High disk I/O latency was detected. Write operations stalled causing log backpressure. Check iostat for device utilisation.",
        "confidence": 0.6,
        "severity": "ERROR",
        "supporting_clusters": ["c2"],
    },
]


# ---------------------------------------------------------------------------
# Test 1: Valid JSON response returns correct Hypothesis objects
# ---------------------------------------------------------------------------


def test_valid_json_response_returns_hypotheses():
    """A well-formed JSON array from the LLM becomes Hypothesis objects, in order."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()
    canned_reply = _llm_json_response(_SAMPLE_HYPOTHESES)
    endpoint = {"llm_url": "http://localhost:11434", "llm_model": "llama3"}

    with patch("httpx.post", return_value=canned_reply):
        hypotheses = subject.hypothesize(
            timeline, context, query="why is memory failing?", **endpoint
        )

    assert len(hypotheses) == 2
    first = hypotheses[0]
    second = hypotheses[1]

    # First hypothesis: every parsed field is checked.
    assert isinstance(first, Hypothesis)
    assert first.title == "OOM killer terminated critical process"
    assert first.confidence == pytest.approx(0.85)
    assert first.severity == "CRITICAL"
    assert first.supporting_cluster_ids == ("c1",)

    # Second hypothesis: title and severity only.
    assert second.title == "Disk I/O saturation"
    assert second.severity == "ERROR"


# ---------------------------------------------------------------------------
# Test 2: hypothesis_id is a non-empty UUID string on each result
# ---------------------------------------------------------------------------


# Matches a whole string that is a lowercase, hyphenated 8-4-4-4-12 hex UUID
# whose third group starts with "4" (version) and whose fourth group starts
# with 8, 9, a or b (variant). Uppercase hex digits are not accepted.
_UUID_RE = re.compile(
    r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"
)


def test_hypothesis_id_is_uuid():
    """Every Hypothesis gets a non-empty hypothesis_id matching _UUID_RE, with no repeats."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()
    canned_reply = _llm_json_response(_SAMPLE_HYPOTHESES)
    endpoint = {"llm_url": "http://localhost:11434", "llm_model": "llama3"}

    with patch("httpx.post", return_value=canned_reply):
        hypotheses = subject.hypothesize(timeline, context, query="test", **endpoint)

    assert len(hypotheses) == 2

    seen_ids = []
    for hypothesis in hypotheses:
        identifier = hypothesis.hypothesis_id
        assert identifier, "hypothesis_id must not be empty"
        assert _UUID_RE.match(identifier), (
            f"hypothesis_id {identifier!r} is not a UUID v4"
        )
        seen_ids.append(identifier)

    # No two hypotheses may share an identifier.
    assert len(seen_ids) == len(set(seen_ids)), "hypothesis_ids must be unique"


# ---------------------------------------------------------------------------
# Test 3: Malformed JSON response returns [] with a logged warning
# ---------------------------------------------------------------------------


def test_malformed_json_returns_empty_and_warns(caplog):
    """Non-JSON LLM content yields [] and a captured log record mentioning JSON."""
    import logging

    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()

    # HTTP 200 whose message content is plain text rather than JSON.
    garbage_reply = MagicMock(status_code=200)
    garbage_reply.json.return_value = {
        "choices": [{"message": {"content": "not valid json"}}]
    }

    with caplog.at_level(logging.WARNING):
        with patch("httpx.post", return_value=garbage_reply):
            hypotheses = subject.hypothesize(
                timeline,
                context,
                query="test",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )

    assert hypotheses == []
    logged = [record.message for record in caplog.records]
    assert any("invalid JSON" in text or "JSON" in text for text in logged)


# ---------------------------------------------------------------------------
# Test 4: Non-list JSON (dict) returns []
# ---------------------------------------------------------------------------


def test_non_list_json_returns_empty(caplog):
    """A JSON object (not an array) from the LLM yields [] and a log mentioning array/list."""
    import logging

    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()

    # Valid JSON, but the top-level value is an object instead of an array.
    object_reply = MagicMock(status_code=200)
    object_reply.json.return_value = {
        "choices": [{"message": {"content": '{"error": "oops"}'}}]
    }

    with caplog.at_level(logging.WARNING):
        with patch("httpx.post", return_value=object_reply):
            hypotheses = subject.hypothesize(
                timeline,
                context,
                query="test",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )

    assert hypotheses == []
    lowered = [record.message.lower() for record in caplog.records]
    assert any("array" in text or "list" in text for text in lowered)


# ---------------------------------------------------------------------------
# Test 5: Empty clusters returns [] without any LLM call
# ---------------------------------------------------------------------------


def test_empty_clusters_returns_empty_no_llm_call():
    """A timeline without clusters short-circuits to [] and httpx.post is never invoked."""
    empty_timeline = _make_classified(clusters=())
    context = _make_ctx()
    subject = RootCauseHypothesizer()
    endpoint = {"llm_url": "http://localhost:11434", "llm_model": "llama3"}

    with patch("httpx.post") as post_spy:
        hypotheses = subject.hypothesize(
            empty_timeline, context, query="test", **endpoint
        )

    post_spy.assert_not_called()
    assert hypotheses == []


# ---------------------------------------------------------------------------
# Test 6: No LLM URL returns [] without any HTTP call
# ---------------------------------------------------------------------------


def test_no_llm_url_returns_empty_no_http_call():
    """With llm_url=None the result is [] and httpx.post is never invoked."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()
    endpoint = {"llm_url": None, "llm_model": "llama3"}

    with patch("httpx.post") as post_spy:
        hypotheses = subject.hypothesize(timeline, context, query="test", **endpoint)

    post_spy.assert_not_called()
    assert hypotheses == []


def test_empty_llm_url_returns_empty_no_http_call():
    """With llm_url set to an empty string the result is [] and httpx.post is never invoked."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()
    endpoint = {"llm_url": "", "llm_model": "llama3"}

    with patch("httpx.post") as post_spy:
        hypotheses = subject.hypothesize(timeline, context, query="test", **endpoint)

    post_spy.assert_not_called()
    assert hypotheses == []


def test_no_llm_model_returns_empty_no_http_call():
    """With llm_model=None the result is [] and httpx.post is never invoked."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()
    endpoint = {"llm_url": "http://localhost:11434", "llm_model": None}

    with patch("httpx.post") as post_spy:
        hypotheses = subject.hypothesize(timeline, context, query="test", **endpoint)

    post_spy.assert_not_called()
    assert hypotheses == []


# ---------------------------------------------------------------------------
# Test 7: max_hypotheses is respected
# ---------------------------------------------------------------------------


def test_max_hypotheses_respected():
    """Six items from the LLM with max_hypotheses=3 produce exactly three results."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer(max_hypotheses=3)

    # Every item is identical apart from its numbered title.
    shared_fields = {
        "description": "Some description. A second sentence. Third sentence here.",
        "confidence": 0.5,
        "severity": "ERROR",
        "supporting_clusters": ["c1"],
    }
    oversized_payload = [
        {"title": f"Hypothesis {index}", **shared_fields} for index in range(6)
    ]
    canned_reply = _llm_json_response(oversized_payload)

    with patch("httpx.post", return_value=canned_reply):
        hypotheses = subject.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(hypotheses) == 3


# ---------------------------------------------------------------------------
# Test 8: Severity validation — WARNING → WARN, garbage → ERROR
# ---------------------------------------------------------------------------


def test_severity_warning_maps_to_warn():
    """A hypothesis whose LLM severity is 'WARNING' comes back with severity 'WARN'."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()

    llm_item = {
        "title": "A warning severity hypothesis",
        "description": "Test description. Second sentence. Third.",
        "confidence": 0.7,
        "severity": "WARNING",
        "supporting_clusters": ["c1"],
    }
    canned_reply = _llm_json_response([llm_item])

    with patch("httpx.post", return_value=canned_reply):
        hypotheses = subject.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(hypotheses) == 1
    only = hypotheses[0]
    assert only.severity == "WARN"


def test_severity_garbage_maps_to_error():
    """A hypothesis whose LLM severity is unrecognised comes back with severity 'ERROR'."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()

    llm_item = {
        "title": "A garbage severity hypothesis",
        "description": "Test description. Second sentence. Third.",
        "confidence": 0.4,
        "severity": "GARBAGE",
        "supporting_clusters": ["c1"],
    }
    canned_reply = _llm_json_response([llm_item])

    with patch("httpx.post", return_value=canned_reply):
        hypotheses = subject.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(hypotheses) == 1
    only = hypotheses[0]
    assert only.severity == "ERROR"


# ---------------------------------------------------------------------------
# Test 9: Confidence field works with string floats from the LLM
# ---------------------------------------------------------------------------


def test_confidence_string_float_coercion():
    """A confidence sent by the LLM as the string "0.8" is returned as the float 0.8."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    subject = RootCauseHypothesizer()

    llm_item = {
        "title": "String confidence test",
        "description": "Some description. Second sentence. Third.",
        # Deliberately a string, not a number.
        "confidence": "0.8",
        "severity": "INFO",
        "supporting_clusters": ["c1"],
    }
    canned_reply = _llm_json_response([llm_item])

    with patch("httpx.post", return_value=canned_reply):
        hypotheses = subject.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(hypotheses) == 1
    parsed_confidence = hypotheses[0].confidence
    assert isinstance(parsed_confidence, float)
    assert parsed_confidence == pytest.approx(0.8)