# turnstone/app/ingest/wazuh.py

"""Wazuh SIEM alert parser.

Handles Wazuh's alerts.json format (JSON Lines — one alert object per line):

    /var/ossec/logs/alerts/alerts.json  (on the Wazuh manager)

Each line is a complete JSON object. Key fields used:
    timestamp     — ISO 8601 with timezone offset ("2024-01-15T10:23:45.123+0000")
    rule.level    — 1-15 (maps to Turnstone severity)
    rule.id       — Wazuh rule ID
    rule.description — human-readable rule description (primary message text)
    rule.groups   — list of category tags
    agent.name    — hostname that generated the original event
    agent.ip      — agent IP address
    full_log      — original raw log line that triggered the alert
    location      — log file or input that was monitored
    data          — dict of decoded fields (srcip, dstip, url, etc.)
"""
from __future__ import annotations

import json
from datetime import datetime, timezone
from typing import Iterator

from app.ingest.base import (
    SourceState, apply_patterns, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

# Wazuh rule levels 1-15 → Turnstone severity labels.
# Levels < 4 are normally informational, 7+ begin to matter operationally,
# 10+ correspond to SIEM-worthy events, 13+ are critical.
_LEVEL_SEVERITY: dict[int, str] = {
    1:  "DEBUG",  2:  "DEBUG",  3:  "DEBUG",
    4:  "INFO",   5:  "INFO",   6:  "NOTICE",
    7:  "WARN",   8:  "WARN",   9:  "WARN",
    10: "ERROR",  11: "ERROR",  12: "ERROR",
    13: "CRITICAL", 14: "CRITICAL", 15: "CRITICAL",
}


def is_wazuh_alert(obj: dict) -> bool:
    """Heuristically decide whether a decoded JSON object is a Wazuh alert.

    The object qualifies when both its ``rule`` and ``agent`` entries are
    dicts and it carries at least one of the ``timestamp`` / ``manager`` keys.
    """
    # Both structural sections must be present as nested objects.
    for section in ("rule", "agent"):
        if not isinstance(obj.get(section), dict):
            return False
    # Either marker key is enough to accept the object.
    return "timestamp" in obj or "manager" in obj


def _parse_timestamp(raw: str) -> str:
    """Convert Wazuh's ISO 8601 timestamp to UTC ISO 8601."""
    if not raw:
        return ""
    for fmt in (
        "%Y-%m-%dT%H:%M:%S.%f%z",
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%Y-%m-%dT%H:%M:%SZ",
    ):
        try:
            dt = datetime.strptime(raw, fmt)
            return dt.astimezone(timezone.utc).isoformat()
        except ValueError:
            continue
    return raw


def _build_text(alert: dict) -> str:
    """Compose a readable, searchable text representation of the alert."""
    rule = alert.get("rule", {})
    agent = alert.get("agent", {})

    agent_name = agent.get("name", "unknown")
    agent_ip = agent.get("ip", "")
    rule_id = rule.get("id", "")
    rule_desc = rule.get("description", "(no description)")
    groups = rule.get("groups", [])
    location = alert.get("location", "")
    full_log = alert.get("full_log", "")

    parts: list[str] = []

    # Header line: agent + rule context
    agent_tag = f"{agent_name}/{agent_ip}" if agent_ip else agent_name
    group_tag = ",".join(groups) if groups else ""
    header = f"[wazuh][agent:{agent_tag}][rule:{rule_id}]"
    if group_tag:
        header += f"[{group_tag}]"
    parts.append(f"{header} {rule_desc}")

    if location:
        parts.append(f"location: {location}")

    # Extra decoded fields (srcip, dstip, url, user, etc.)
    data = alert.get("data", {})
    if isinstance(data, dict) and data:
        kv = " | ".join(f"{k}={v}" for k, v in sorted(data.items()) if v)
        if kv:
            parts.append(kv)

    if full_log and full_log.strip() != rule_desc.strip():
        parts.append(f"raw: {full_log.strip()}")

    return "\n".join(parts)


def parse(
    lines: Iterator[str],
    source_id: str,
    compiled_patterns: list[tuple[LogPattern, object]],
    ingest_time: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Parse Wazuh JSON Lines alerts into ``RetrievedEntry`` records.

    Blank lines, lines that are not valid JSON, JSON values that are not
    objects, and objects whose ``rule`` or ``agent`` is present but not an
    object are skipped silently so that one bad record never aborts the
    whole stream.

    Args:
        lines: Iterable of raw text lines, one alert per line.
        source_id: Base source identifier; the agent name is appended as
            ``"{source_id}:{agent_name}"`` when the alert carries one.
        compiled_patterns: Patterns handed to ``apply_patterns`` for each
            entry's text.
        ingest_time: Ingest timestamp recorded on every entry; defaults to
            ``now_iso()`` evaluated once when iteration starts.

    Yields:
        One ``RetrievedEntry`` per accepted alert.
    """
    ingest_time = ingest_time or now_iso()
    state = SourceState()

    for raw_line in lines:
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            alert = json.loads(raw_line)
        except json.JSONDecodeError:
            continue

        if not isinstance(alert, dict):
            continue

        rule = alert.get("rule", {})
        agent = alert.get("agent", {})
        # A present-but-malformed section (null, string, list) means this is
        # not a usable alert; skip it like any other unparseable line.
        if not isinstance(rule, dict) or not isinstance(agent, dict):
            continue

        # Keep timestamp_raw textual even when the alert carries a number/null.
        ts_raw = alert.get("timestamp", "")
        if not isinstance(ts_raw, str):
            ts_raw = "" if ts_raw is None else str(ts_raw)
        ts_iso = _parse_timestamp(ts_raw)

        # A missing, null, or non-numeric level falls back to 0, which is not
        # in the table and therefore maps to the default "INFO".
        try:
            level = int(rule.get("level", 0))
        except (TypeError, ValueError, OverflowError):
            level = 0
        severity = _LEVEL_SEVERITY.get(level, "INFO")

        # Qualify source_id by agent so logs from different hosts stay separate.
        agent_name = agent.get("name", "")
        src = f"{source_id}:{agent_name}" if agent_name else source_id

        text = _build_text(alert)
        if not text:
            continue

        repeat, out_of_order = state.observe(text, ts_iso)
        matched = apply_patterns(text, compiled_patterns)

        yield RetrievedEntry(
            entry_id=make_entry_id(src, state.sequence, text),
            source_id=src,
            sequence=state.sequence,
            timestamp_raw=ts_raw,
            timestamp_iso=ts_iso,
            ingest_time=ingest_time,
            severity=severity,
            repeat_count=repeat,
            out_of_order=out_of_order,
            matched_patterns=matched,
            text=text,
        )