"""Wazuh SIEM alert parser. Handles Wazuh's alerts.json format (JSON Lines — one alert object per line): /var/ossec/logs/alerts/alerts.json (on the Wazuh manager) Each line is a complete JSON object. Key fields used: timestamp — ISO 8601 with timezone offset ("2024-01-15T10:23:45.123+0000") rule.level — 1-15 (maps to Turnstone severity) rule.id — Wazuh rule ID rule.description — human-readable rule description (primary message text) rule.groups — list of category tags agent.name — hostname that generated the original event agent.ip — agent IP address full_log — original raw log line that triggered the alert location — log file or input that was monitored data — dict of decoded fields (srcip, dstip, url, etc.) """ from __future__ import annotations import json from datetime import datetime, timezone from typing import Iterator from app.ingest.base import ( SourceState, apply_patterns, make_entry_id, now_iso, ) from app.services.models import LogPattern, RetrievedEntry # Wazuh rule levels 1-15 → Turnstone severity labels. # Levels < 4 are normally informational, 7+ begin to matter operationally, # 10+ correspond to SIEM-worthy events, 13+ are critical. 
_LEVEL_SEVERITY: dict[int, str] = {
    1: "DEBUG",
    2: "DEBUG",
    3: "DEBUG",
    4: "INFO",
    5: "INFO",
    6: "NOTICE",
    7: "WARN",
    8: "WARN",
    9: "WARN",
    10: "ERROR",
    11: "ERROR",
    12: "ERROR",
    13: "CRITICAL",
    14: "CRITICAL",
    15: "CRITICAL",
}

# Severity reported when rule.level is missing, malformed, or outside 1-15.
_DEFAULT_SEVERITY = "INFO"

# strptime layouts tried in order. Since Python 3.7 ``%z`` also accepts a
# literal "Z" (treated as +00:00), so dedicated "...Z" layouts would never be
# reached and are not listed.
_TIMESTAMP_FORMATS = (
    "%Y-%m-%dT%H:%M:%S.%f%z",
    "%Y-%m-%dT%H:%M:%S%z",
)


def _as_dict(value: object) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _severity_for(level: object) -> str:
    """Map a raw ``rule.level`` value to a severity label.

    Anything that cannot be converted with ``int()`` (``None``, non-numeric
    strings, NaN/Infinity) or that has no entry in ``_LEVEL_SEVERITY`` yields
    ``_DEFAULT_SEVERITY`` instead of raising.
    """
    try:
        return _LEVEL_SEVERITY.get(int(level), _DEFAULT_SEVERITY)
    except (TypeError, ValueError, OverflowError):
        return _DEFAULT_SEVERITY


def is_wazuh_alert(obj: dict) -> bool:
    """Return True if a parsed JSON object looks like a Wazuh alert.

    The object must be a dict whose ``rule`` and ``agent`` values are dicts
    and which carries a ``timestamp`` or ``manager`` key. Non-dict input
    returns False rather than raising.
    """
    return (
        isinstance(obj, dict)
        and isinstance(obj.get("rule"), dict)
        and isinstance(obj.get("agent"), dict)
        and ("timestamp" in obj or "manager" in obj)
    )


def _parse_timestamp(raw: str) -> str:
    """Convert Wazuh's ISO 8601 timestamp to UTC ISO 8601.

    Returns:
        The timestamp converted to UTC in ``datetime.isoformat()`` form;
        ``""`` when *raw* is empty or not a string; *raw* unchanged when it
        matches none of the known layouts.
    """
    if not raw or not isinstance(raw, str):
        return ""
    for fmt in _TIMESTAMP_FORMATS:
        try:
            parsed = datetime.strptime(raw, fmt)
            # OverflowError: the UTC conversion can step outside the
            # supported year range for extreme inputs.
            return parsed.astimezone(timezone.utc).isoformat()
        except (ValueError, OverflowError):
            continue
    return raw


def _build_text(alert: dict) -> str:
    """Compose a readable, searchable text representation of the alert.

    Layout, one item per line:
      * header ``[wazuh][agent:NAME[/IP]][rule:ID][GROUPS] DESCRIPTION``
      * ``location: ...`` when a location is present
      * ``key=value | key=value`` for truthy ``data`` entries, sorted by key
      * ``raw: ...`` when ``full_log`` differs from the rule description

    ``rule`` / ``agent`` values that are not dicts are treated as absent, and
    non-string ``description`` / ``full_log`` / group values are stringified,
    so a malformed alert never raises here.
    """
    rule = _as_dict(alert.get("rule"))
    agent = _as_dict(alert.get("agent"))

    agent_name = agent.get("name", "unknown")
    agent_ip = agent.get("ip", "")
    rule_id = rule.get("id", "")
    rule_desc = rule.get("description", "(no description)")
    groups = rule.get("groups", [])
    location = alert.get("location", "")
    full_log = alert.get("full_log", "")

    # A bare string would otherwise be joined character by character.
    if isinstance(groups, str):
        groups = [groups]
    elif not isinstance(groups, (list, tuple)):
        groups = []

    parts: list[str] = []

    # Header line: agent + rule context
    agent_tag = f"{agent_name}/{agent_ip}" if agent_ip else agent_name
    group_tag = ",".join(str(group) for group in groups)
    header = f"[wazuh][agent:{agent_tag}][rule:{rule_id}]"
    if group_tag:
        header += f"[{group_tag}]"
    parts.append(f"{header} {rule_desc}")

    if location:
        parts.append(f"location: {location}")

    # Extra decoded fields (srcip, dstip, url, user, etc.)
    # Falsy values (empty string, None, 0, False) are omitted.
    data = alert.get("data", {})
    if isinstance(data, dict) and data:
        kv = " | ".join(f"{k}={v}" for k, v in sorted(data.items()) if v)
        if kv:
            parts.append(kv)

    # Skip the raw log when it merely repeats the description.
    if full_log:
        raw_log = str(full_log).strip()
        if raw_log != str(rule_desc).strip():
            parts.append(f"raw: {raw_log}")

    return "\n".join(parts)


def parse(
    lines: Iterator[str],
    source_id: str,
    compiled_patterns: list[tuple[LogPattern, object]],
    ingest_time: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Yield one ``RetrievedEntry`` per JSON-object line in *lines*.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped. Malformed fields inside an otherwise valid object
    (non-dict ``rule``/``agent``, non-numeric ``rule.level``, non-string
    ``timestamp``) are tolerated so that one bad record cannot abort the
    stream.

    Args:
        lines: Iterable of raw text lines, one JSON alert per line.
        source_id: Base source identifier; ``":<agent name>"`` is appended
            when the alert names an agent.
        compiled_patterns: Passed through to ``apply_patterns`` together with
            the entry text.
        ingest_time: Ingest timestamp stamped on every entry; defaults to
            ``now_iso()`` evaluated once per call.
    """
    ingest_time = ingest_time or now_iso()
    state = SourceState()

    for raw_line in lines:
        raw_line = raw_line.strip()
        if not raw_line:
            continue

        try:
            alert = json.loads(raw_line)
        except json.JSONDecodeError:
            continue

        if not isinstance(alert, dict):
            continue

        rule = _as_dict(alert.get("rule"))
        agent = _as_dict(alert.get("agent"))

        ts_raw = alert.get("timestamp", "")
        if not isinstance(ts_raw, str):
            # Keep the raw field a string even for null / numeric timestamps.
            ts_raw = "" if ts_raw is None else str(ts_raw)
        ts_iso = _parse_timestamp(ts_raw)

        severity = _severity_for(rule.get("level", 0))

        # Qualify source_id by agent so logs from different hosts stay separate.
        agent_name = agent.get("name", "")
        src = f"{source_id}:{agent_name}" if agent_name else source_id

        text = _build_text(alert)
        if not text:
            continue

        repeat, out_of_order = state.observe(text, ts_iso)
        matched = apply_patterns(text, compiled_patterns)

        yield RetrievedEntry(
            entry_id=make_entry_id(src, state.sequence, text),
            source_id=src,
            sequence=state.sequence,
            timestamp_raw=ts_raw,
            timestamp_iso=ts_iso,
            ingest_time=ingest_time,
            severity=severity,
            repeat_count=repeat,
            out_of_order=out_of_order,
            matched_patterns=matched,
            text=text,
        )