turnstone/app/ingest/wazuh.py
pyr0ball 0f86d35062 feat: periodic ingest scheduler + Orchard submission pipeline
Adds asyncio-native background scheduler (TURNSTONE_INGEST_INTERVAL,
default 900s) that runs batch ingest then pushes pattern-matched entries
to a remote CF harvest endpoint (TURNSTONE_SUBMIT_ENDPOINT).

- app/tasks/ingest_scheduler.py: IngestState, scheduler_loop, run_once,
  submit_matched, _query_matched_since — asyncio.Lock prevents concurrent runs
- app/rest.py: POST /api/ingest/batch (pre-parsed entry receiver),
  GET /api/tasks/ingest/status, POST /api/tasks/ingest (manual trigger),
  TURNSTONE_INGEST_INTERVAL + TURNSTONE_SUBMIT_ENDPOINT env wiring in lifespan
- docker-compose.submissions.yml: segregated daniel (8536) + xander (8537)
  receiving instances on Heimdall, isolated DBs under
  /devl/docker/turnstone-submissions/<node>/
- podman-standalone.sh: pass-through for TURNSTONE_SUBMIT_ENDPOINT +
  TURNSTONE_SOURCE_HOST
- app/ingest/mqtt_subscriber.py: MQTT log source adapter
- app/ingest/wazuh.py: Wazuh alert JSON adapter
- tests/test_ingest_wazuh.py: Wazuh adapter test suite
2026-05-20 08:57:25 -07:00

161 lines
5.1 KiB
Python

"""Wazuh SIEM alert parser.

Handles Wazuh's alerts.json format (JSON Lines — one alert object per line):

    /var/ossec/logs/alerts/alerts.json  (on the Wazuh manager)

Each line is a complete JSON object. Key fields used:

    timestamp        — ISO 8601 with timezone offset ("2024-01-15T10:23:45.123+0000")
    rule.level       — 1-15 (maps to Turnstone severity)
    rule.id          — Wazuh rule ID
    rule.description — human-readable rule description (primary message text)
    rule.groups      — list of category tags
    agent.name       — hostname that generated the original event
    agent.ip         — agent IP address
    full_log         — original raw log line that triggered the alert
    location         — log file or input that was monitored
    data             — dict of decoded fields (srcip, dstip, url, etc.)
"""
from __future__ import annotations
import json
from datetime import datetime, timezone
from typing import Iterator
from app.ingest.base import (
SourceState, apply_patterns, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry
# Wazuh rule levels 1-15 → Turnstone severity labels.
# Levels < 4 are normally informational, 7+ begin to matter operationally,
# 10+ correspond to SIEM-worthy events, 13+ are critical.
_LEVEL_SEVERITY: dict[int, str] = {
1: "DEBUG", 2: "DEBUG", 3: "DEBUG",
4: "INFO", 5: "INFO", 6: "NOTICE",
7: "WARN", 8: "WARN", 9: "WARN",
10: "ERROR", 11: "ERROR", 12: "ERROR",
13: "CRITICAL", 14: "CRITICAL", 15: "CRITICAL",
}
def is_wazuh_alert(obj: dict) -> bool:
    """Return True if a parsed JSON value looks like a Wazuh alert.

    A value qualifies when it is a dict whose ``rule`` and ``agent`` members
    are both dicts and which carries a ``timestamp`` or a ``manager`` key.

    Any non-dict value (list, string, number, ``None`` — all of which
    ``json.loads`` can legitimately produce) is reported as "not an alert"
    instead of raising ``AttributeError``.
    """
    if not isinstance(obj, dict):
        return False
    return (
        isinstance(obj.get("rule"), dict)
        and isinstance(obj.get("agent"), dict)
        and ("timestamp" in obj or "manager" in obj)
    )
def _parse_timestamp(raw: str) -> str:
"""Convert Wazuh's ISO 8601 timestamp to UTC ISO 8601."""
if not raw:
return ""
for fmt in (
"%Y-%m-%dT%H:%M:%S.%f%z",
"%Y-%m-%dT%H:%M:%S%z",
"%Y-%m-%dT%H:%M:%S.%fZ",
"%Y-%m-%dT%H:%M:%SZ",
):
try:
dt = datetime.strptime(raw, fmt)
return dt.astimezone(timezone.utc).isoformat()
except ValueError:
continue
return raw
def _build_text(alert: dict) -> str:
"""Compose a readable, searchable text representation of the alert."""
rule = alert.get("rule", {})
agent = alert.get("agent", {})
agent_name = agent.get("name", "unknown")
agent_ip = agent.get("ip", "")
rule_id = rule.get("id", "")
rule_desc = rule.get("description", "(no description)")
groups = rule.get("groups", [])
location = alert.get("location", "")
full_log = alert.get("full_log", "")
parts: list[str] = []
# Header line: agent + rule context
agent_tag = f"{agent_name}/{agent_ip}" if agent_ip else agent_name
group_tag = ",".join(groups) if groups else ""
header = f"[wazuh][agent:{agent_tag}][rule:{rule_id}]"
if group_tag:
header += f"[{group_tag}]"
parts.append(f"{header} {rule_desc}")
if location:
parts.append(f"location: {location}")
# Extra decoded fields (srcip, dstip, url, user, etc.)
data = alert.get("data", {})
if isinstance(data, dict) and data:
kv = " | ".join(f"{k}={v}" for k, v in sorted(data.items()) if v)
if kv:
parts.append(kv)
if full_log and full_log.strip() != rule_desc.strip():
parts.append(f"raw: {full_log.strip()}")
return "\n".join(parts)
def parse(
    lines: Iterator[str],
    source_id: str,
    compiled_patterns: list[tuple[LogPattern, object]],
    ingest_time: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Parse Wazuh ``alerts.json`` lines into ``RetrievedEntry`` records.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped silently. Structurally odd alerts are normalised
    rather than allowed to abort the batch: a non-dict ``rule`` / ``agent``
    is treated as empty, an unusable ``rule.level`` counts as 0, and a
    non-string ``timestamp`` is stringified (``None`` becomes ``""``).

    Args:
        lines: Text lines, one JSON alert object per line.
        source_id: Base source identifier; suffixed with ``:<agent.name>``
            when the alert names an agent, so different hosts stay separate.
        compiled_patterns: Handed unchanged to ``apply_patterns`` together
            with the rendered alert text.
        ingest_time: Ingest timestamp stamped on every entry; defaults to
            ``now_iso()`` evaluated once when iteration starts.

    Yields:
        One ``RetrievedEntry`` per accepted alert.
    """
    ingest_time = ingest_time or now_iso()
    # One SourceState for the whole stream; it supplies the sequence number
    # and the repeat / out-of-order flags returned by observe().
    state = SourceState()

    for raw_line in lines:
        raw_line = raw_line.strip()
        if not raw_line:
            continue

        try:
            alert = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if not isinstance(alert, dict):
            continue

        # Normalise the two nested objects so every later .get() is safe,
        # and hand the text builder the same normalised view.
        rule = alert.get("rule")
        if not isinstance(rule, dict):
            rule = {}
        agent = alert.get("agent")
        if not isinstance(agent, dict):
            agent = {}
        alert = {**alert, "rule": rule, "agent": agent}

        ts_raw = alert.get("timestamp", "")
        if not isinstance(ts_raw, str):
            # strptime() would raise TypeError on a non-string timestamp.
            ts_raw = "" if ts_raw is None else str(ts_raw)
        ts_iso = _parse_timestamp(ts_raw)

        try:
            level = int(rule.get("level", 0))
        except (TypeError, ValueError, OverflowError):
            # null, non-numeric text, or Infinity/NaN (which json.loads accepts)
            level = 0
        # Levels outside the 1-15 table fall back to INFO.
        severity = _LEVEL_SEVERITY.get(level, "INFO")

        # Qualify source_id by agent so logs from different hosts stay separate.
        agent_name = agent.get("name", "")
        src = f"{source_id}:{agent_name}" if agent_name else source_id

        text = _build_text(alert)
        if not text:
            continue

        # observe() must run before state.sequence is read for this entry.
        repeat, out_of_order = state.observe(text, ts_iso)
        matched = apply_patterns(text, compiled_patterns)

        yield RetrievedEntry(
            entry_id=make_entry_id(src, state.sequence, text),
            source_id=src,
            sequence=state.sequence,
            timestamp_raw=ts_raw,
            timestamp_iso=ts_iso,
            ingest_time=ingest_time,
            severity=severity,
            repeat_count=repeat,
            out_of_order=out_of_order,
            matched_patterns=matched,
            text=text,
        )