Renames the app/ingest/ package to app/glean/ and updates all references across Python modules, shell scripts, Vue components, tests, and documentation.

Intentionally preserved:
- SQLite column name ingest_time (avoids schema migration)
- RetrievedEntry.ingest_time field (maps to the column above)
- Any public-facing JSON keys that reference ingest_time

Changes by category:
- app/ingest/ → app/glean/ (full package move, all parsers)
- app/tasks/ingest_scheduler.py → app/tasks/glean_scheduler.py
- scripts/ingest_corpus.py → scripts/glean_corpus.py
- tests/test_ingest_*.py → tests/test_glean_*.py
- Docstrings, log messages, comments: ingest → glean
- Env var: TURNSTONE_INGEST_INTERVAL → TURNSTONE_GLEAN_INTERVAL
- Shell scripts: glean.log, glean_corpus.py references
- README.md: multi-source ingest → multi-source glean
- .env.example: updated env var name
- patterns/: new diagnostic patterns from 2026-05-20 SSH incident (service_crash_loop, pkg_daemon_restart, ssh_forward_conflict)
- SourcesView.vue: pipeline label updated
- All test import paths updated to app.glean.*

285 tests passing.
161 lines
5.1 KiB
Python
161 lines
5.1 KiB
Python
"""Wazuh SIEM alert parser.

Handles Wazuh's alerts.json format (JSON Lines — one alert object per line):

    /var/ossec/logs/alerts/alerts.json   (on the Wazuh manager)

Each line is a complete JSON object. Key fields used:
    timestamp        — ISO 8601 with timezone offset ("2024-01-15T10:23:45.123+0000")
    rule.level       — 1-15 (maps to Turnstone severity)
    rule.id          — Wazuh rule ID
    rule.description — human-readable rule description (primary message text)
    rule.groups      — list of category tags
    agent.name       — hostname that generated the original event
    agent.ip         — agent IP address
    full_log         — original raw log line that triggered the alert
    location         — log file or input that was monitored
    data             — dict of decoded fields (srcip, dstip, url, etc.)
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from typing import Iterator
|
|
|
|
from app.glean.base import (
|
|
SourceState, apply_patterns, make_entry_id, now_iso,
|
|
)
|
|
from app.services.models import LogPattern, RetrievedEntry
|
|
|
|
# Translation table from Wazuh rule levels (1-15) to Turnstone severity labels.
# Bands: 1-3 DEBUG, 4-5 INFO, 6 NOTICE, 7-9 WARN (starts to matter
# operationally), 10-12 ERROR (SIEM-worthy), 13-15 CRITICAL.
# Levels outside 1-15 are intentionally absent; callers choose their own
# fallback when a lookup misses.
_LEVEL_SEVERITY: dict[int, str] = {
    level: label
    for label, band in (
        ("DEBUG", range(1, 4)),
        ("INFO", range(4, 6)),
        ("NOTICE", range(6, 7)),
        ("WARN", range(7, 10)),
        ("ERROR", range(10, 13)),
        ("CRITICAL", range(13, 16)),
    )
    for level in band
}
|
|
|
|
|
|
def is_wazuh_alert(obj: object) -> bool:
    """Return True if a parsed JSON object looks like a Wazuh alert.

    An alert is recognised by shape only: it must be a JSON object whose
    ``rule`` and ``agent`` members are themselves objects, and it must carry
    either a ``timestamp`` or a ``manager`` key.

    Args:
        obj: Any value produced by ``json.loads``. Non-dict values (lists,
            strings, numbers, ``None``) are accepted and simply yield False
            rather than raising.

    Returns:
        True when *obj* has the shape described above, False otherwise.
    """
    # json.loads can return any JSON value, not only objects; bail out before
    # calling dict methods on something that does not have them.
    if not isinstance(obj, dict):
        return False
    return (
        isinstance(obj.get("rule"), dict)
        and isinstance(obj.get("agent"), dict)
        and ("timestamp" in obj or "manager" in obj)
    )
|
|
|
|
|
|
def _parse_timestamp(raw: str) -> str:
    """Convert Wazuh's ISO 8601 timestamp to UTC ISO 8601.

    Args:
        raw: Timestamp text taken from the alert, with or without fractional
            seconds and with either a numeric offset or a trailing ``Z``
            (e.g. ``"2024-01-15T10:23:45.123+0000"``).

    Returns:
        * ``""`` when *raw* is empty or otherwise falsy;
        * the same instant expressed in UTC, formatted with
          ``datetime.isoformat()``, when one of the known layouts matches;
        * *raw* itself (stringified if it was not a ``str``) when nothing
          matches, so the caller never loses the original value.
    """
    if not raw:
        return ""
    if not isinstance(raw, str):
        # strptime raises TypeError (not ValueError) for non-strings such as a
        # numeric epoch; fall back to the textual form instead of crashing.
        return str(raw)

    for fmt in (
        "%Y-%m-%dT%H:%M:%S.%f%z",
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%Y-%m-%dT%H:%M:%SZ",
    ):
        try:
            dt = datetime.strptime(raw, fmt)
            if dt.tzinfo is None:
                # The literal-"Z" layouts produce a naive datetime. "Z" means
                # UTC, so say so explicitly; astimezone() would otherwise
                # interpret the value in the machine's local zone.
                dt = dt.replace(tzinfo=timezone.utc)
            return dt.astimezone(timezone.utc).isoformat()
        except (ValueError, OverflowError):
            # ValueError: layout mismatch. OverflowError: the offset pushes
            # the instant outside datetime's supported range.
            continue
    return raw
|
|
|
|
|
|
def _text_field(mapping: dict, key: str, default: str = "") -> str:
    """Return ``mapping[key]`` as text, or *default* when absent or null."""
    value = mapping.get(key)
    return default if value is None else str(value)


def _build_text(alert: dict) -> str:
    """Compose a readable, searchable text representation of the alert.

    The result is a newline-joined block made of, in order:

    1. a header ``[wazuh][agent:NAME/IP][rule:ID][GROUPS] DESCRIPTION``
       (``/IP`` and ``[GROUPS]`` are omitted when empty);
    2. ``location: ...`` when the alert has a location;
    3. the decoded ``data`` fields as ``key=value`` pairs sorted by key and
       separated by `` | `` (falsy values such as ``""``, ``0`` or ``None``
       are left out);
    4. ``raw: ...`` with the stripped ``full_log`` when it is non-empty and
       differs from the rule description.

    Malformed members are tolerated instead of raising: a ``rule`` or
    ``agent`` that is not an object is treated as missing, null scalar fields
    fall back to their defaults, non-string scalars are stringified, and
    ``rule.groups`` may be a list or a single string.

    Args:
        alert: One decoded alert object.

    Returns:
        The composed text; never empty because the header is always present.
    """
    rule = alert.get("rule")
    if not isinstance(rule, dict):
        rule = {}
    agent = alert.get("agent")
    if not isinstance(agent, dict):
        agent = {}

    agent_name = _text_field(agent, "name", "unknown")
    agent_ip = _text_field(agent, "ip")
    rule_id = _text_field(rule, "id")
    rule_desc = _text_field(rule, "description", "(no description)")
    location = _text_field(alert, "location")
    # Strip once up front so a whitespace-only log line counts as empty.
    full_log = _text_field(alert, "full_log").strip()

    groups = rule.get("groups")
    if isinstance(groups, str):
        # A bare string would otherwise be joined character by character.
        group_tag = groups
    elif isinstance(groups, (list, tuple)):
        group_tag = ",".join(str(group) for group in groups)
    else:
        group_tag = ""

    parts: list[str] = []

    # Header line: agent + rule context
    agent_tag = f"{agent_name}/{agent_ip}" if agent_ip else agent_name
    header = f"[wazuh][agent:{agent_tag}][rule:{rule_id}]"
    if group_tag:
        header += f"[{group_tag}]"
    parts.append(f"{header} {rule_desc}")

    if location:
        parts.append(f"location: {location}")

    # Extra decoded fields (srcip, dstip, url, user, etc.)
    data = alert.get("data", {})
    if isinstance(data, dict) and data:
        kv = " | ".join(f"{k}={v}" for k, v in sorted(data.items()) if v)
        if kv:
            parts.append(kv)

    # Skip the raw line when it merely repeats the description.
    if full_log and full_log != rule_desc.strip():
        parts.append(f"raw: {full_log}")

    return "\n".join(parts)
|
|
|
|
|
|
def parse(
    lines: Iterator[str],
    source_id: str,
    compiled_patterns: list[tuple[LogPattern, object]],
    ingest_time: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Yield one ``RetrievedEntry`` per alert object found in *lines*.

    Each input line is expected to hold one JSON object (JSON Lines). Blank
    lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped silently. A single malformed alert never aborts the
    stream: a ``rule``/``agent`` member that is not an object is treated as
    missing, and an unusable ``rule.level`` is treated as level 0.

    Args:
        lines: Iterable of raw text lines.
        source_id: Base identifier for the log source. When the alert names
            an agent, the emitted ``source_id`` becomes
            ``"<source_id>:<agent name>"`` so hosts stay separate.
        compiled_patterns: Passed through unchanged to ``apply_patterns``.
        ingest_time: Value stamped on every entry; defaults to ``now_iso()``
            evaluated once when iteration starts.

    Yields:
        A ``RetrievedEntry`` for every alert object, in input order.
    """
    ingest_time = ingest_time or now_iso()
    state = SourceState()

    for raw_line in lines:
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            alert = json.loads(raw_line)
        except json.JSONDecodeError:
            continue

        if not isinstance(alert, dict):
            continue

        rule = alert.get("rule")
        agent = alert.get("agent")
        if not isinstance(rule, dict) or not isinstance(agent, dict):
            # Treat a null/scalar rule or agent as absent, and hand the text
            # builder a normalised copy so it sees the same view we do.
            rule = rule if isinstance(rule, dict) else {}
            agent = agent if isinstance(agent, dict) else {}
            alert = {**alert, "rule": rule, "agent": agent}

        # Keep the raw timestamp textual: null becomes "", other non-strings
        # (e.g. a numeric epoch) are stringified.
        ts_raw = alert.get("timestamp")
        if ts_raw is None:
            ts_raw = ""
        elif not isinstance(ts_raw, str):
            ts_raw = str(ts_raw)
        ts_iso = _parse_timestamp(ts_raw)

        try:
            level = int(rule.get("level", 0))
        except (TypeError, ValueError, OverflowError):
            # null, non-numeric text, nested values, or a non-finite float.
            level = 0
        # Levels missing from the table (including 0) fall back to INFO.
        severity = _LEVEL_SEVERITY.get(level, "INFO")

        # Qualify source_id by agent so logs from different hosts stay separate.
        agent_name = agent.get("name", "")
        src = f"{source_id}:{agent_name}" if agent_name else source_id

        text = _build_text(alert)
        if not text:
            continue

        # NOTE(review): observe() is defined in app.glean.base; it presumably
        # also advances state.sequence, so it must run before the entry id is
        # built — confirm against SourceState.
        repeat, out_of_order = state.observe(text, ts_iso)
        matched = apply_patterns(text, compiled_patterns)

        yield RetrievedEntry(
            entry_id=make_entry_id(src, state.sequence, text),
            source_id=src,
            sequence=state.sequence,
            timestamp_raw=ts_raw,
            timestamp_iso=ts_iso,
            ingest_time=ingest_time,
            severity=severity,
            repeat_count=repeat,
            out_of_order=out_of_order,
            matched_patterns=matched,
            text=text,
        )
|