turnstone/app/glean/wazuh.py
pyr0ball aa80f307fe refactor: rename ingest → glean throughout codebase
Renames the app/ingest/ package to app/glean/ and updates all
references across Python modules, shell scripts, Vue components,
tests, and documentation.

Intentionally preserved:
- SQLite column name ingest_time (avoids schema migration)
- RetrievedEntry.ingest_time field (maps to the column above)
- Any public-facing JSON keys that reference ingest_time

Changes by category:
- app/ingest/ → app/glean/ (full package move, all parsers)
- app/tasks/ingest_scheduler.py → app/tasks/glean_scheduler.py
- scripts/ingest_corpus.py → scripts/glean_corpus.py
- tests/test_ingest_*.py → tests/test_glean_*.py
- Docstrings, log messages, comments: ingest → glean
- Env var: TURNSTONE_INGEST_INTERVAL → TURNSTONE_GLEAN_INTERVAL
- Shell scripts: glean.log, glean_corpus.py references
- README.md: multi-source ingest → multi-source glean
- .env.example: updated env var name
- patterns/: new diagnostic patterns from 2026-05-20 SSH incident
  (service_crash_loop, pkg_daemon_restart, ssh_forward_conflict)
- SourcesView.vue: pipeline label updated
- All test import paths updated to app.glean.*

285 tests passing.
2026-05-20 23:02:55 -07:00

161 lines
5.1 KiB
Python

"""Wazuh SIEM alert parser.

Handles Wazuh's alerts.json format (JSON Lines — one alert object per line):

    /var/ossec/logs/alerts/alerts.json   (on the Wazuh manager)

Each line is a complete JSON object. Key fields used:

    timestamp        — ISO 8601 with timezone offset ("2024-01-15T10:23:45.123+0000")
    rule.level       — 1-15 (maps to Turnstone severity)
    rule.id          — Wazuh rule ID
    rule.description — human-readable rule description (primary message text)
    rule.groups      — list of category tags
    agent.name       — hostname that generated the original event
    agent.ip         — agent IP address
    full_log         — original raw log line that triggered the alert
    location         — log file or input that was monitored
    data             — dict of decoded fields (srcip, dstip, url, etc.)
"""
from __future__ import annotations
import json
from datetime import datetime, timezone
from typing import Iterator
from app.glean.base import (
SourceState, apply_patterns, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry
# Wazuh rule levels 1-15 → Turnstone severity labels.
# Levels < 4 are normally informational, 7+ begin to matter operationally,
# 10+ correspond to SIEM-worthy events, 13+ are critical.
_LEVEL_SEVERITY: dict[int, str] = {
1: "DEBUG", 2: "DEBUG", 3: "DEBUG",
4: "INFO", 5: "INFO", 6: "NOTICE",
7: "WARN", 8: "WARN", 9: "WARN",
10: "ERROR", 11: "ERROR", 12: "ERROR",
13: "CRITICAL", 14: "CRITICAL", 15: "CRITICAL",
}
def is_wazuh_alert(obj: dict) -> bool:
    """Return True if a parsed JSON value looks like a Wazuh alert.

    A value qualifies when it is a dict whose ``rule`` and ``agent`` entries
    are both dicts and which carries a ``timestamp`` or a ``manager`` key.

    Any other JSON value (list, string, number, ``None``) yields False rather
    than raising, so the function can be used to sniff arbitrary parsed lines.
    """
    # json.loads can return non-dict values; those can never be alerts.
    if not isinstance(obj, dict):
        return False
    return (
        isinstance(obj.get("rule"), dict)
        and isinstance(obj.get("agent"), dict)
        and ("timestamp" in obj or "manager" in obj)
    )
def _parse_timestamp(raw: str) -> str:
"""Convert Wazuh's ISO 8601 timestamp to UTC ISO 8601."""
if not raw:
return ""
for fmt in (
"%Y-%m-%dT%H:%M:%S.%f%z",
"%Y-%m-%dT%H:%M:%S%z",
"%Y-%m-%dT%H:%M:%S.%fZ",
"%Y-%m-%dT%H:%M:%SZ",
):
try:
dt = datetime.strptime(raw, fmt)
return dt.astimezone(timezone.utc).isoformat()
except ValueError:
continue
return raw
def _build_text(alert: dict) -> str:
"""Compose a readable, searchable text representation of the alert."""
rule = alert.get("rule", {})
agent = alert.get("agent", {})
agent_name = agent.get("name", "unknown")
agent_ip = agent.get("ip", "")
rule_id = rule.get("id", "")
rule_desc = rule.get("description", "(no description)")
groups = rule.get("groups", [])
location = alert.get("location", "")
full_log = alert.get("full_log", "")
parts: list[str] = []
# Header line: agent + rule context
agent_tag = f"{agent_name}/{agent_ip}" if agent_ip else agent_name
group_tag = ",".join(groups) if groups else ""
header = f"[wazuh][agent:{agent_tag}][rule:{rule_id}]"
if group_tag:
header += f"[{group_tag}]"
parts.append(f"{header} {rule_desc}")
if location:
parts.append(f"location: {location}")
# Extra decoded fields (srcip, dstip, url, user, etc.)
data = alert.get("data", {})
if isinstance(data, dict) and data:
kv = " | ".join(f"{k}={v}" for k, v in sorted(data.items()) if v)
if kv:
parts.append(kv)
if full_log and full_log.strip() != rule_desc.strip():
parts.append(f"raw: {full_log.strip()}")
return "\n".join(parts)
def parse(
    lines: Iterator[str],
    source_id: str,
    compiled_patterns: list[tuple[LogPattern, object]],
    ingest_time: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Parse Wazuh ``alerts.json`` lines into ``RetrievedEntry`` records.

    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped. Malformed fields inside an object are normalised
    rather than aborting the run: a non-dict ``rule`` / ``agent`` is treated
    as absent, an unusable ``rule.level`` counts as 0, and a non-string
    ``timestamp`` is stringified (``None`` becomes ``""``).

    Args:
        lines: Raw text lines, one JSON alert per line.
        source_id: Base source identifier; ``":<agent name>"`` is appended
            when the alert names an agent.
        compiled_patterns: Pattern pairs handed unchanged to
            ``apply_patterns``.
        ingest_time: Timestamp stamped on every entry; defaults to
            ``now_iso()`` evaluated once per call.

    Yields:
        One ``RetrievedEntry`` per alert whose composed text is non-empty.
    """
    ingest_time = ingest_time or now_iso()
    state = SourceState()

    for raw_line in lines:
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            alert = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if not isinstance(alert, dict):
            continue

        # `alert` was decoded just above and is private to this iteration, so
        # normalising it in place is safe and keeps _build_text() from seeing
        # a non-dict rule/agent.
        rule = alert.get("rule")
        if not isinstance(rule, dict):
            rule = alert["rule"] = {}
        agent = alert.get("agent")
        if not isinstance(agent, dict):
            agent = alert["agent"] = {}

        ts_raw = alert.get("timestamp", "")
        if not isinstance(ts_raw, str):
            ts_raw = "" if ts_raw is None else str(ts_raw)
        ts_iso = _parse_timestamp(ts_raw)

        # A null, non-numeric or infinite level must not abort the whole run;
        # level 0 is not in the table and therefore maps to the "INFO" default.
        try:
            level = int(rule.get("level", 0))
        except (TypeError, ValueError, OverflowError):
            level = 0
        severity = _LEVEL_SEVERITY.get(level, "INFO")

        # Qualify source_id by agent so logs from different hosts stay separate.
        agent_name = agent.get("name", "")
        src = f"{source_id}:{agent_name}" if agent_name else source_id

        text = _build_text(alert)
        if not text:
            continue

        # observe() hands back a (repeat, out_of_order) pair; it presumably
        # also advances state.sequence — NOTE(review): confirm in app.glean.base.
        repeat, out_of_order = state.observe(text, ts_iso)
        matched = apply_patterns(text, compiled_patterns)

        yield RetrievedEntry(
            entry_id=make_entry_id(src, state.sequence, text),
            source_id=src,
            sequence=state.sequence,
            timestamp_raw=ts_raw,
            timestamp_iso=ts_iso,
            ingest_time=ingest_time,
            severity=severity,
            repeat_count=repeat,
            out_of_order=out_of_order,
            matched_patterns=matched,
            text=text,
        )