From 9ec60ea7ff94358dc2790aea9836d710bc2947ea Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Mon, 11 May 2026 06:57:38 -0700 Subject: [PATCH] feat: syslog and dmesg parsers with graceful journald fallback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add syslog.py — RFC 3164 parser for /var/log/syslog, /var/log/messages, auth.log, kern.log; ident prepended to message text for searchability - Add dmesg_log.py — handles both relative [secs.usecs] and human-readable [Dow Mon DD HH:MM:SS YYYY] formats; relative timestamps preserved as raw - Wire both into pipeline.py auto-detection (before plaintext fallback) - Update export_journal.sh: checks for journalctl availability, falls back gracefully on non-systemd systems; adds dmesg -T export (falls back to plain dmesg on older kernels) - Add syslog entries (commented) + dmesg source to sources.yaml - 30 tests covering both parsers (detection + parse correctness) --- app/ingest/dmesg_log.py | 102 ++++++++++++++++++++++++++++++++++++ app/ingest/pipeline.py | 10 +++- app/ingest/syslog.py | 100 +++++++++++++++++++++++++++++++++++ patterns/sources.yaml | 22 +++++++- scripts/export_journal.sh | 47 +++++++++++------ tests/test_ingest_dmesg.py | 90 +++++++++++++++++++++++++++++++ tests/test_ingest_syslog.py | 70 +++++++++++++++++++++++++ 7 files changed, 424 insertions(+), 17 deletions(-) create mode 100644 app/ingest/dmesg_log.py create mode 100644 app/ingest/syslog.py create mode 100644 tests/test_ingest_dmesg.py create mode 100644 tests/test_ingest_syslog.py diff --git a/app/ingest/dmesg_log.py b/app/ingest/dmesg_log.py new file mode 100644 index 0000000..84058aa --- /dev/null +++ b/app/ingest/dmesg_log.py @@ -0,0 +1,102 @@ +"""dmesg log parser. + +Handles two formats: + +Relative (always available): + [ 0.000000] Linux version 6.8.0-65-generic + [12345.678901] usb 1-1: USB disconnect, device number 2 + +Human-readable (dmesg -T, util-linux >= 2.21): + [Mon May 11 14:23:01 2026] usb 1-1: USB disconnect, device number 2 + +The export_journal.sh script exports with -T when available, falling back +to plain dmesg. Relative-timestamp entries get no timestamp_iso. +""" +from __future__ import annotations + +import re +from datetime import datetime, timezone +from typing import Iterator + +from app.ingest.base import ( + SourceState, apply_patterns, detect_severity, make_entry_id, now_iso, +) +from app.services.models import LogPattern, RetrievedEntry + +_DAYS = {"Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"} +_MONTHS_ABBR = { + "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4, "May": 5, "Jun": 6, + "Jul": 7, "Aug": 8, "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12, +} + +# [ 0.000000] or [12345.678901] +_RELATIVE_RE = re.compile(r"^\[\s*(?P\d+\.\d+)\]\s+(?P.+)$") +# [Mon May 11 14:23:01 2026] +_HUMAN_RE = re.compile( + r"^\[(?P\w{3})\s+(?P\w{3})\s+(?P\d{1,2})" + r"\s+(?P