turnstone/scripts/export_journal.sh
pyr0ball 9ec60ea7ff feat: syslog and dmesg parsers with graceful journald fallback
- Add syslog.py — RFC 3164 parser for /var/log/syslog, /var/log/messages,
  auth.log, kern.log; ident prepended to message text for searchability
- Add dmesg_log.py — handles both relative [secs.usecs] and human-readable
  [Dow Mon DD HH:MM:SS YYYY] formats; relative timestamps preserved as raw
- Wire both into pipeline.py auto-detection (before plaintext fallback)
- Update export_journal.sh: checks for journalctl availability, falls back
  gracefully on non-systemd systems; adds dmesg -T export (falls back to
  plain dmesg on older kernels)
- Add syslog entries (commented) + dmesg source to sources.yaml
- 30 tests covering both parsers (detection + parse correctness)
2026-05-11 06:57:38 -07:00

49 lines
2 KiB
Bash

#!/usr/bin/env bash
# Export recent system messages to files the Turnstone container can ingest.
#
# Exports:
# journal-export.jsonl — journald (if journalctl is available)
# dmesg-export.txt — kernel ring buffer (always)
#
# Output files land in /opt/turnstone/data/ which is bind-mounted at /data/
# inside the container.
#
# Usage (standalone):
# sudo bash /opt/turnstone/scripts/export_journal.sh
#
# Cron (combined with ingest). Wrapped here for readability only: crontab does
# not support backslash line continuations, so enter it as ONE line.
# */15 * * * * bash /opt/turnstone/scripts/export_journal.sh && \
# podman exec turnstone python scripts/ingest_corpus.py \
# --sources /patterns/sources.yaml --db /data/turnstone.db \
# >> /var/log/turnstone-ingest.log 2>&1
# Strict mode: abort on any unhandled command failure (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Directory the export files are written to. Defaults to the standard install
# location; set TURNSTONE_DATA_DIR to redirect output (non-default installs,
# dry runs against a scratch directory). Read-only so nothing later in the
# script can repoint it by accident.
readonly DATA_DIR="${TURNSTONE_DATA_DIR:-/opt/turnstone/data}"
# ── journald ─────────────────────────────────────────────────────────────────
# Export journald entries of priority 0..5 (emerg through notice) from the last
# 20 minutes as one JSON object per line.
#
# The 20-minute window is deliberately wider than the 15-minute cron interval
# so consecutive runs overlap instead of leaving gaps. Ingest deduplicates via
# entry_id hash, so the overlap is safe.
journal_out="${DATA_DIR}/journal-export.jsonl"
if command -v journalctl &>/dev/null; then
  # Stage into a sibling temp file and rename into place. Redirecting straight
  # into the live file would truncate it before journalctl has produced
  # anything, so a failure (or a reader arriving mid-write) would see an empty
  # or partial export. A rename within the same directory is atomic.
  journal_tmp="${journal_out}.tmp.$$"
  journalctl \
    --output=json \
    --priority=0..5 \
    --since "20 minutes ago" \
    --no-pager \
    > "${journal_tmp}" || {
      journal_rc=$?
      rm -f -- "${journal_tmp}"
      echo "journald: journalctl failed (exit ${journal_rc}); existing export left untouched" >&2
      exit "${journal_rc}"
    }
  mv -f -- "${journal_tmp}" "${journal_out}"
  echo "journald: $(wc -l < "${journal_out}") entries"
else
  # No journald — write an empty file so the sources.yaml entry doesn't warn
  : > "${journal_out}"
  echo "journald: not available (skipped)"
fi
# ── dmesg ─────────────────────────────────────────────────────────────────────
# Export the kernel ring buffer.
#
# Prefer -T for human-readable timestamps (util-linux >= 2.21) and fall back to
# plain dmesg when that fails. The -T attempt IS the export: there is no
# separate probe run, so the ring buffer is read once on the common path.
#
# Output is staged in a sibling temp file and renamed into place so a failed
# run never leaves a truncated dmesg-export.txt behind.
dmesg_out="${DATA_DIR}/dmesg-export.txt"
dmesg_tmp="${dmesg_out}.tmp.$$"
# stderr of the -T attempt is discarded on purpose: an "unknown option" error
# is expected on old util-linux, and any real problem (e.g. permission denied)
# is reported by the plain-dmesg fallback below.
if ! dmesg -T > "${dmesg_tmp}" 2>/dev/null; then
  dmesg > "${dmesg_tmp}" || {
    dmesg_rc=$?
    rm -f -- "${dmesg_tmp}"
    echo "dmesg: export failed (exit ${dmesg_rc}); existing export left untouched" >&2
    exit "${dmesg_rc}"
  }
fi
mv -f -- "${dmesg_tmp}" "${dmesg_out}"
echo "dmesg: $(wc -l < "${dmesg_out}") lines"