turnstone/app/ingest/mqtt_subscriber.py
pyr0ball 63c742a708 feat: periodic ingest scheduler + Orchard submission pipeline
Adds asyncio-native background scheduler (TURNSTONE_INGEST_INTERVAL,
default 900s) that runs batch ingest then pushes pattern-matched entries
to a remote CF harvest endpoint (TURNSTONE_SUBMIT_ENDPOINT).

- app/tasks/ingest_scheduler.py: IngestState, scheduler_loop, run_once,
  submit_matched, _query_matched_since — asyncio.Lock prevents concurrent runs
- app/rest.py: POST /api/ingest/batch (pre-parsed entry receiver),
  GET /api/tasks/ingest/status, POST /api/tasks/ingest (manual trigger),
  TURNSTONE_INGEST_INTERVAL + TURNSTONE_SUBMIT_ENDPOINT env wiring in lifespan
- docker-compose.submissions.yml: segregated contrib1 (8536) + contrib2 (8537)
  receiving instances on Heimdall, isolated DBs under
  /devl/docker/turnstone-submissions/<node>/
- podman-standalone.sh: pass-through for TURNSTONE_SUBMIT_ENDPOINT +
  TURNSTONE_SOURCE_HOST
- app/ingest/mqtt_subscriber.py: MQTT log source adapter
- app/ingest/wazuh.py: Wazuh alert JSON adapter
- tests/test_ingest_wazuh.py: Wazuh adapter test suite
2026-05-20 08:57:25 -07:00

166 lines
5.3 KiB
Python

"""Live MQTT ingest subscriber for Turnstone.

Reads ``type: mqtt`` entries from sources.yaml and subscribes to each broker
in the background. Incoming messages are normalized to RetrievedEntry and
written to the Turnstone SQLite database as they arrive.

This runs as an asyncio task alongside the batch ingest scheduler. It is
started from the FastAPI lifespan in rest.py.

MQTT source config format in sources.yaml::

    sources:
      - id: meshtastic-home
        type: mqtt
        broker_host: 10.1.10.5
        broker_port: 1883        # optional, default 1883
        broker_username: ~       # optional
        broker_password: ~       # optional
        topics:
          - msh/#                # one or more topic patterns
        severity: INFO           # optional default severity for all messages

      - id: iot-sensors
        type: mqtt
        broker_host: localhost
        topics:
          - home/+/temperature
          - home/+/humidity
"""
from __future__ import annotations
import asyncio
import hashlib
import json
import logging
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
import yaml
from app.services.models import RetrievedEntry
logger = logging.getLogger(__name__)
def _load_mqtt_sources(sources_file: Path) -> list[dict]:
    """Return only the ``type: mqtt`` entries from sources.yaml.

    Args:
        sources_file: Path to a YAML file with a top-level ``sources`` list.

    Returns:
        The mapping entries whose ``type`` equals ``"mqtt"``, in file order.
        An empty list when the file does not exist, is empty, is not a
        mapping at top level, or has no usable ``sources`` list.
    """
    try:
        with sources_file.open(encoding="utf-8") as f:
            data = yaml.safe_load(f)
    except (FileNotFoundError, NotADirectoryError):
        return []

    # A YAML document may be empty, a scalar or a list at top level; only a
    # mapping can carry the "sources" key.
    if not isinstance(data, dict):
        return []

    # ``sources:`` with no value parses to None, so a ``.get`` default alone
    # would not protect the iteration below.
    entries = data.get("sources")
    if not isinstance(entries, list):
        if entries is not None:
            logger.warning(
                "Ignoring 'sources' in %s: expected a list, got %s",
                sources_file,
                type(entries).__name__,
            )
        return []

    # Non-mapping items (stray scalars) cannot be sources; skip them instead
    # of failing on ``.get``.
    return [s for s in entries if isinstance(s, dict) and s.get("type") == "mqtt"]
def _make_entry_id(source_id: str, seq: int, text: str) -> str:
    """Build a row id of the form ``<source_id>:<seq>:<digest>``.

    The digest is the first 16 hex characters of the SHA-1 of
    ``"<source_id>:<seq>:<text>"``, so identical arguments always produce
    the identical id.
    """
    prefix = f"{source_id}:{seq}"
    digest = hashlib.sha1(f"{prefix}:{text}".encode()).hexdigest()
    return f"{prefix}:{digest[:16]}"
def _write_entry(db_path: Path, entry: RetrievedEntry) -> None:
    """Insert one entry into the ``log_entries`` table and commit.

    The statement is ``INSERT OR IGNORE``, so a row that conflicts with a
    table constraint is skipped without raising. The table schema is not
    visible here; presumably ``id`` is unique — confirm against the schema.

    Args:
        db_path: Path of the SQLite database file.
        entry: Entry to persist. ``out_of_order`` is stored as 0/1 and
            ``matched_patterns`` as a JSON string.

    Raises:
        sqlite3.Error: If the database cannot be opened or the insert fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        # ``with conn`` only scopes the transaction (commit on success,
        # rollback on error); it does not close the connection. This function
        # runs once per message, so the connection is closed explicitly.
        with conn:
            conn.execute(
                """
                INSERT OR IGNORE INTO log_entries
                    (id, source_id, sequence, timestamp_raw, timestamp_iso,
                     ingest_time, severity, repeat_count, out_of_order,
                     matched_patterns, text)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    entry.entry_id,
                    entry.source_id,
                    entry.sequence,
                    entry.timestamp_raw,
                    entry.timestamp_iso,
                    entry.ingest_time,
                    entry.severity,
                    entry.repeat_count,
                    1 if entry.out_of_order else 0,
                    json.dumps(entry.matched_patterns),
                    entry.text,
                ),
            )
    finally:
        conn.close()
def _last_stored_sequence(db_path: Path, source_id: str) -> int:
    """Return the highest ``sequence`` already stored for *source_id*, else 0.

    Returns 0 when the database file does not exist yet, the table cannot be
    queried, or no row exists for the source.
    """
    if not db_path.exists():
        # Do not let sqlite3.connect create an empty database file as a side effect.
        return 0
    try:
        conn = sqlite3.connect(db_path)
        try:
            row = conn.execute(
                "SELECT MAX(sequence) FROM log_entries WHERE source_id = ?",
                (source_id,),
            ).fetchone()
        finally:
            conn.close()
    except sqlite3.Error:
        logger.warning(
            "MQTT[%s] could not read last sequence from %s; starting at 0",
            source_id,
            db_path,
            exc_info=True,
        )
        return 0
    last = row[0] if row else None
    return last if isinstance(last, int) else 0


async def _run_source_subscriber(source: dict, db_path: Path) -> None:
    """Subscribe to one MQTT source and persist every message it delivers.

    A handler is registered for each configured topic; every message becomes
    one ``RetrievedEntry`` written through ``_write_entry``. The coroutine
    then awaits ``client.run()``. No reconnect logic is implemented here;
    NOTE(review): any reconnect-on-error behaviour must come from
    ``MQTTClient.run`` — confirm against circuitforge-core.

    Args:
        source: One ``type: mqtt`` mapping from sources.yaml. ``id`` and
            ``broker_host`` are required; ``broker_port`` (default 1883),
            ``broker_username``/``username``, ``broker_password``/``password``,
            ``topics`` (default ``["#"]``) and ``severity`` (default
            ``"INFO"``) are optional.
        db_path: Path of the SQLite database entries are written to.

    Returns:
        None. Returns early, after logging an error, when the MQTT extra is
        not installed or a required key is missing.
    """
    try:
        from circuitforge_core.mqtt import MQTTClient, MQTTConfig
    except ImportError:
        logger.error(
            "circuitforge-core[mqtt] is not installed — MQTT source %r skipped. "
            "Run: pip install circuitforge-core[mqtt]",
            source.get("id"),
        )
        return

    # Skip a misconfigured source the same way a missing dependency is
    # skipped, rather than raising KeyError out of the task.
    missing = [key for key in ("id", "broker_host") if source.get(key) in (None, "")]
    if missing:
        logger.error(
            "MQTT source %r skipped: missing required key(s) %s",
            source.get("id"),
            ", ".join(missing),
        )
        return

    source_id: str = source["id"]
    host: str = source["broker_host"]
    # ``key: ~`` in YAML yields None, so fall back explicitly instead of
    # relying on the ``.get`` default.
    port: int = int(source.get("broker_port") or 1883)
    username: str | None = source.get("broker_username") or source.get("username")
    password: str | None = source.get("broker_password") or source.get("password")
    default_severity: str = source.get("severity") or "INFO"

    raw_topics = source.get("topics", ["#"])
    if raw_topics is None:
        raw_topics = ["#"]
    # A single topic written as a scalar would otherwise be iterated
    # character by character.
    topics: list[str] = [raw_topics] if isinstance(raw_topics, str) else list(raw_topics)

    cfg = MQTTConfig(
        host=host,
        port=port,
        username=username,
        password=password,
        client_id=f"turnstone-{source_id}",
    )
    client = MQTTClient(cfg)

    loop = asyncio.get_running_loop()
    # Entry ids are derived from (source, seq, text). Restarting the counter
    # at 0 on every process start would recreate ids from an earlier run when
    # a payload repeats, and INSERT OR IGNORE would then drop the new row.
    seq = await loop.run_in_executor(None, _last_stored_sequence, db_path, source_id)

    for topic in topics:

        @client.on(topic)
        async def _handle(msg, _src=source_id, _sev=default_severity):
            nonlocal seq
            seq += 1
            # Snapshot: seq may advance while the write below is awaited.
            current = seq
            now = datetime.now(tz=timezone.utc).isoformat()
            text = msg.text()
            entry = RetrievedEntry(
                entry_id=_make_entry_id(_src, current, text),
                source_id=_src,
                sequence=current,
                timestamp_raw=now,
                timestamp_iso=now,
                ingest_time=now,
                severity=_sev,
                repeat_count=1,
                out_of_order=False,
                matched_patterns=[],
                text=f"[{msg.topic}] {text}",
            )
            # SQLite I/O is blocking; run it in the default executor so the
            # event loop is not stalled on every message.
            await loop.run_in_executor(None, _write_entry, db_path, entry)
            logger.debug("MQTT[%s] %s: %s", _src, msg.topic, text[:120])

    logger.info(
        "MQTT subscriber starting: %s @ %s:%d topics=%s", source_id, host, port, topics
    )
    await client.run()
async def run_mqtt_subscribers(sources_file: Path, db_path: Path) -> None:
    """Start one subscriber task per MQTT source. Runs until cancelled.

    A subscriber that fails is logged and does not stop the others. When this
    coroutine is cancelled, every subscriber task is cancelled and awaited
    before the cancellation propagates.

    Args:
        sources_file: Path to sources.yaml.
        db_path: Path of the SQLite database passed to each subscriber.

    Returns:
        None, immediately when no MQTT sources are configured, otherwise once
        every subscriber task has finished.
    """
    sources = _load_mqtt_sources(sources_file)
    if not sources:
        logger.debug("No MQTT sources configured in %s", sources_file)
        return

    def _report_exit(task: asyncio.Task) -> None:
        # Log a failure when it happens; the gather below only returns after
        # every subscriber has finished.
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error("MQTT subscriber task %s failed", task.get_name(), exc_info=exc)

    logger.info("Starting %d MQTT subscriber(s)", len(sources))
    tasks = []
    for i, src in enumerate(sources):
        task = asyncio.create_task(
            _run_source_subscriber(src, db_path),
            name=f"mqtt-{src.get('id', i)}",
        )
        task.add_done_callback(_report_exit)
        tasks.append(task)

    try:
        # return_exceptions=True keeps one failing source from raising out of
        # the supervisor while its siblings are still running.
        await asyncio.gather(*tasks, return_exceptions=True)
    finally:
        # On cancellation (or any other exit) leave no subscriber running
        # unsupervised; cancelling an already finished task is a no-op.
        for t in tasks:
            t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)