"""Periodic batch glean scheduler with optional CF submission. Runs glean_sources on a configurable interval (TURNSTONE_GLEAN_INTERVAL env var, default 900s / 15 min). Set to 0 to disable. When TURNSTONE_SUBMIT_ENDPOINT is set, pushes pattern-matched entries to a remote Turnstone instance (the CF receiving store) after each glean run. """ from __future__ import annotations import asyncio import json import logging from app.db import get_conn, resolve_tenant_id from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone from pathlib import Path from typing import Any import httpx from app.glean.pipeline import glean_sources from app.tasks.anomaly_scorer import run_once as _run_scorer from app.tasks.cybersec_scorer import run_once as _run_cybersec from app.tasks.incident_detector import run_once as _run_incident_detector logger = logging.getLogger(__name__) _lock = asyncio.Lock() @dataclass class IngestState: last_run_at: str | None = None last_duration_s: float | None = None last_stats: dict[str, int] = field(default_factory=dict) last_error: str | None = None run_count: int = 0 next_run_at: str | None = None running: bool = False last_submitted_at: str | None = None last_submit_count: int = 0 last_submit_error: str | None = None _state = IngestState() def get_state() -> IngestState: return _state def _query_matched_since(db_path: Path, since: str | None) -> list[dict]: """Return entries with non-empty matched_patterns, optionally filtered by ingest_time.""" tid = resolve_tenant_id() with get_conn(db_path) as conn: if since: rows = conn.execute( """ SELECT id, source_id, sequence, timestamp_raw, timestamp_iso, ingest_time, severity, repeat_count, out_of_order, matched_patterns, text FROM log_entries WHERE matched_patterns != '[]' AND ingest_time > ? AND (tenant_id = ? OR tenant_id = '') ORDER BY ingest_time LIMIT 5000 """, (since, tid), ).fetchall() else: rows = conn.execute( """ SELECT id, source_id, sequence, timestamp_raw, timestamp_iso, ingest_time, severity, repeat_count, out_of_order, matched_patterns, text FROM log_entries WHERE matched_patterns != '[]' AND (tenant_id = ? OR tenant_id = '') ORDER BY ingest_time DESC LIMIT 5000 """, (tid,), ).fetchall() return [dict(r) for r in rows] async def submit_matched( db_path: Path, submit_endpoint: str, source_host: str, since: str | None = None, ) -> dict[str, Any]: """Push pattern-matched entries to the remote CF receiving instance.""" loop = asyncio.get_running_loop() entries = await loop.run_in_executor( None, lambda: _query_matched_since(db_path, since) ) if not entries: return {"ok": True, "submitted": 0, "skipped": True} url = f"{submit_endpoint.rstrip('/')}/turnstone/api/glean/batch" payload = {"source_host": source_host, "entries": entries} try: async with httpx.AsyncClient(timeout=30.0) as client: resp = await client.post(url, json=payload) resp.raise_for_status() result = resp.json() submitted = result.get("gleaned", len(entries)) _state.last_submitted_at = datetime.now(tz=timezone.utc).isoformat() _state.last_submit_count = submitted _state.last_submit_error = None logger.info("Submitted %d matched entries to %s", submitted, submit_endpoint) return {"ok": True, "submitted": submitted} except Exception as exc: _state.last_submit_error = str(exc) logger.warning("Submission to %s failed: %s", submit_endpoint, exc) return {"ok": False, "error": str(exc)} async def run_once( sources_file: Path, db_path: Path, pattern_file: Path | None = None, submit_endpoint: str | None = None, source_host: str = "unknown", force: bool = False, anomaly_model: str = "", anomaly_device: str = "cpu", anomaly_threshold: float = 0.75, cybersec_model: str = "", cybersec_device: str = "cpu", cybersec_threshold: float = 0.60, incidents_db_path: Path | None = None, auto_incident: bool = True, ) -> dict[str, Any]: """Ingest all sources once, then submit matched entries if configured. Pass ``force=True`` to bypass fingerprint checks and re-glean all local file sources regardless of whether they appear unchanged. """ if _lock.locked(): return {"ok": False, "error": "glean already running", "skipped": True} async with _lock: _state.running = True started = datetime.now(tz=timezone.utc) try: loop = asyncio.get_running_loop() stats: dict[str, int] = await loop.run_in_executor( None, lambda: glean_sources(sources_file, db_path, pattern_file, force=force), ) duration = (datetime.now(tz=timezone.utc) - started).total_seconds() _state.last_run_at = started.isoformat() _state.last_duration_s = round(duration, 2) _state.last_stats = stats _state.last_error = None _state.run_count += 1 logger.info("Batch glean complete in %.1fs — %s", duration, stats) except Exception as exc: duration = (datetime.now(tz=timezone.utc) - started).total_seconds() _state.last_run_at = started.isoformat() _state.last_duration_s = round(duration, 2) _state.last_error = str(exc) _state.run_count += 1 logger.error("Batch glean failed: %s", exc) _state.running = False return {"ok": False, "error": str(exc)} finally: _state.running = False if submit_endpoint: await submit_matched(db_path, submit_endpoint, source_host, since=_state.last_submitted_at) if anomaly_model: await _run_scorer(db_path, anomaly_model, anomaly_device, threshold=anomaly_threshold) if cybersec_model: await _run_cybersec(db_path, cybersec_model, cybersec_device, threshold=cybersec_threshold) if auto_incident and incidents_db_path: glean_started_iso = _state.last_run_at result = await _run_incident_detector(db_path, incidents_db_path, since=glean_started_iso) if result["created"]: logger.info("Incident detector: %d incident(s) auto-created", result["created"]) return {"ok": True, "stats": _state.last_stats, "duration_s": _state.last_duration_s} async def scheduler_loop( sources_file: Path, db_path: Path, pattern_file: Path | None, interval_s: int, submit_endpoint: str | None = None, source_host: str = "unknown", anomaly_model: str = "", anomaly_device: str = "cpu", anomaly_threshold: float = 0.75, cybersec_model: str = "", cybersec_device: str = "cpu", cybersec_threshold: float = 0.60, incidents_db_path: Path | None = None, auto_incident: bool = True, ) -> None: """Run glean + optional submission + optional anomaly/cybersec scoring every interval_s seconds.""" logger.info("Ingest scheduler started — interval %ds, sources: %s", interval_s, sources_file) if submit_endpoint: logger.info("Submission enabled — endpoint: %s", submit_endpoint) if anomaly_model: logger.info("Anomaly scoring enabled — model: %s", anomaly_model) if cybersec_model: logger.info("Cybersec scoring enabled — model: %s", cybersec_model) if auto_incident and incidents_db_path: logger.info("Auto-incident detection enabled") while True: await run_once( sources_file, db_path, pattern_file, submit_endpoint, source_host, anomaly_model=anomaly_model, anomaly_device=anomaly_device, anomaly_threshold=anomaly_threshold, cybersec_model=cybersec_model, cybersec_device=cybersec_device, cybersec_threshold=cybersec_threshold, incidents_db_path=incidents_db_path, auto_incident=auto_incident, ) next_run = datetime.now(tz=timezone.utc) + timedelta(seconds=interval_s) _state.next_run_at = next_run.isoformat() try: await asyncio.sleep(interval_s) except asyncio.CancelledError: logger.info("Ingest scheduler cancelled") _state.next_run_at = None raise