turnstone/app/tasks/glean_scheduler.py
pyr0ball cffe6bcd31 feat: cybersec zero-shot scoring pipeline (#9)
Second-pass cybersec classifier using DeBERTa-v3-base-mnli (already
cached — no download required). Runs after each anomaly scoring pass on
entries flagged by the anomaly scorer or with pattern matches.

Architecture:
- app/services/cybersec.py: zero-shot-classification pipeline with 5
  cybersec candidate labels (auth failure, privilege escalation, network
  intrusion, malware, data exfiltration). Writes ml_score/ml_label/
  ml_scored_at to log_entries; inserts high-confidence hits into
  detections with scorer='cybersec'.
- app/tasks/cybersec_scorer.py: async background task (same shape as
  anomaly_scorer.py).
- REST: GET/POST /turnstone/api/cybersec/status|run|detections.
  GET /turnstone/api/anomaly/detections now accepts scorer= filter.

Schema: ml_score, ml_label, ml_scored_at added to log_entries; scorer
column added to detections (idempotent migrations + DDL for both SQLite
and Postgres).

UI: Security Alerts view gains Source dropdown (All / Anomaly / Cybersec)
and cybersec scorer status badge. Label dropdown split into optgroups.

Deployment: TURNSTONE_CYBERSEC_MODEL/DEVICE/THRESHOLD vars added to
.env.example, docker-compose.yml, docker-standalone.sh.

Tests: 10 new tests — no model, no eligible entries, scoring, detection
creation, normal label suppression, threshold filtering, pattern-tag
filtering, idempotency, list filtering, scorer column filter.
416/416 passing.

Closes: #9
2026-06-10 01:03:25 -07:00

222 lines
7.9 KiB
Python

"""Periodic batch glean scheduler with optional CF submission.
Runs glean_sources on a configurable interval (TURNSTONE_GLEAN_INTERVAL env var,
default 900s / 15 min). Set to 0 to disable.
When TURNSTONE_SUBMIT_ENDPOINT is set, pushes pattern-matched entries to a remote
Turnstone instance (the CF receiving store) after each glean run.
"""
from __future__ import annotations
import asyncio
import json
import logging
from app.db import get_conn, resolve_tenant_id
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
import httpx
from app.glean.pipeline import glean_sources
from app.tasks.anomaly_scorer import run_once as _run_scorer
from app.tasks.cybersec_scorer import run_once as _run_cybersec
logger = logging.getLogger(__name__)
# Guards the glean pass in run_once(). run_once() checks locked() first and
# returns a "skipped" result instead of waiting, so glean passes never overlap.
_lock = asyncio.Lock()
@dataclass
class IngestState:
    """Mutable status record for the glean scheduler.

    One module-level instance is updated in place by ``run_once``,
    ``submit_matched`` and ``scheduler_loop`` and returned by ``get_state``.
    """

    # ISO-8601 UTC start time of the most recent glean run (success or failure).
    last_run_at: str | None = None
    # Wall-clock duration of the most recent glean run, rounded to 2 decimals.
    last_duration_s: float | None = None
    # Stats returned by glean_sources on the most recent *successful* run;
    # left untouched when a run fails.
    last_stats: dict[str, int] = field(default_factory=dict)
    # str(exc) of the most recent failed glean run; reset to None on success.
    last_error: str | None = None
    # Number of glean runs attempted so far, counting failures.
    run_count: int = 0
    # ISO-8601 UTC time the scheduler plans to start the next run; reset to
    # None when the scheduler is cancelled during its sleep.
    next_run_at: str | None = None
    # True while a glean pass is in progress.
    running: bool = False
    # ISO-8601 UTC time of the most recent successful submission.
    last_submitted_at: str | None = None
    # Count from the most recent successful submission: the remote's
    # "gleaned" value, or the number of entries sent when that key is absent.
    last_submit_count: int = 0
    # str(exc) of the most recent failed submission; reset to None on success.
    last_submit_error: str | None = None
# Module-level scheduler status; mutated in place by the functions below.
_state = IngestState()


def get_state() -> IngestState:
    """Return the module-level scheduler state.

    The live object is returned, not a copy, so later updates made by the
    scheduler are visible through the same reference.
    """
    return _state
def _query_matched_since(db_path: Path, since: str | None) -> list[dict]:
    """Fetch pattern-matched log entries visible to the current tenant.

    Only rows whose ``matched_patterns`` is not the empty JSON list and whose
    ``tenant_id`` equals the resolved tenant (or is the empty string) are
    considered. At most 5000 rows are returned.

    Args:
        db_path: Database location handed to ``get_conn``.
        since: When truthy, only rows with ``ingest_time`` strictly greater
            than this value are returned, oldest first. When falsy, the 5000
            most recently ingested rows are returned, newest first.

    Returns:
        One plain ``dict`` per row, keyed by the selected column names.
    """
    tenant = resolve_tenant_id()

    # Choose the statement and its bind parameters up front so there is a
    # single execute/fetch path below.
    if since:
        sql = """
            SELECT id, source_id, sequence, timestamp_raw, timestamp_iso,
            ingest_time, severity, repeat_count, out_of_order,
            matched_patterns, text
            FROM log_entries
            WHERE matched_patterns != '[]'
            AND ingest_time > ?
            AND (tenant_id = ? OR tenant_id = '')
            ORDER BY ingest_time
            LIMIT 5000
            """
        params: tuple = (since, tenant)
    else:
        sql = """
            SELECT id, source_id, sequence, timestamp_raw, timestamp_iso,
            ingest_time, severity, repeat_count, out_of_order,
            matched_patterns, text
            FROM log_entries
            WHERE matched_patterns != '[]'
            AND (tenant_id = ? OR tenant_id = '')
            ORDER BY ingest_time DESC
            LIMIT 5000
            """
        params = (tenant,)

    with get_conn(db_path) as conn:
        fetched = conn.execute(sql, params).fetchall()
        return [dict(record) for record in fetched]
async def submit_matched(
    db_path: Path,
    submit_endpoint: str,
    source_host: str,
    since: str | None = None,
) -> dict[str, Any]:
    """Push pattern-matched entries to the remote CF receiving instance.

    The entries are read in the default executor so the database query does
    not block the event loop, then POSTed as JSON to
    ``<submit_endpoint>/turnstone/api/glean/batch``.

    Args:
        db_path: Database to read matched entries from.
        submit_endpoint: Base URL of the remote instance; a trailing slash is
            tolerated.
        source_host: Identifier sent as ``source_host`` in the payload.
        since: Forwarded to ``_query_matched_since`` as the lower bound on
            ``ingest_time``.

    Returns:
        ``{"ok": True, "submitted": 0, "skipped": True}`` when there is
        nothing to send, ``{"ok": True, "submitted": n}`` on success, or
        ``{"ok": False, "error": msg}`` when the request fails. Failures are
        logged and recorded on the shared state rather than raised.
    """

    def _load_entries() -> list[dict]:
        return _query_matched_since(db_path, since)

    entries = await asyncio.get_running_loop().run_in_executor(None, _load_entries)
    if not entries:
        return {"ok": True, "submitted": 0, "skipped": True}

    url = f"{submit_endpoint.rstrip('/')}/turnstone/api/glean/batch"
    payload = {"source_host": source_host, "entries": entries}

    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(url, json=payload)
            resp.raise_for_status()
            body = resp.json()

        # Prefer the remote's own count; fall back to what was sent.
        accepted = body.get("gleaned", len(entries))

        # NOTE(review): run_once passes last_submitted_at back in as `since`.
        # Because it is set to "now", not to the newest submitted
        # ingest_time, rows ingested in between (or beyond the 5000-row
        # cap) look like they would be skipped next time. Confirm intent.
        _state.last_submitted_at = datetime.now(tz=timezone.utc).isoformat()
        _state.last_submit_count = accepted
        _state.last_submit_error = None
        logger.info("Submitted %d matched entries to %s", accepted, submit_endpoint)
        return {"ok": True, "submitted": accepted}
    except Exception as exc:
        # Best-effort: a failed push must not abort the caller's glean cycle.
        message = str(exc)
        _state.last_submit_error = message
        logger.warning("Submission to %s failed: %s", submit_endpoint, exc)
        return {"ok": False, "error": message}
async def run_once(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    submit_endpoint: str | None = None,
    source_host: str = "unknown",
    force: bool = False,
    anomaly_model: str = "",
    anomaly_device: str = "cpu",
    anomaly_threshold: float = 0.75,
    cybersec_model: str = "",
    cybersec_device: str = "cpu",
    cybersec_threshold: float = 0.60,
) -> dict[str, Any]:
    """Glean every source once, then run the optional follow-up steps.

    The glean itself runs in the default executor under the module lock.
    After it succeeds, and only for the options that are set, the function
    submits matched entries, runs the anomaly scorer, and runs the cybersec
    scorer, in that order.

    Args:
        sources_file: Sources definition passed to ``glean_sources``.
        db_path: Database used by the glean, submission and scoring steps.
        pattern_file: Optional pattern file passed to ``glean_sources``.
        submit_endpoint: Remote base URL; submission is skipped when falsy.
        source_host: Host identifier included with submissions.
        force: Pass ``True`` to bypass fingerprint checks and re-glean all
            local file sources regardless of whether they appear unchanged.
        anomaly_model: Anomaly model name; scoring is skipped when empty.
        anomaly_device: Device string forwarded to the anomaly scorer.
        anomaly_threshold: Threshold forwarded to the anomaly scorer.
        cybersec_model: Cybersec model name; scoring is skipped when empty.
        cybersec_device: Device string forwarded to the cybersec scorer.
        cybersec_threshold: Threshold forwarded to the cybersec scorer.

    Returns:
        ``{"ok": False, "error": "glean already running", "skipped": True}``
        when another run holds the lock, ``{"ok": False, "error": msg}`` when
        the glean raises, otherwise ``{"ok": True, "stats": ...,
        "duration_s": ...}`` taken from the shared state.
    """
    # There is no await between this check and acquiring the lock, so a
    # second caller is turned away here rather than queued behind the first.
    if _lock.locked():
        return {"ok": False, "error": "glean already running", "skipped": True}

    async with _lock:
        _state.running = True
        began = datetime.now(tz=timezone.utc)

        def _record_run(error: str | None) -> float:
            # Bookkeeping shared by the success and failure paths.
            elapsed = (datetime.now(tz=timezone.utc) - began).total_seconds()
            _state.last_run_at = began.isoformat()
            _state.last_duration_s = round(elapsed, 2)
            _state.last_error = error
            _state.run_count += 1
            return elapsed

        def _glean() -> dict[str, int]:
            return glean_sources(sources_file, db_path, pattern_file, force=force)

        try:
            stats: dict[str, int] = await asyncio.get_running_loop().run_in_executor(
                None, _glean
            )
            elapsed = _record_run(None)
            _state.last_stats = stats
            logger.info("Batch glean complete in %.1fs — %s", elapsed, stats)
        except Exception as exc:
            _record_run(str(exc))
            logger.error("Batch glean failed: %s", exc)
            return {"ok": False, "error": str(exc)}
        finally:
            # Runs on success, failure and cancellation alike.
            _state.running = False

    # Follow-up steps; each is enabled solely by its argument being truthy.
    if submit_endpoint:
        await submit_matched(
            db_path, submit_endpoint, source_host, since=_state.last_submitted_at
        )
    if anomaly_model:
        await _run_scorer(
            db_path, anomaly_model, anomaly_device, threshold=anomaly_threshold
        )
    if cybersec_model:
        await _run_cybersec(
            db_path, cybersec_model, cybersec_device, threshold=cybersec_threshold
        )

    return {
        "ok": True,
        "stats": _state.last_stats,
        "duration_s": _state.last_duration_s,
    }
async def scheduler_loop(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None,
    interval_s: int,
    submit_endpoint: str | None = None,
    source_host: str = "unknown",
    anomaly_model: str = "",
    anomaly_device: str = "cpu",
    anomaly_threshold: float = 0.75,
    cybersec_model: str = "",
    cybersec_device: str = "cpu",
    cybersec_threshold: float = 0.60,
) -> None:
    """Run glean + optional submission + optional anomaly/cybersec scoring every interval_s seconds.

    Loops until the surrounding task is cancelled. Each iteration awaits
    ``run_once`` with the given settings, publishes the planned start of the
    next iteration in the shared state, then sleeps for ``interval_s``.

    Args:
        sources_file: Sources definition forwarded to ``run_once``.
        db_path: Database path forwarded to ``run_once``.
        pattern_file: Optional pattern file forwarded to ``run_once``.
        interval_s: Seconds to sleep between runs. A value of zero or less
            means "disabled": the function logs that and returns without
            running anything, instead of spinning in a tight loop.
        submit_endpoint: Remote base URL for submission, if any.
        source_host: Host identifier included with submissions.
        anomaly_model: Anomaly model name; empty disables anomaly scoring.
        anomaly_device: Device string for the anomaly scorer.
        anomaly_threshold: Threshold for the anomaly scorer.
        cybersec_model: Cybersec model name; empty disables cybersec scoring.
        cybersec_device: Device string for the cybersec scorer.
        cybersec_threshold: Threshold for the cybersec scorer.

    Raises:
        asyncio.CancelledError: Re-raised after logging and clearing
            ``next_run_at`` when the task is cancelled, whether that happens
            during a run or during the sleep.
    """
    if interval_s <= 0:
        logger.info("Ingest scheduler disabled — interval %ds", interval_s)
        return

    logger.info("Ingest scheduler started — interval %ds, sources: %s", interval_s, sources_file)
    if submit_endpoint:
        logger.info("Submission enabled — endpoint: %s", submit_endpoint)
    if anomaly_model:
        logger.info("Anomaly scoring enabled — model: %s", anomaly_model)
    if cybersec_model:
        logger.info("Cybersec scoring enabled — model: %s", cybersec_model)

    try:
        while True:
            try:
                await run_once(
                    sources_file, db_path, pattern_file, submit_endpoint, source_host,
                    anomaly_model=anomaly_model,
                    anomaly_device=anomaly_device,
                    anomaly_threshold=anomaly_threshold,
                    cybersec_model=cybersec_model,
                    cybersec_device=cybersec_device,
                    cybersec_threshold=cybersec_threshold,
                )
            except asyncio.CancelledError:
                # Explicit re-raise so cancellation is never mistaken for a
                # run failure on interpreters where it derives from Exception.
                raise
            except Exception:
                # One failed cycle (e.g. a scoring pass raising) must not end
                # periodic gleaning for the lifetime of the process.
                logger.exception("Scheduled glean run failed; will retry at next interval")

            next_run = datetime.now(tz=timezone.utc) + timedelta(seconds=interval_s)
            _state.next_run_at = next_run.isoformat()
            await asyncio.sleep(interval_s)
    except asyncio.CancelledError:
        logger.info("Ingest scheduler cancelled")
        _state.next_run_at = None
        raise