turnstone/tests/test_export_corpus.py
pyr0ball afcac6ff05 feat: periodic corpus export — push ERROR/CRITICAL entries and incidents to Avocet
Watermark-based batch export script (scripts/export_corpus.py) pushes up to 500
ERROR/CRITICAL entries and labeled incidents per run to AVOCET_CORPUS_ENDPOINT.
Uses SQLite rowid watermark (entry log) and ISO timestamp watermark (incidents).
Skips silently when AVOCET_CORPUS_ENDPOINT is not set. 19 tests. Closes turnstone#6.
2026-05-11 17:08:35 -07:00

263 lines
8.8 KiB
Python

"""Tests for scripts/export_corpus.py"""
from __future__ import annotations
import json
import sqlite3
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent))
sys.path.insert(0, str(Path(__file__).parent.parent / "scripts"))
import export_corpus as ec
# ── Helpers ────────────────────────────────────────────────────────────────────
def _make_db(tmp_path: Path) -> sqlite3.Connection:
db = tmp_path / "turnstone.db"
conn = sqlite3.connect(str(db))
conn.row_factory = sqlite3.Row
conn.execute("""
CREATE TABLE log_entries (
id TEXT PRIMARY KEY, source_id TEXT, sequence INTEGER,
timestamp_raw TEXT, timestamp_iso TEXT, ingest_time TEXT,
severity TEXT, repeat_count INTEGER DEFAULT 1,
out_of_order INTEGER DEFAULT 0,
matched_patterns TEXT DEFAULT '[]', text TEXT
)
""")
conn.execute("""
CREATE TABLE incidents (
id TEXT PRIMARY KEY, label TEXT, issue_type TEXT,
started_at TEXT, ended_at TEXT, notes TEXT,
created_at TEXT, severity TEXT DEFAULT 'medium'
)
""")
conn.commit()
return conn
def _insert_entry(conn: sqlite3.Connection, rowid_hint: str, severity: str = "ERROR") -> None:
conn.execute(
"INSERT INTO log_entries VALUES (?,?,?,?,?,?,?,?,?,?,?)",
(rowid_hint, "sonarr", 1, None, "2026-05-11T10:00:00", "2026-05-11T10:00:05",
severity, 1, 0, '[]', f"Log line for {rowid_hint}"),
)
conn.commit()
def _insert_incident(conn: sqlite3.Connection, created_at: str) -> None:
conn.execute(
"INSERT INTO incidents VALUES (?,?,?,?,?,?,?,?)",
(f"inc-{created_at}", "plex crash", "plex_crash", "2026-05-11T09:55:00",
"2026-05-11T10:05:00", "audio stopped", created_at, "high"),
)
conn.commit()
# ── Watermark helpers ──────────────────────────────────────────────────────────
def test_read_rowid_watermark_missing(tmp_path):
db = tmp_path / "t.db"
assert ec.read_rowid_watermark(db) == 0
def test_read_rowid_watermark_valid(tmp_path):
db = tmp_path / "t.db"
(tmp_path / "corpus_watermark.txt").write_text("42")
assert ec.read_rowid_watermark(db) == 42
def test_read_rowid_watermark_corrupt(tmp_path):
db = tmp_path / "t.db"
(tmp_path / "corpus_watermark.txt").write_text("not-a-number")
assert ec.read_rowid_watermark(db) == 0
def test_write_read_rowid_roundtrip(tmp_path):
db = tmp_path / "t.db"
ec.write_rowid_watermark(db, 99)
assert ec.read_rowid_watermark(db) == 99
def test_read_ts_watermark_missing(tmp_path):
db = tmp_path / "t.db"
assert ec.read_ts_watermark(db) == "1970-01-01T00:00:00"
def test_write_read_ts_roundtrip(tmp_path):
db = tmp_path / "t.db"
ec.write_ts_watermark(db, "2026-05-11T12:00:00")
assert ec.read_ts_watermark(db) == "2026-05-11T12:00:00"
# ── export_raw_entries ─────────────────────────────────────────────────────────
def test_export_raw_entries_empty(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
posted = []
with patch.object(ec, "post_batch", side_effect=lambda *a, **kw: posted.append(a[2])):
count = ec.export_raw_entries(conn, db, "http://avocet/", "tok")
assert count == 0
assert posted == []
def test_export_raw_entries_skips_warn(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
_insert_entry(conn, "w1", severity="WARN")
posted = []
with patch.object(ec, "post_batch", side_effect=lambda *a, **kw: posted.append(a[2])):
count = ec.export_raw_entries(conn, db, "http://avocet/", "tok")
assert count == 0
assert posted == []
def test_export_raw_entries_sends_errors(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
_insert_entry(conn, "e1", severity="ERROR")
_insert_entry(conn, "e2", severity="CRITICAL")
posted = []
with patch.object(ec, "post_batch", side_effect=lambda *a, **kw: posted.append(a[2])):
count = ec.export_raw_entries(conn, db, "http://avocet/", "tok")
assert count == 2
assert len(posted) == 1
payload = posted[0]
assert payload["batch_type"] == "raw_entries"
assert len(payload["entries"]) == 2
assert all(e["severity"] in ("ERROR", "CRITICAL") for e in payload["entries"])
def test_export_raw_entries_updates_watermark(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
_insert_entry(conn, "e1", severity="ERROR")
with patch.object(ec, "post_batch"):
ec.export_raw_entries(conn, db, "http://avocet/", "tok")
assert ec.read_rowid_watermark(db) > 0
def test_export_raw_entries_respects_watermark(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
_insert_entry(conn, "e1", severity="ERROR")
# set watermark past first entry
with patch.object(ec, "post_batch"):
ec.export_raw_entries(conn, db, "http://avocet/", "tok")
# insert a second entry and export again
_insert_entry(conn, "e2", severity="ERROR")
posted = []
with patch.object(ec, "post_batch", side_effect=lambda *a, **kw: posted.append(a[2])):
count = ec.export_raw_entries(conn, db, "http://avocet/", "tok")
assert count == 1
assert len(posted[0]["entries"]) == 1
assert posted[0]["entries"][0]["entry_id"] == "e2"
def test_export_raw_entries_no_watermark_update_on_failure(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
_insert_entry(conn, "e1", severity="ERROR")
import httpx as _httpx
mock_resp = MagicMock()
mock_resp.status_code = 403
mock_resp.text = "Forbidden"
exc = _httpx.HTTPStatusError("403", request=MagicMock(), response=mock_resp)
with patch.object(ec, "post_batch", side_effect=exc):
with pytest.raises(_httpx.HTTPStatusError):
ec.export_raw_entries(conn, db, "http://avocet/", "tok")
assert ec.read_rowid_watermark(db) == 0
# ── export_incidents ───────────────────────────────────────────────────────────
def test_export_incidents_empty(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
posted = []
with patch.object(ec, "post_batch", side_effect=lambda *a, **kw: posted.append(a[2])):
count = ec.export_incidents(conn, db, "http://avocet/", "tok")
assert count == 0
assert posted == []
def test_export_incidents_sends_payload(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
_insert_incident(conn, "2026-05-11T10:00:00")
posted = []
with patch.object(ec, "post_batch", side_effect=lambda *a, **kw: posted.append(a[2])):
count = ec.export_incidents(conn, db, "http://avocet/", "tok")
assert count == 1
payload = posted[0]
assert payload["batch_type"] == "incident_bundles"
assert payload["entries"][0]["label"] == "plex crash"
def test_export_incidents_updates_watermark(tmp_path):
conn = _make_db(tmp_path)
db = tmp_path / "turnstone.db"
_insert_incident(conn, "2026-05-11T10:00:00")
with patch.object(ec, "post_batch"):
ec.export_incidents(conn, db, "http://avocet/", "tok")
assert ec.read_ts_watermark(db) == "2026-05-11T10:00:00"
# ── main ───────────────────────────────────────────────────────────────────────
def test_main_no_endpoint(monkeypatch):
monkeypatch.setattr(ec, "CORPUS_ENDPOINT", "")
assert ec.main() == 0
def test_main_no_token(monkeypatch):
monkeypatch.setattr(ec, "CORPUS_ENDPOINT", "http://avocet/")
monkeypatch.setattr(ec, "CONSENT_TOKEN", "")
assert ec.main() == 1
def test_main_db_missing(monkeypatch, tmp_path):
monkeypatch.setattr(ec, "CORPUS_ENDPOINT", "http://avocet/")
monkeypatch.setattr(ec, "CONSENT_TOKEN", "tok")
monkeypatch.setattr(ec, "DB_PATH", tmp_path / "missing.db")
assert ec.main() == 1
def test_main_success(monkeypatch, tmp_path):
conn = _make_db(tmp_path)
conn.close()
db = tmp_path / "turnstone.db"
monkeypatch.setattr(ec, "CORPUS_ENDPOINT", "http://avocet/")
monkeypatch.setattr(ec, "CONSENT_TOKEN", "tok")
monkeypatch.setattr(ec, "DB_PATH", db)
with patch.object(ec, "post_batch"):
result = ec.main()
assert result == 0