turnstone/scripts/glean_corpus.py
pyr0ball 688224840a fix(glean): add timeout=30s to all pipeline DB connections; add --force flag; new patterns
pipeline.py:
- Add timeout=30.0 to all sqlite3.connect() calls (5 total).
  Previously only ensure_context_schema() had it. The main glean
  writers would fail immediately under lock contention from the live
  watcher or concurrent manual glean runs.

glean_corpus.py:
- Add --force flag (passed through to glean_sources/glean_file/glean_dir).
  Without it, unchanged-fingerprint files were silently skipped even
  after pattern updates. Use after editing patterns/default.yaml.

patterns/default.yaml:
- Add 9 new patterns for Muninn / cluster-wide coverage:
    vpn_tunnel_fail     WireGuard/tunnel service failures
    vpn_handshake       WireGuard peer handshake events
    dns_degraded        systemd-resolved DNS fallback/degradation
    nvidia_api_mismatch NVIDIA kernel module vs userspace mismatch
    nvidia_xid          NVIDIA Xid GPU hardware faults
    nvidia_gpu_reset    NVIDIA GPU reset / NVLink faults
    acpi_error          ACPI firmware _DSM evaluation failures
    thermal_throttle    CPU/GPU thermal throttling / RAPL unavailable
    undervoltage        PSU undervoltage / brownout events
- Sync from /devl/turnstone-cluster/patterns/default.yaml (authoritative
  live copy updated first; repo copy was stale)
2026-05-26 22:36:45 -07:00

73 lines
2.5 KiB
Python

"""CLI: glean a log file or corpus directory into the Turnstone SQLite database.

Usage:
    # Single file or directory (legacy)
    python scripts/glean_corpus.py <file_or_dir> [db_path] [--force]

    # Sources config (multi-service)
    python scripts/glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]

Options:
    --force    Bypass fingerprint checks and re-glean all files, re-applying
               all patterns. Use after updating patterns/default.yaml.
"""
from __future__ import annotations
import logging
import sys
from pathlib import Path
# Configure the root logger at INFO so log records emitted during a run are
# shown, prefixed with their level name.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
# Put the directory two levels above this file at the front of sys.path. This
# must run before the `app` import below so that import can resolve when the
# script is launched directly (presumably that directory is the repo root,
# given the `python scripts/glean_corpus.py` usage -- confirm against layout).
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.glean.pipeline import glean_dir, glean_file, glean_sources
def _print_stats(stats: dict[str, int]) -> None:
    """Print one line per source with its entry count, then a grand total.

    Args:
        stats: Mapping of source name to the number of entries for it.

    Sources are listed in sorted key order and counts use thousands
    separators. An empty mapping prints only the total line (0 entries).
    """
    total = sum(stats.values())
    for source, count in sorted(stats.items()):
        print(f" {source}: {count:,}")
    print(f" TOTAL: {total:,} entries")
def main(argv: list[str]) -> int:
    """Run the glean CLI on *argv* (the arguments after the script name).

    Two invocation forms are supported:

    * ``<file_or_dir> [db_path] [--force]`` -- glean a single file or a
      directory.
    * ``--sources <sources.yaml> [--db <db_path>] [--force]`` -- glean from
      a sources config file.

    ``--force`` may appear anywhere and is forwarded as ``force=True`` to the
    glean function. When no database path is given, ``data/turnstone.db`` is
    used. The parent directory of the database path is created if missing.

    Args:
        argv: Command-line arguments, excluding the program name.

    Returns:
        Process exit status: 0 on success, 1 on a usage or input error.
    """
    usage = (
        "Usage:\n"
        " glean_corpus.py <file_or_dir> [db_path] [--force]\n"
        " glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]"
    )
    sources_usage = (
        "Usage: glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]"
    )
    default_db = Path("data/turnstone.db")

    force = "--force" in argv
    args = [a for a in argv if a != "--force"]
    # Check emptiness only after stripping --force: a lone "--force" must be
    # reported as a usage error rather than crash on args[0].
    if not args:
        print(usage, file=sys.stderr)
        return 1

    if args[0] == "--sources":
        if len(args) < 2:
            print(sources_usage, file=sys.stderr)
            return 1
        sources_file = Path(args[1])
        db_path = default_db
        if "--db" in args:
            value_index = args.index("--db") + 1
            # "--db" given as the final argument has no value to read.
            if value_index >= len(args):
                print(sources_usage, file=sys.stderr)
                return 1
            db_path = Path(args[value_index])
        db_path.parent.mkdir(parents=True, exist_ok=True)
        print(f"Gleaning sources from {sources_file} -> {db_path}")
        _print_stats(glean_sources(sources_file, db_path, force=force))
        return 0

    target = Path(args[0])
    db_path = Path(args[1]) if len(args) > 1 else default_db
    # Validate the target before touching the filesystem so a bad path does
    # not leave a freshly created database directory behind.
    if target.is_file():
        glean = glean_file
    elif target.is_dir():
        glean = glean_dir
    else:
        print(f"Error: {target} is not a file or directory", file=sys.stderr)
        return 1
    db_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"Gleaning {target} -> {db_path}")
    _print_stats(glean(target, db_path, force=force))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))