Ingest pipeline (journald / Caddy / Docker-wrapped formats) with per-source state tracking (repeat dedup, out-of-order detection), named pattern tagging at ingest time, and idempotent SHA1-keyed writes. FTS5 search layer with porter stemmer, severity/source/pattern/time filters, and BM25 ranking. MCP server (FastMCP stdio) with three tools: search_logs, diagnose, list_log_sources — compatible with both Claude Code and Copilot CLI. WAL mode enabled on all connections. FTS index auto-built after ingest. MCP configs included for Claude Code (.mcp.json) and Copilot CLI (.github/copilot/mcp.json).
28 lines
928 B
Python
28 lines
928 B
Python
"""CLI: ingest a corpus directory into the Turnstone SQLite database."""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

# Allow running from repo root: the directory two levels above this file is
# put first on sys.path so the `app` package resolves without installation.
sys.path.insert(0, str(Path(__file__).parent.parent))

# Defaults are relative to the current working directory.
DEFAULT_CORPUS_DIR = Path("corpus/raw")
DEFAULT_DB_PATH = Path("data/turnstone.db")
PATTERN_FILE = Path("patterns/default.yaml")


def main(argv: list[str] | None = None) -> int:
    """Ingest a corpus into the SQLite database and print per-file counts.

    Args:
        argv: Positional arguments, excluding the program name. The first is
            the corpus path and the second is the database path; either may
            be omitted to use the default. Extra arguments are ignored.
            When ``None``, ``sys.argv[1:]`` is used.

    Returns:
        Process exit status: 0 on success, 2 when the corpus path does not
        exist.
    """
    args = sys.argv[1:] if argv is None else argv
    corpus_dir = Path(args[0]) if len(args) > 0 else DEFAULT_CORPUS_DIR
    db_path = Path(args[1]) if len(args) > 1 else DEFAULT_DB_PATH

    # Validate before touching the filesystem so a mistyped corpus path does
    # not leave a freshly created database directory behind.
    if not corpus_dir.exists():
        print(f"error: corpus path not found: {corpus_dir}", file=sys.stderr)
        return 2

    # Deferred import: it depends on the sys.path tweak above, and keeping it
    # here avoids a mid-module import statement.
    from app.ingest.pipeline import ingest

    db_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"Ingesting {corpus_dir} → {db_path}")
    # `stats` is consumed below as a mapping of name -> entry count.
    stats = ingest(corpus_dir, db_path, PATTERN_FILE)

    for fname, count in sorted(stats.items()):
        print(f" {fname}: {count:,}")
    print(f" TOTAL: {sum(stats.values()):,} entries")
    return 0


if __name__ == "__main__":
    sys.exit(main())