"""CLI: ingest a log file or corpus directory into the Turnstone SQLite database. Usage: # Single file or directory (legacy) python scripts/ingest_corpus.py [db_path] # Sources config (multi-service) python scripts/ingest_corpus.py --sources [--db ] """ from __future__ import annotations import logging import sys from pathlib import Path logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") sys.path.insert(0, str(Path(__file__).parent.parent)) from app.ingest.pipeline import ingest, ingest_file, ingest_sources def _print_stats(stats: dict[str, int]) -> None: total = sum(stats.values()) for source, count in sorted(stats.items()): print(f" {source}: {count:,}") print(f" TOTAL: {total:,} entries") if __name__ == "__main__": args = sys.argv[1:] if not args: print( "Usage:\n" " ingest_corpus.py [db_path]\n" " ingest_corpus.py --sources [--db ]", file=sys.stderr, ) sys.exit(1) if args[0] == "--sources": if len(args) < 2: print("Usage: ingest_corpus.py --sources [--db ]", file=sys.stderr) sys.exit(1) sources_file = Path(args[1]) db_path = Path("data/turnstone.db") if "--db" in args: db_path = Path(args[args.index("--db") + 1]) db_path.parent.mkdir(parents=True, exist_ok=True) print(f"Ingesting sources from {sources_file} → {db_path}") stats = ingest_sources(sources_file, db_path) _print_stats(stats) else: target = Path(args[0]) db_path = Path(args[1]) if len(args) > 1 else Path("data/turnstone.db") db_path.parent.mkdir(parents=True, exist_ok=True) print(f"Ingesting {target} → {db_path}") if target.is_file(): stats = ingest_file(target, db_path) elif target.is_dir(): stats = ingest(target, db_path) else: print(f"Error: {target} is not a file or directory", file=sys.stderr) sys.exit(1) _print_stats(stats)