turnstone/app/mcp_server.py
pyr0ball 828b69768a refactor: rename ingest → glean throughout codebase
Renames the app/ingest/ package to app/glean/ and updates all
references across Python modules, shell scripts, Vue components,
tests, and documentation.

Intentionally preserved:
- SQLite column name ingest_time (avoids schema migration)
- RetrievedEntry.ingest_time field (maps to the column above)
- Any public-facing JSON keys that reference ingest_time

Changes by category:
- app/ingest/ → app/glean/ (full package move, all parsers)
- app/tasks/ingest_scheduler.py → app/tasks/glean_scheduler.py
- scripts/ingest_corpus.py → scripts/glean_corpus.py
- tests/test_ingest_*.py → tests/test_glean_*.py
- Docstrings, log messages, comments: ingest → glean
- Env var: TURNSTONE_INGEST_INTERVAL → TURNSTONE_GLEAN_INTERVAL
- Shell scripts: glean.log, glean_corpus.py references
- README.md: multi-source ingest → multi-source glean
- .env.example: updated env var name
- patterns/: new diagnostic patterns from 2026-05-20 SSH incident
  (service_crash_loop, pkg_daemon_restart, ssh_forward_conflict)
- SourcesView.vue: pipeline label updated
- All test import paths updated to app.glean.*

285 tests passing.
2026-05-20 23:02:55 -07:00

198 lines
6.6 KiB
Python

"""Turnstone MCP server — log search and diagnostics via stdio transport.
Works with both Claude Code and Copilot CLI (both use MCP stdio).
Run directly: python -m app.mcp_server
Or via: python app/mcp_server.py
Set the TURNSTONE_DB environment variable to override the default DB path.
"""
from __future__ import annotations
import logging
import os
import sqlite3
import sys
from pathlib import Path
from mcp.server.fastmcp import FastMCP
# stdout carries the MCP JSON-RPC stream, so every log record is routed to stderr.
logging.basicConfig(
    format="%(levelname)s %(message)s",
    level=logging.INFO,
    stream=sys.stderr,
)
logger = logging.getLogger(__name__)
# Put the directory two levels above this file (the project root that contains
# app/) at the front of sys.path, so the absolute `app.*` import that follows
# resolves when this file is executed directly (python app/mcp_server.py).
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.services.search import (
build_fts_index,
format_results,
list_sources,
search,
)
# Default database location: data/turnstone.db under the directory two levels
# above this file.
_DEFAULT_DB = Path(__file__).parent.parent / "data" / "turnstone.db"
# TURNSTONE_DB overrides the default. An empty value (e.g. a bare
# "TURNSTONE_DB=" line in an env file) is treated as unset; otherwise
# Path("") would silently resolve to the current directory.
DB_PATH = Path(os.environ.get("TURNSTONE_DB") or _DEFAULT_DB)
# The MCP server instance that the @mcp.tool() functions below register with.
# The instructions text is guidance for connecting clients, so it has to match
# what the tools really do: diagnose() merges its layered searches, removes
# duplicates, and orders the evidence chronologically.
mcp = FastMCP(
    "turnstone",
    instructions=(
        "Turnstone is a diagnostic intelligence layer for server, service, and device logs. "
        "Call list_log_sources first to understand what data is available and identify "
        "source_id values for filtering. Use search_logs for targeted FTS queries. "
        "Use diagnose for symptom-driven investigation — it runs layered searches and "
        "returns deduplicated evidence in chronological order."
    ),
)
# Process-wide flag: set once the FTS index has been verified or built, so
# later tool calls skip the probe entirely.
_index_ready = False


def _ensure_index() -> None:
    """Build the FTS index on first use; skip if it is already populated.

    Probes the ``log_fts`` table in ``DB_PATH`` with a ``COUNT(*)``. If the
    table has at least one row the index is considered ready. If the probe
    raises ``sqlite3.OperationalError`` or the table is empty,
    ``build_fts_index(DB_PATH)`` is called. Either way ``_index_ready`` is
    set so subsequent calls return immediately.
    """
    global _index_ready
    if _index_ready:
        return

    count = 0
    try:
        conn = sqlite3.connect(str(DB_PATH))
        try:
            count = conn.execute("SELECT COUNT(*) FROM log_fts").fetchone()[0]
        finally:
            # Close on the failure path too; previously a failing probe
            # query skipped close() and leaked the connection.
            conn.close()
    except sqlite3.OperationalError as exc:
        # The probe failed, so fall through to the build step below.
        logger.debug("FTS index probe failed (%s); index will be built", exc)

    if count > 0:
        _index_ready = True
        logger.info("FTS index present (%d entries)", count)
        return

    logger.info("Building FTS index for %s — this may take a minute...", DB_PATH)
    build_fts_index(DB_PATH)
    _index_ready = True
    logger.info("FTS index ready")
@mcp.tool()
def search_logs(
    query: str,
    severity: str | None = None,
    source: str | None = None,
    pattern: str | None = None,
    since: str | None = None,
    until: str | None = None,
    limit: int = 20,
    include_repeats: bool = False,
) -> str:
    """Search log entries using full-text search with optional filters.

    Args:
        query: FTS5 search expression. Supports AND, OR, NOT, phrase quotes, prefix*.
            Example: '"connection refused" OR "connection lost"'
        severity: Filter by level — EMERGENCY, ALERT, CRITICAL, ERROR, WARN, NOTICE, INFO, DEBUG.
        source: Partial match on source_id. Format is 'corpus:host:service'.
            Example: 'example-node:caddy' matches all Caddy entries from example-node.
        pattern: Filter by named pattern tag applied at glean time.
            Known tags: auth_failure, connection_lost, oom, segfault, disk_full,
            timeout, caddy_tls_error, caddy_config_error, caddy_auth_error,
            caddy_upstream_error, service_restart, service_update,
            power_failure, network_interface, ip_change.
        since: Earliest timestamp, ISO 8601 (e.g. 2026-05-06T18:00:00).
        until: Latest timestamp, ISO 8601.
        limit: Result count cap (default 20). Values are clamped to the range 1-100.
        include_repeats: Include deduplicated repeat log lines (default False).

    Returns:
        Formatted log entries ranked by FTS relevance, or a no-results message.
    """
    _ensure_index()

    # Clamp both ends. min() alone lets zero and negative values through, and
    # a negative SQL LIMIT means "no upper bound", which would defeat the
    # documented cap (assuming search() forwards limit to SQL — confirm).
    capped_limit = max(1, min(limit, 100))

    results = search(
        DB_PATH,
        query=query,
        severity=severity,
        source_filter=source,
        pattern_filter=pattern,
        since=since,
        until=until,
        limit=capped_limit,
        include_repeats=include_repeats,
    )
    return format_results(results)
@mcp.tool()
def diagnose(
    symptom: str,
    source: str | None = None,
    since: str | None = None,
    until: str | None = None,
) -> str:
    """Investigate a symptom or problem description across the log corpus.

    Runs layered searches — a broad relevance pass first, then passes
    restricted to CRITICAL and to ERROR — and merges the hits, deduplicated
    by entry ID and sorted chronologically.

    Use this for open-ended questions like "why did auth break?" or
    "what happened during the network outage?".

    Args:
        symptom: Plain-language description of the problem or symptom.
        source: Limit to a host or service (partial match on source_id).
        since: Earliest timestamp, ISO 8601.
        until: Latest timestamp, ISO 8601.

    Returns:
        Consolidated log evidence sorted by time, formatted for analysis.
    """
    _ensure_index()

    # Filters shared by every search layer.
    shared_filters = {
        "source_filter": source,
        "since": since,
        "until": until,
        "include_repeats": False,
    }

    # Layer 1: broad relevance pass with no severity restriction.
    hits = list(search(DB_PATH, query=symptom, limit=15, **shared_filters))
    # Layers 2 and 3: severity-restricted passes, CRITICAL before ERROR.
    for level, cap in (("CRITICAL", 5), ("ERROR", 10)):
        hits.extend(
            search(DB_PATH, query=symptom, severity=level, limit=cap, **shared_filters)
        )

    # Keep only the first occurrence of each entry ID (dicts preserve
    # insertion order, so the layer ordering above is retained).
    unique_by_id = {}
    for hit in hits:
        unique_by_id.setdefault(hit.entry_id, hit)

    def _chronological(hit):
        # Entries without a timestamp sort after all dated ones.
        return (hit.timestamp_iso or "\xff", hit.sequence)

    timeline = sorted(unique_by_id.values(), key=_chronological)
    return format_results(timeline[:20])
@mcp.tool()
def list_log_sources() -> str:
    """List all log sources in the corpus with entry counts and time ranges.

    Call this before searching to understand what data is available and to
    identify source_id values for use in the source= filter parameter.

    Returns:
        Each source on its own block with entry count, error count (when
        non-zero), and time range, or a hint to glean the corpus when no
        sources exist.
    """
    sources = list_sources(DB_PATH)
    if not sources:
        return "No log sources found. Has the corpus been gleaned? Run: python scripts/glean_corpus.py"

    lines = [f"Corpus: {DB_PATH}", f"Sources ({len(sources)} total):\n"]
    for s in sources:
        # Separate the error note from the entry count; the two numbers used
        # to be concatenated directly and were unreadable.
        err_note = f" ({s['error_count']:,} errors)" if s["error_count"] else ""
        earliest = s["earliest"] or "unknown"
        latest = s["latest"] or "unknown"
        lines.append(
            f" {s['source_id']}\n"
            f" entries: {s['entry_count']:,}{err_note}\n"
            # An explicit separator keeps the two timestamps from fusing.
            f" range: {earliest} to {latest}"
        )
    return "\n".join(lines)
if __name__ == "__main__":
    # Script entry point: serve only when the database file is present;
    # otherwise report how to create it and exit with status 1.
    if DB_PATH.exists():
        logger.info("Starting Turnstone MCP server (DB: %s)", DB_PATH)
        mcp.run()
    else:
        logger.error("Database not found: %s", DB_PATH)
        logger.error("Run: python scripts/glean_corpus.py <corpus_dir> <db_path>")
        sys.exit(1)