diff --git a/app/glean/base.py b/app/glean/base.py index a222c9b..a22486e 100644 --- a/app/glean/base.py +++ b/app/glean/base.py @@ -33,6 +33,7 @@ def load_patterns(path: Path) -> list[LogPattern]: pattern=p["pattern"], severity=p["severity"], description=p["description"], + domain=p.get("domain", ""), ) for p in raw.get("patterns", []) ] diff --git a/app/rest.py b/app/rest.py index 2f7cac2..36f2d0c 100644 --- a/app/rest.py +++ b/app/rest.py @@ -129,11 +129,13 @@ if GPU_SERVER_URL: _watcher = Watcher(DB_PATH, PATTERN_FILE) _compiled_patterns: list = [] +# pattern name → domain; populated at startup from _compiled_patterns. +_pattern_domain: dict[str, str] = {} @asynccontextmanager async def _lifespan(app: FastAPI): - global _compiled_patterns + global _compiled_patterns, _pattern_domain # Route turnstone.audit through uvicorn's own handler so it appears in api.log. _audit_log.setLevel(logging.INFO) for h in logging.getLogger("uvicorn.error").handlers: @@ -147,6 +149,7 @@ async def _lifespan(app: FastAPI): "Migrated %d incident/bundle rows from main DB to incidents DB", migrated ) _compiled_patterns = load_compiled_patterns(PATTERN_FILE) + _pattern_domain = {p.name: p.domain for p, _ in _compiled_patterns if p.domain} watch_cfg_path = PATTERN_DIR / "watch.yaml" configs = load_watch_config(watch_cfg_path) if configs: @@ -324,6 +327,23 @@ def _check_api_key(request: Request) -> None: raise HTTPException(status_code=403, detail="Invalid API key") +def _domain_counts(results: list) -> dict[str, int]: + """Count hits per domain across a list of SearchResult objects. + + Uses the module-level _pattern_domain lookup built at startup. + Entries whose matched_patterns contain no known domain are skipped. + """ + counts: dict[str, int] = {} + for r in results: + seen_domains: set[str] = set() + for tag in (r.matched_patterns or []): + domain = _pattern_domain.get(tag, "") + if domain and domain not in seen_domains: + seen_domains.add(domain) + counts[domain] = counts.get(domain, 0) + 1 + return counts + + # API router — all routes accessible at /turnstone/api/* and /turnstone/health. router = APIRouter(prefix="/turnstone", dependencies=[Depends(_check_api_key)]) @@ -338,6 +358,7 @@ def search_logs( q: Annotated[str, Query(description="Search query")] = "", source: Annotated[str | None, Query(description="Filter by log source ID (partial match)")] = None, severity: Annotated[str | None, Query(description="Filter by severity (DEBUG/INFO/WARN/ERROR/CRITICAL)")] = None, + domain: Annotated[str | None, Query(description="Filter by service health domain (networking, storage, auth, etc.)")] = None, since: Annotated[str | None, Query(description="ISO timestamp lower bound")] = None, until: Annotated[str | None, Query(description="ISO timestamp upper bound")] = None, limit: Annotated[int, Query(ge=1, le=500)] = 50, @@ -355,6 +376,8 @@ def search_logs( limit=limit, semantic=semantic, ) + if domain: + results = [r for r in results if domain in {_pattern_domain.get(t, "") for t in r.matched_patterns}] return {"count": len(results), "results": [dataclasses.asdict(r) for r in results]} @@ -428,6 +451,7 @@ def diagnose_post(body: DiagnoseRequest) -> dict: "summary": { "total": 0, "window_start": None, "window_end": None, "time_detected": False, "by_severity": {}, "by_source": {}, + "by_domain": {}, }, "entries": [], } @@ -442,8 +466,9 @@ def diagnose_post(body: DiagnoseRequest) -> dict: llm_model=prefs.get("llm_model") or None, llm_api_key=prefs.get("llm_api_key") or None, ) + summary = {**result["summary"], "by_domain": _domain_counts(result["entries"])} return { - "summary": result["summary"], + "summary": summary, "reasoning": result.get("reasoning"), "entries": [dataclasses.asdict(r) for r in result["entries"]], } @@ -466,6 +491,7 @@ async def diagnose_post_stream(body: DiagnoseRequest) -> StreamingResponse: context_db_path=CONTEXT_DB_PATH, incidents_db_path=INCIDENTS_DB_PATH, tech_level=prefs.get("tech_level", "sysadmin"), + pattern_domain=_pattern_domain or None, ): yield f"data: {json.dumps(event)}\n\n" diff --git a/app/services/diagnose/__init__.py b/app/services/diagnose/__init__.py index 3af0d0f..8bbe11e 100644 --- a/app/services/diagnose/__init__.py +++ b/app/services/diagnose/__init__.py @@ -198,6 +198,7 @@ async def diagnose_stream( context_db_path: Path | None = None, incidents_db_path: Path | None = None, tech_level: str = "sysadmin", + pattern_domain: dict[str, str] | None = None, ) -> AsyncGenerator[dict[str, Any], None]: """Async generator yielding SSE event dicts for the diagnose pipeline. @@ -295,6 +296,16 @@ async def diagnose_stream( by_severity[sev] += 1 by_source[r.source_id] = by_source.get(r.source_id, 0) + 1 + by_domain: dict[str, int] = {} + if pattern_domain: + for r in combined: + seen: set[str] = set() + for tag in (r.matched_patterns or []): + d = pattern_domain.get(tag, "") + if d and d not in seen: + seen.add(d) + by_domain[d] = by_domain.get(d, 0) + 1 + yield { "type": "summary", "data": { @@ -304,6 +315,7 @@ async def diagnose_stream( "time_detected": time_detected, "by_severity": by_severity, "by_source": by_source, + "by_domain": by_domain, }, } yield {"type": "entries", "data": [dataclasses.asdict(r) for r in combined]} diff --git a/app/services/models.py b/app/services/models.py index 784b46d..136ef2c 100644 --- a/app/services/models.py +++ b/app/services/models.py @@ -31,6 +31,7 @@ class LogPattern: pattern: str # regex string severity: str # suggested severity if not present in log line description: str # human-readable explanation for the UI + domain: str = "" # service health domain (networking, storage, auth, etc.) @dataclass(frozen=True) diff --git a/patterns/default.yaml b/patterns/default.yaml index b308895..6142a7c 100644 --- a/patterns/default.yaml +++ b/patterns/default.yaml @@ -2,83 +2,101 @@ # Each matched pattern name is stored on RetrievedEntry.matched_patterns and # used to boost retrieval relevance for diagnostic queries. # -# Add domain-specific patterns here. Patterns are applied in order; multiple -# can match a single entry. +# domain: groups patterns into service health domains for triage-level summaries. +# Valid domains: service_health | networking | auth | storage | memory | +# kernel | power | web_proxy | media | gpu +# +# Patterns are applied in order; multiple can match a single entry. patterns: - name: service_restart pattern: "(restarting|restart requested|service.*start)" severity: WARN + domain: service_health description: Service restart detected - name: connection_lost pattern: "(connection (lost|dropped|refused|timed? out)|disconnect(ed)?)" severity: ERROR + domain: networking description: Network or device connection failure - name: auth_failure pattern: "(auth(entication)? (failed?|error|denied)|permission denied|unauthorized)" severity: ERROR + domain: auth description: Authentication or authorization failure - name: oom pattern: "(out of memory|OOM|killed process|cannot allocate)" severity: CRITICAL + domain: memory description: Out-of-memory condition - name: segfault pattern: "(segmentation fault|segfault|SIGSEGV|core dump)" severity: CRITICAL + domain: kernel description: Process crash or memory corruption - name: disk_full pattern: "(no space left|disk full|filesystem.*full|ENOSPC)" severity: ERROR + domain: storage description: Storage capacity exhausted - name: timeout pattern: "(timed? out|deadline exceeded|operation timed?)" severity: WARN + domain: networking description: Operation timeout - name: caddy_tls_error pattern: "(acme|certificate|tls).*(error|fail|invalid|expired|renew)" severity: ERROR + domain: web_proxy description: Caddy TLS or certificate error - name: caddy_config_error pattern: "(config|caddyfile|directive).*(error|invalid|unknown|unrecognized)" severity: ERROR + domain: web_proxy description: Caddy configuration error - name: caddy_auth_error pattern: "(forward_auth|basicauth|basic_auth).*(error|fail|denied|invalid|unreachable)" severity: ERROR + domain: web_proxy description: Caddy authentication middleware failure - name: caddy_upstream_error pattern: "(upstream|backend|reverse.proxy).*(error|fail|unreachable|refused|timeout)" severity: ERROR + domain: web_proxy description: Caddy upstream/backend failure - name: service_update pattern: "(upgraded?|updated?|installing|dpkg|apt|package).*(caddy|nginx|apache|proxy)" severity: INFO + domain: web_proxy description: Web server package update detected - name: power_failure pattern: "(power (fail|loss|outage|cut)|ups|battery|shutdown.*power|lost power)" severity: CRITICAL + domain: power description: Power failure or UPS event - name: network_interface pattern: "(eth[0-9]|ens[0-9]|enp[0-9]|wlan[0-9]).*(down|up|carrier|link)" severity: WARN + domain: networking description: Network interface state change - name: ip_change pattern: "(new ip|ip.*(changed|assigned|address)|dhcp.*(ack|offer|bound|renew))" severity: INFO + domain: networking description: IP address change or DHCP event # ── System / journald patterns ───────────────────────────────────────────── @@ -86,46 +104,55 @@ patterns: - name: systemd_fail pattern: "(Failed to start|failed with result|entered failed state|start request repeated too quickly|Main process exited)" severity: ERROR + domain: service_health description: systemd service failed to start or crashed - name: oom_kill pattern: "(Killed process|oom.kill|oom_kill_process|Out of memory: Kill|memory cgroup out of memory)" severity: CRITICAL + domain: memory description: Kernel OOM killer terminated a process - name: disk_hw_error pattern: "(ata[0-9]|sd[a-z]|nvme[0-9]).*(error|failed|reset|timeout|exception|EH|FAILED COMMAND)" severity: ERROR + domain: storage description: Storage device hardware error or reset - name: fs_error pattern: "(EXT4-fs error|XFS.*error|BTRFS.*error|I/O error|blk_update_request.*error|buffer I/O error)" severity: ERROR + domain: storage description: Filesystem or block I/O error - name: kernel_error pattern: "(kernel: BUG|kernel panic|Oops:|general protection fault|Call Trace|RIP:.*[0-9a-f]{16})" severity: CRITICAL + domain: kernel description: Kernel bug, panic, or oops — system may be unstable - name: ssh_brute pattern: "(Failed password|Invalid user|authentication failure|Connection closed by authenticating user).*(sshd|ssh)" severity: WARN + domain: auth description: SSH authentication failure — possible brute force - name: container_crash pattern: "(container.*exited|oci runtime.*error|podman.*error|docker.*error|container.*killed|OCI.*failed)" severity: ERROR + domain: service_health description: Container runtime error or unexpected exit - name: smart_error pattern: "(smartd|SMART.*error|reallocated sector|pending sector|uncorrectable sector|Current_Pending_Sector)" severity: CRITICAL + domain: storage description: SMART disk health warning — potential drive failure - name: nfs_error pattern: "(nfs.*error|nfs.*timeout|RPC.*timed out|nfs4.*server.*not responding|mount.*nfs.*failed)" severity: ERROR + domain: networking description: NFS mount or RPC timeout # Add device/service-specific patterns below this line: @@ -133,46 +160,55 @@ patterns: - name: qbit_tracker_error pattern: "(tracker|announce).*(not working|error|fail|unreachable|timeout|refused|invalid)" severity: WARN + domain: media description: qBittorrent tracker connection or announce failure - name: qbit_port_bind pattern: "(couldn't? listen|bind.*fail|port.*in use|listening.*fail)" severity: CRITICAL + domain: media description: qBittorrent failed to bind listen port — firewall or port conflict - name: qbit_disk_error pattern: "(cannot (write|open|create)|disk.*error|i/o error|file.*fail|write.*fail)" severity: ERROR + domain: media description: qBittorrent disk write or file access failure - name: qbit_hash_fail pattern: "(hash.*(check|fail|mismatch)|recheck|piece.*fail)" severity: WARN + domain: media description: qBittorrent torrent hash verification failure — possible corrupt data - name: qbit_peer_ban pattern: "(peer.*ban|banned.*peer|blocked.*peer)" severity: INFO + domain: media description: qBittorrent peer banned (encryption enforcement or bad actor) - name: qbit_download_complete pattern: "(download.*complet|torrent.*finish|has finished downloading)" severity: INFO + domain: media description: qBittorrent torrent download completed - name: qbit_ratio_limit pattern: "(ratio.*reach|seeding.*limit|stop.*seeding|upload.*limit)" severity: INFO + domain: media description: qBittorrent seeding ratio or time limit reached - name: qbit_session_error pattern: "(session.*error|couldn't? resume|resume.*fail|torrent.*error)" severity: ERROR + domain: media description: qBittorrent session or resume data error - name: plex_eae_failure pattern: "(EAE timeout|EAE not running|eac3_eae.*error reading output|Error submitting packet to decoder.*I/O error)" severity: ERROR + domain: media description: Plex EasyAudioEncoder (EAC3 Dolby audio transcoder) crashed — service restart required # - name: ext_device_device_error @@ -185,16 +221,19 @@ patterns: - name: vpn_tunnel_fail pattern: "(wg-quick@|wireguard|spirit-city-tunnel|cf-orch-tunnel|cf-tunnel|openvpn|vpn).*(failed|error|exit.code|timeout|connection reset)" severity: ERROR + domain: networking description: VPN or WireGuard tunnel service failed — remote node may be unreachable - name: vpn_handshake pattern: "(handshake|peer.*allowed|WireGuard|wg-quick).*(initiating|complete|timeout|fail|retrying)" severity: WARN + domain: networking description: WireGuard peer handshake event — track for timeout/retry patterns - name: dns_degraded pattern: "(degraded feature set|DNS.*fall.?back|resolver.*fail|NXDOMAIN|DNS.*timeout|SERVFAIL)" severity: WARN + domain: networking description: DNS resolver degradation or fallback — often precedes connectivity failures # ── GPU / NVIDIA driver patterns ─────────────────────────────────────────── @@ -202,16 +241,19 @@ patterns: - name: nvidia_api_mismatch pattern: "(NVRM: API mismatch|nvidia.*version mismatch|driver.*mismatch|kernel module.*mismatch)" severity: ERROR + domain: gpu description: NVIDIA kernel module version does not match userspace driver — GPU ops will fail until driver reinstalled - name: nvidia_xid pattern: "(NVRM: Xid|Xid.*(error|critical)|GPU.*Xid)" severity: CRITICAL + domain: gpu description: NVIDIA Xid error — GPU hardware fault or driver crash (check nvidia-smi error code) - name: nvidia_gpu_reset pattern: "(nvidia.*reset|GPU.*reset|NVRM.*reset|nvml.*error|NVLink.*fail)" severity: ERROR + domain: gpu description: NVIDIA GPU reset or NVLink fault — possible hardware instability # ── Power / thermal patterns ─────────────────────────────────────────────── @@ -219,14 +261,17 @@ patterns: - name: acpi_error pattern: "(ACPI.*failed|ACPI.*error|ACPI.*_DSM|acpi.*_PPC|ACPI BIOS Error)" severity: WARN + domain: kernel description: ACPI firmware evaluation failure — often harmless but can indicate BIOS/power management issues - name: thermal_throttle pattern: "(CPU.*throttl|thermal throttl|Package temp|TjMax|temperature.*critical|No RAPL|RAPL.*not available)" severity: WARN + domain: power description: CPU/GPU thermal throttling or thermal management subsystem unavailable - name: undervoltage pattern: "(under.?voltage|brownout|voltage.*(low|critical)|power supply.*insufficient)" severity: ERROR + domain: power description: Undervoltage event — instability risk, check PSU and cable connections diff --git a/tests/test_blocklist_endpoints.py b/tests/test_blocklist_endpoints.py index 0d89cae..ce9117f 100644 --- a/tests/test_blocklist_endpoints.py +++ b/tests/test_blocklist_endpoints.py @@ -19,7 +19,8 @@ def client(tmp_path): patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \ patch.object(rest_module, "INCIDENTS_DB_PATH", tmp_path / "incidents.db"), \ patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \ - patch.object(rest_module, "_compiled_patterns", []): + patch.object(rest_module, "_compiled_patterns", []), \ + patch.object(rest_module, "_pattern_domain", {}): with TestClient(rest_module.app, raise_server_exceptions=True) as c: yield c @@ -46,7 +47,8 @@ def client_with_candidate(tmp_path): patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \ patch.object(rest_module, "INCIDENTS_DB_PATH", tmp_path / "incidents.db"), \ patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \ - patch.object(rest_module, "_compiled_patterns", []): + patch.object(rest_module, "_compiled_patterns", []), \ + patch.object(rest_module, "_pattern_domain", {}): with TestClient(rest_module.app, raise_server_exceptions=True) as c: yield c, cid diff --git a/tests/test_glean_tautulli.py b/tests/test_glean_tautulli.py index 4b12b08..cb761d0 100644 --- a/tests/test_glean_tautulli.py +++ b/tests/test_glean_tautulli.py @@ -260,8 +260,10 @@ class TestEndpoint: ensure_schema(db) with patch.object(rest_module, "DB_PATH", db), \ + patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \ patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \ - patch.object(rest_module, "_compiled_patterns", []): + patch.object(rest_module, "_compiled_patterns", []), \ + patch.object(rest_module, "_pattern_domain", {}): with TestClient(rest_module.app, raise_server_exceptions=True) as c: yield c @@ -284,8 +286,10 @@ class TestEndpoint: prefs_path.write_text(_json.dumps({"tautulli_token": "secret"})) with patch.object(rest_module, "DB_PATH", db), \ + patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \ patch.object(rest_module, "PREFS_PATH", prefs_path), \ - patch.object(rest_module, "_compiled_patterns", []): + patch.object(rest_module, "_compiled_patterns", []), \ + patch.object(rest_module, "_pattern_domain", {}): with TestClient(rest_module.app, raise_server_exceptions=True) as c: resp = c.post( "/turnstone/api/glean/tautulli",