Merge feat/32-domain-view: domain-view mapping for patterns and diagnose output (#32)
This commit is contained in:
commit
6d60be5e66
7 changed files with 99 additions and 8 deletions
|
|
@ -33,6 +33,7 @@ def load_patterns(path: Path) -> list[LogPattern]:
|
||||||
pattern=p["pattern"],
|
pattern=p["pattern"],
|
||||||
severity=p["severity"],
|
severity=p["severity"],
|
||||||
description=p["description"],
|
description=p["description"],
|
||||||
|
domain=p.get("domain", ""),
|
||||||
)
|
)
|
||||||
for p in raw.get("patterns", [])
|
for p in raw.get("patterns", [])
|
||||||
]
|
]
|
||||||
|
|
|
||||||
30
app/rest.py
30
app/rest.py
|
|
@ -129,11 +129,13 @@ if GPU_SERVER_URL:
|
||||||
|
|
||||||
_watcher = Watcher(DB_PATH, PATTERN_FILE)
|
_watcher = Watcher(DB_PATH, PATTERN_FILE)
|
||||||
_compiled_patterns: list = []
|
_compiled_patterns: list = []
|
||||||
|
# pattern name → domain; populated at startup from _compiled_patterns.
|
||||||
|
_pattern_domain: dict[str, str] = {}
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def _lifespan(app: FastAPI):
|
async def _lifespan(app: FastAPI):
|
||||||
global _compiled_patterns
|
global _compiled_patterns, _pattern_domain
|
||||||
# Route turnstone.audit through uvicorn's own handler so it appears in api.log.
|
# Route turnstone.audit through uvicorn's own handler so it appears in api.log.
|
||||||
_audit_log.setLevel(logging.INFO)
|
_audit_log.setLevel(logging.INFO)
|
||||||
for h in logging.getLogger("uvicorn.error").handlers:
|
for h in logging.getLogger("uvicorn.error").handlers:
|
||||||
|
|
@ -147,6 +149,7 @@ async def _lifespan(app: FastAPI):
|
||||||
"Migrated %d incident/bundle rows from main DB to incidents DB", migrated
|
"Migrated %d incident/bundle rows from main DB to incidents DB", migrated
|
||||||
)
|
)
|
||||||
_compiled_patterns = load_compiled_patterns(PATTERN_FILE)
|
_compiled_patterns = load_compiled_patterns(PATTERN_FILE)
|
||||||
|
_pattern_domain = {p.name: p.domain for p, _ in _compiled_patterns if p.domain}
|
||||||
watch_cfg_path = PATTERN_DIR / "watch.yaml"
|
watch_cfg_path = PATTERN_DIR / "watch.yaml"
|
||||||
configs = load_watch_config(watch_cfg_path)
|
configs = load_watch_config(watch_cfg_path)
|
||||||
if configs:
|
if configs:
|
||||||
|
|
@ -324,6 +327,23 @@ def _check_api_key(request: Request) -> None:
|
||||||
raise HTTPException(status_code=403, detail="Invalid API key")
|
raise HTTPException(status_code=403, detail="Invalid API key")
|
||||||
|
|
||||||
|
|
||||||
|
def _domain_counts(results: list) -> dict[str, int]:
|
||||||
|
"""Count hits per domain across a list of SearchResult objects.
|
||||||
|
|
||||||
|
Uses the module-level _pattern_domain lookup built at startup.
|
||||||
|
Entries whose matched_patterns contain no known domain are skipped.
|
||||||
|
"""
|
||||||
|
counts: dict[str, int] = {}
|
||||||
|
for r in results:
|
||||||
|
seen_domains: set[str] = set()
|
||||||
|
for tag in (r.matched_patterns or []):
|
||||||
|
domain = _pattern_domain.get(tag, "")
|
||||||
|
if domain and domain not in seen_domains:
|
||||||
|
seen_domains.add(domain)
|
||||||
|
counts[domain] = counts.get(domain, 0) + 1
|
||||||
|
return counts
|
||||||
|
|
||||||
|
|
||||||
# API router — all routes accessible at /turnstone/api/* and /turnstone/health.
|
# API router — all routes accessible at /turnstone/api/* and /turnstone/health.
|
||||||
router = APIRouter(prefix="/turnstone", dependencies=[Depends(_check_api_key)])
|
router = APIRouter(prefix="/turnstone", dependencies=[Depends(_check_api_key)])
|
||||||
|
|
||||||
|
|
@ -338,6 +358,7 @@ def search_logs(
|
||||||
q: Annotated[str, Query(description="Search query")] = "",
|
q: Annotated[str, Query(description="Search query")] = "",
|
||||||
source: Annotated[str | None, Query(description="Filter by log source ID (partial match)")] = None,
|
source: Annotated[str | None, Query(description="Filter by log source ID (partial match)")] = None,
|
||||||
severity: Annotated[str | None, Query(description="Filter by severity (DEBUG/INFO/WARN/ERROR/CRITICAL)")] = None,
|
severity: Annotated[str | None, Query(description="Filter by severity (DEBUG/INFO/WARN/ERROR/CRITICAL)")] = None,
|
||||||
|
domain: Annotated[str | None, Query(description="Filter by service health domain (networking, storage, auth, etc.)")] = None,
|
||||||
since: Annotated[str | None, Query(description="ISO timestamp lower bound")] = None,
|
since: Annotated[str | None, Query(description="ISO timestamp lower bound")] = None,
|
||||||
until: Annotated[str | None, Query(description="ISO timestamp upper bound")] = None,
|
until: Annotated[str | None, Query(description="ISO timestamp upper bound")] = None,
|
||||||
limit: Annotated[int, Query(ge=1, le=500)] = 50,
|
limit: Annotated[int, Query(ge=1, le=500)] = 50,
|
||||||
|
|
@ -355,6 +376,8 @@ def search_logs(
|
||||||
limit=limit,
|
limit=limit,
|
||||||
semantic=semantic,
|
semantic=semantic,
|
||||||
)
|
)
|
||||||
|
if domain:
|
||||||
|
results = [r for r in results if domain in {_pattern_domain.get(t, "") for t in r.matched_patterns}]
|
||||||
return {"count": len(results), "results": [dataclasses.asdict(r) for r in results]}
|
return {"count": len(results), "results": [dataclasses.asdict(r) for r in results]}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -428,6 +451,7 @@ def diagnose_post(body: DiagnoseRequest) -> dict:
|
||||||
"summary": {
|
"summary": {
|
||||||
"total": 0, "window_start": None, "window_end": None,
|
"total": 0, "window_start": None, "window_end": None,
|
||||||
"time_detected": False, "by_severity": {}, "by_source": {},
|
"time_detected": False, "by_severity": {}, "by_source": {},
|
||||||
|
"by_domain": {},
|
||||||
},
|
},
|
||||||
"entries": [],
|
"entries": [],
|
||||||
}
|
}
|
||||||
|
|
@ -442,8 +466,9 @@ def diagnose_post(body: DiagnoseRequest) -> dict:
|
||||||
llm_model=prefs.get("llm_model") or None,
|
llm_model=prefs.get("llm_model") or None,
|
||||||
llm_api_key=prefs.get("llm_api_key") or None,
|
llm_api_key=prefs.get("llm_api_key") or None,
|
||||||
)
|
)
|
||||||
|
summary = {**result["summary"], "by_domain": _domain_counts(result["entries"])}
|
||||||
return {
|
return {
|
||||||
"summary": result["summary"],
|
"summary": summary,
|
||||||
"reasoning": result.get("reasoning"),
|
"reasoning": result.get("reasoning"),
|
||||||
"entries": [dataclasses.asdict(r) for r in result["entries"]],
|
"entries": [dataclasses.asdict(r) for r in result["entries"]],
|
||||||
}
|
}
|
||||||
|
|
@ -466,6 +491,7 @@ async def diagnose_post_stream(body: DiagnoseRequest) -> StreamingResponse:
|
||||||
context_db_path=CONTEXT_DB_PATH,
|
context_db_path=CONTEXT_DB_PATH,
|
||||||
incidents_db_path=INCIDENTS_DB_PATH,
|
incidents_db_path=INCIDENTS_DB_PATH,
|
||||||
tech_level=prefs.get("tech_level", "sysadmin"),
|
tech_level=prefs.get("tech_level", "sysadmin"),
|
||||||
|
pattern_domain=_pattern_domain or None,
|
||||||
):
|
):
|
||||||
yield f"data: {json.dumps(event)}\n\n"
|
yield f"data: {json.dumps(event)}\n\n"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -198,6 +198,7 @@ async def diagnose_stream(
|
||||||
context_db_path: Path | None = None,
|
context_db_path: Path | None = None,
|
||||||
incidents_db_path: Path | None = None,
|
incidents_db_path: Path | None = None,
|
||||||
tech_level: str = "sysadmin",
|
tech_level: str = "sysadmin",
|
||||||
|
pattern_domain: dict[str, str] | None = None,
|
||||||
) -> AsyncGenerator[dict[str, Any], None]:
|
) -> AsyncGenerator[dict[str, Any], None]:
|
||||||
"""Async generator yielding SSE event dicts for the diagnose pipeline.
|
"""Async generator yielding SSE event dicts for the diagnose pipeline.
|
||||||
|
|
||||||
|
|
@ -295,6 +296,16 @@ async def diagnose_stream(
|
||||||
by_severity[sev] += 1
|
by_severity[sev] += 1
|
||||||
by_source[r.source_id] = by_source.get(r.source_id, 0) + 1
|
by_source[r.source_id] = by_source.get(r.source_id, 0) + 1
|
||||||
|
|
||||||
|
by_domain: dict[str, int] = {}
|
||||||
|
if pattern_domain:
|
||||||
|
for r in combined:
|
||||||
|
seen: set[str] = set()
|
||||||
|
for tag in (r.matched_patterns or []):
|
||||||
|
d = pattern_domain.get(tag, "")
|
||||||
|
if d and d not in seen:
|
||||||
|
seen.add(d)
|
||||||
|
by_domain[d] = by_domain.get(d, 0) + 1
|
||||||
|
|
||||||
yield {
|
yield {
|
||||||
"type": "summary",
|
"type": "summary",
|
||||||
"data": {
|
"data": {
|
||||||
|
|
@ -304,6 +315,7 @@ async def diagnose_stream(
|
||||||
"time_detected": time_detected,
|
"time_detected": time_detected,
|
||||||
"by_severity": by_severity,
|
"by_severity": by_severity,
|
||||||
"by_source": by_source,
|
"by_source": by_source,
|
||||||
|
"by_domain": by_domain,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
yield {"type": "entries", "data": [dataclasses.asdict(r) for r in combined]}
|
yield {"type": "entries", "data": [dataclasses.asdict(r) for r in combined]}
|
||||||
|
|
|
||||||
|
|
@ -31,6 +31,7 @@ class LogPattern:
|
||||||
pattern: str # regex string
|
pattern: str # regex string
|
||||||
severity: str # suggested severity if not present in log line
|
severity: str # suggested severity if not present in log line
|
||||||
description: str # human-readable explanation for the UI
|
description: str # human-readable explanation for the UI
|
||||||
|
domain: str = "" # service health domain (networking, storage, auth, etc.)
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
|
|
||||||
|
|
@ -2,83 +2,101 @@
|
||||||
# Each matched pattern name is stored on RetrievedEntry.matched_patterns and
|
# Each matched pattern name is stored on RetrievedEntry.matched_patterns and
|
||||||
# used to boost retrieval relevance for diagnostic queries.
|
# used to boost retrieval relevance for diagnostic queries.
|
||||||
#
|
#
|
||||||
# Add domain-specific patterns here. Patterns are applied in order; multiple
|
# domain: groups patterns into service health domains for triage-level summaries.
|
||||||
# can match a single entry.
|
# Valid domains: service_health | networking | auth | storage | memory |
|
||||||
|
# kernel | power | web_proxy | media | gpu
|
||||||
|
#
|
||||||
|
# Patterns are applied in order; multiple can match a single entry.
|
||||||
|
|
||||||
patterns:
|
patterns:
|
||||||
- name: service_restart
|
- name: service_restart
|
||||||
pattern: "(restarting|restart requested|service.*start)"
|
pattern: "(restarting|restart requested|service.*start)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: service_health
|
||||||
description: Service restart detected
|
description: Service restart detected
|
||||||
|
|
||||||
- name: connection_lost
|
- name: connection_lost
|
||||||
pattern: "(connection (lost|dropped|refused|timed? out)|disconnect(ed)?)"
|
pattern: "(connection (lost|dropped|refused|timed? out)|disconnect(ed)?)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: networking
|
||||||
description: Network or device connection failure
|
description: Network or device connection failure
|
||||||
|
|
||||||
- name: auth_failure
|
- name: auth_failure
|
||||||
pattern: "(auth(entication)? (failed?|error|denied)|permission denied|unauthorized)"
|
pattern: "(auth(entication)? (failed?|error|denied)|permission denied|unauthorized)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: auth
|
||||||
description: Authentication or authorization failure
|
description: Authentication or authorization failure
|
||||||
|
|
||||||
- name: oom
|
- name: oom
|
||||||
pattern: "(out of memory|OOM|killed process|cannot allocate)"
|
pattern: "(out of memory|OOM|killed process|cannot allocate)"
|
||||||
severity: CRITICAL
|
severity: CRITICAL
|
||||||
|
domain: memory
|
||||||
description: Out-of-memory condition
|
description: Out-of-memory condition
|
||||||
|
|
||||||
- name: segfault
|
- name: segfault
|
||||||
pattern: "(segmentation fault|segfault|SIGSEGV|core dump)"
|
pattern: "(segmentation fault|segfault|SIGSEGV|core dump)"
|
||||||
severity: CRITICAL
|
severity: CRITICAL
|
||||||
|
domain: kernel
|
||||||
description: Process crash or memory corruption
|
description: Process crash or memory corruption
|
||||||
|
|
||||||
- name: disk_full
|
- name: disk_full
|
||||||
pattern: "(no space left|disk full|filesystem.*full|ENOSPC)"
|
pattern: "(no space left|disk full|filesystem.*full|ENOSPC)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: storage
|
||||||
description: Storage capacity exhausted
|
description: Storage capacity exhausted
|
||||||
|
|
||||||
- name: timeout
|
- name: timeout
|
||||||
pattern: "(timed? out|deadline exceeded|operation timed?)"
|
pattern: "(timed? out|deadline exceeded|operation timed?)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: networking
|
||||||
description: Operation timeout
|
description: Operation timeout
|
||||||
|
|
||||||
- name: caddy_tls_error
|
- name: caddy_tls_error
|
||||||
pattern: "(acme|certificate|tls).*(error|fail|invalid|expired|renew)"
|
pattern: "(acme|certificate|tls).*(error|fail|invalid|expired|renew)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: web_proxy
|
||||||
description: Caddy TLS or certificate error
|
description: Caddy TLS or certificate error
|
||||||
|
|
||||||
- name: caddy_config_error
|
- name: caddy_config_error
|
||||||
pattern: "(config|caddyfile|directive).*(error|invalid|unknown|unrecognized)"
|
pattern: "(config|caddyfile|directive).*(error|invalid|unknown|unrecognized)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: web_proxy
|
||||||
description: Caddy configuration error
|
description: Caddy configuration error
|
||||||
|
|
||||||
- name: caddy_auth_error
|
- name: caddy_auth_error
|
||||||
pattern: "(forward_auth|basicauth|basic_auth).*(error|fail|denied|invalid|unreachable)"
|
pattern: "(forward_auth|basicauth|basic_auth).*(error|fail|denied|invalid|unreachable)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: web_proxy
|
||||||
description: Caddy authentication middleware failure
|
description: Caddy authentication middleware failure
|
||||||
|
|
||||||
- name: caddy_upstream_error
|
- name: caddy_upstream_error
|
||||||
pattern: "(upstream|backend|reverse.proxy).*(error|fail|unreachable|refused|timeout)"
|
pattern: "(upstream|backend|reverse.proxy).*(error|fail|unreachable|refused|timeout)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: web_proxy
|
||||||
description: Caddy upstream/backend failure
|
description: Caddy upstream/backend failure
|
||||||
|
|
||||||
- name: service_update
|
- name: service_update
|
||||||
pattern: "(upgraded?|updated?|installing|dpkg|apt|package).*(caddy|nginx|apache|proxy)"
|
pattern: "(upgraded?|updated?|installing|dpkg|apt|package).*(caddy|nginx|apache|proxy)"
|
||||||
severity: INFO
|
severity: INFO
|
||||||
|
domain: web_proxy
|
||||||
description: Web server package update detected
|
description: Web server package update detected
|
||||||
|
|
||||||
- name: power_failure
|
- name: power_failure
|
||||||
pattern: "(power (fail|loss|outage|cut)|ups|battery|shutdown.*power|lost power)"
|
pattern: "(power (fail|loss|outage|cut)|ups|battery|shutdown.*power|lost power)"
|
||||||
severity: CRITICAL
|
severity: CRITICAL
|
||||||
|
domain: power
|
||||||
description: Power failure or UPS event
|
description: Power failure or UPS event
|
||||||
|
|
||||||
- name: network_interface
|
- name: network_interface
|
||||||
pattern: "(eth[0-9]|ens[0-9]|enp[0-9]|wlan[0-9]).*(down|up|carrier|link)"
|
pattern: "(eth[0-9]|ens[0-9]|enp[0-9]|wlan[0-9]).*(down|up|carrier|link)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: networking
|
||||||
description: Network interface state change
|
description: Network interface state change
|
||||||
|
|
||||||
- name: ip_change
|
- name: ip_change
|
||||||
pattern: "(new ip|ip.*(changed|assigned|address)|dhcp.*(ack|offer|bound|renew))"
|
pattern: "(new ip|ip.*(changed|assigned|address)|dhcp.*(ack|offer|bound|renew))"
|
||||||
severity: INFO
|
severity: INFO
|
||||||
|
domain: networking
|
||||||
description: IP address change or DHCP event
|
description: IP address change or DHCP event
|
||||||
|
|
||||||
# ── System / journald patterns ─────────────────────────────────────────────
|
# ── System / journald patterns ─────────────────────────────────────────────
|
||||||
|
|
@ -86,46 +104,55 @@ patterns:
|
||||||
- name: systemd_fail
|
- name: systemd_fail
|
||||||
pattern: "(Failed to start|failed with result|entered failed state|start request repeated too quickly|Main process exited)"
|
pattern: "(Failed to start|failed with result|entered failed state|start request repeated too quickly|Main process exited)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: service_health
|
||||||
description: systemd service failed to start or crashed
|
description: systemd service failed to start or crashed
|
||||||
|
|
||||||
- name: oom_kill
|
- name: oom_kill
|
||||||
pattern: "(Killed process|oom.kill|oom_kill_process|Out of memory: Kill|memory cgroup out of memory)"
|
pattern: "(Killed process|oom.kill|oom_kill_process|Out of memory: Kill|memory cgroup out of memory)"
|
||||||
severity: CRITICAL
|
severity: CRITICAL
|
||||||
|
domain: memory
|
||||||
description: Kernel OOM killer terminated a process
|
description: Kernel OOM killer terminated a process
|
||||||
|
|
||||||
- name: disk_hw_error
|
- name: disk_hw_error
|
||||||
pattern: "(ata[0-9]|sd[a-z]|nvme[0-9]).*(error|failed|reset|timeout|exception|EH|FAILED COMMAND)"
|
pattern: "(ata[0-9]|sd[a-z]|nvme[0-9]).*(error|failed|reset|timeout|exception|EH|FAILED COMMAND)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: storage
|
||||||
description: Storage device hardware error or reset
|
description: Storage device hardware error or reset
|
||||||
|
|
||||||
- name: fs_error
|
- name: fs_error
|
||||||
pattern: "(EXT4-fs error|XFS.*error|BTRFS.*error|I/O error|blk_update_request.*error|buffer I/O error)"
|
pattern: "(EXT4-fs error|XFS.*error|BTRFS.*error|I/O error|blk_update_request.*error|buffer I/O error)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: storage
|
||||||
description: Filesystem or block I/O error
|
description: Filesystem or block I/O error
|
||||||
|
|
||||||
- name: kernel_error
|
- name: kernel_error
|
||||||
pattern: "(kernel: BUG|kernel panic|Oops:|general protection fault|Call Trace|RIP:.*[0-9a-f]{16})"
|
pattern: "(kernel: BUG|kernel panic|Oops:|general protection fault|Call Trace|RIP:.*[0-9a-f]{16})"
|
||||||
severity: CRITICAL
|
severity: CRITICAL
|
||||||
|
domain: kernel
|
||||||
description: Kernel bug, panic, or oops — system may be unstable
|
description: Kernel bug, panic, or oops — system may be unstable
|
||||||
|
|
||||||
- name: ssh_brute
|
- name: ssh_brute
|
||||||
pattern: "(Failed password|Invalid user|authentication failure|Connection closed by authenticating user).*(sshd|ssh)"
|
pattern: "(Failed password|Invalid user|authentication failure|Connection closed by authenticating user).*(sshd|ssh)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: auth
|
||||||
description: SSH authentication failure — possible brute force
|
description: SSH authentication failure — possible brute force
|
||||||
|
|
||||||
- name: container_crash
|
- name: container_crash
|
||||||
pattern: "(container.*exited|oci runtime.*error|podman.*error|docker.*error|container.*killed|OCI.*failed)"
|
pattern: "(container.*exited|oci runtime.*error|podman.*error|docker.*error|container.*killed|OCI.*failed)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: service_health
|
||||||
description: Container runtime error or unexpected exit
|
description: Container runtime error or unexpected exit
|
||||||
|
|
||||||
- name: smart_error
|
- name: smart_error
|
||||||
pattern: "(smartd|SMART.*error|reallocated sector|pending sector|uncorrectable sector|Current_Pending_Sector)"
|
pattern: "(smartd|SMART.*error|reallocated sector|pending sector|uncorrectable sector|Current_Pending_Sector)"
|
||||||
severity: CRITICAL
|
severity: CRITICAL
|
||||||
|
domain: storage
|
||||||
description: SMART disk health warning — potential drive failure
|
description: SMART disk health warning — potential drive failure
|
||||||
|
|
||||||
- name: nfs_error
|
- name: nfs_error
|
||||||
pattern: "(nfs.*error|nfs.*timeout|RPC.*timed out|nfs4.*server.*not responding|mount.*nfs.*failed)"
|
pattern: "(nfs.*error|nfs.*timeout|RPC.*timed out|nfs4.*server.*not responding|mount.*nfs.*failed)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: networking
|
||||||
description: NFS mount or RPC timeout
|
description: NFS mount or RPC timeout
|
||||||
|
|
||||||
# Add device/service-specific patterns below this line:
|
# Add device/service-specific patterns below this line:
|
||||||
|
|
@ -133,46 +160,55 @@ patterns:
|
||||||
- name: qbit_tracker_error
|
- name: qbit_tracker_error
|
||||||
pattern: "(tracker|announce).*(not working|error|fail|unreachable|timeout|refused|invalid)"
|
pattern: "(tracker|announce).*(not working|error|fail|unreachable|timeout|refused|invalid)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: media
|
||||||
description: qBittorrent tracker connection or announce failure
|
description: qBittorrent tracker connection or announce failure
|
||||||
|
|
||||||
- name: qbit_port_bind
|
- name: qbit_port_bind
|
||||||
pattern: "(couldn't? listen|bind.*fail|port.*in use|listening.*fail)"
|
pattern: "(couldn't? listen|bind.*fail|port.*in use|listening.*fail)"
|
||||||
severity: CRITICAL
|
severity: CRITICAL
|
||||||
|
domain: media
|
||||||
description: qBittorrent failed to bind listen port — firewall or port conflict
|
description: qBittorrent failed to bind listen port — firewall or port conflict
|
||||||
|
|
||||||
- name: qbit_disk_error
|
- name: qbit_disk_error
|
||||||
pattern: "(cannot (write|open|create)|disk.*error|i/o error|file.*fail|write.*fail)"
|
pattern: "(cannot (write|open|create)|disk.*error|i/o error|file.*fail|write.*fail)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: media
|
||||||
description: qBittorrent disk write or file access failure
|
description: qBittorrent disk write or file access failure
|
||||||
|
|
||||||
- name: qbit_hash_fail
|
- name: qbit_hash_fail
|
||||||
pattern: "(hash.*(check|fail|mismatch)|recheck|piece.*fail)"
|
pattern: "(hash.*(check|fail|mismatch)|recheck|piece.*fail)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: media
|
||||||
description: qBittorrent torrent hash verification failure — possible corrupt data
|
description: qBittorrent torrent hash verification failure — possible corrupt data
|
||||||
|
|
||||||
- name: qbit_peer_ban
|
- name: qbit_peer_ban
|
||||||
pattern: "(peer.*ban|banned.*peer|blocked.*peer)"
|
pattern: "(peer.*ban|banned.*peer|blocked.*peer)"
|
||||||
severity: INFO
|
severity: INFO
|
||||||
|
domain: media
|
||||||
description: qBittorrent peer banned (encryption enforcement or bad actor)
|
description: qBittorrent peer banned (encryption enforcement or bad actor)
|
||||||
|
|
||||||
- name: qbit_download_complete
|
- name: qbit_download_complete
|
||||||
pattern: "(download.*complet|torrent.*finish|has finished downloading)"
|
pattern: "(download.*complet|torrent.*finish|has finished downloading)"
|
||||||
severity: INFO
|
severity: INFO
|
||||||
|
domain: media
|
||||||
description: qBittorrent torrent download completed
|
description: qBittorrent torrent download completed
|
||||||
|
|
||||||
- name: qbit_ratio_limit
|
- name: qbit_ratio_limit
|
||||||
pattern: "(ratio.*reach|seeding.*limit|stop.*seeding|upload.*limit)"
|
pattern: "(ratio.*reach|seeding.*limit|stop.*seeding|upload.*limit)"
|
||||||
severity: INFO
|
severity: INFO
|
||||||
|
domain: media
|
||||||
description: qBittorrent seeding ratio or time limit reached
|
description: qBittorrent seeding ratio or time limit reached
|
||||||
|
|
||||||
- name: qbit_session_error
|
- name: qbit_session_error
|
||||||
pattern: "(session.*error|couldn't? resume|resume.*fail|torrent.*error)"
|
pattern: "(session.*error|couldn't? resume|resume.*fail|torrent.*error)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: media
|
||||||
description: qBittorrent session or resume data error
|
description: qBittorrent session or resume data error
|
||||||
|
|
||||||
- name: plex_eae_failure
|
- name: plex_eae_failure
|
||||||
pattern: "(EAE timeout|EAE not running|eac3_eae.*error reading output|Error submitting packet to decoder.*I/O error)"
|
pattern: "(EAE timeout|EAE not running|eac3_eae.*error reading output|Error submitting packet to decoder.*I/O error)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: media
|
||||||
description: Plex EasyAudioEncoder (EAC3 Dolby audio transcoder) crashed — service restart required
|
description: Plex EasyAudioEncoder (EAC3 Dolby audio transcoder) crashed — service restart required
|
||||||
|
|
||||||
# - name: avcx_device_error
|
# - name: avcx_device_error
|
||||||
|
|
@ -185,16 +221,19 @@ patterns:
|
||||||
- name: vpn_tunnel_fail
|
- name: vpn_tunnel_fail
|
||||||
pattern: "(wg-quick@|wireguard|spirit-city-tunnel|cf-orch-tunnel|cf-tunnel|openvpn|vpn).*(failed|error|exit.code|timeout|connection reset)"
|
pattern: "(wg-quick@|wireguard|spirit-city-tunnel|cf-orch-tunnel|cf-tunnel|openvpn|vpn).*(failed|error|exit.code|timeout|connection reset)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: networking
|
||||||
description: VPN or WireGuard tunnel service failed — remote node may be unreachable
|
description: VPN or WireGuard tunnel service failed — remote node may be unreachable
|
||||||
|
|
||||||
- name: vpn_handshake
|
- name: vpn_handshake
|
||||||
pattern: "(handshake|peer.*allowed|WireGuard|wg-quick).*(initiating|complete|timeout|fail|retrying)"
|
pattern: "(handshake|peer.*allowed|WireGuard|wg-quick).*(initiating|complete|timeout|fail|retrying)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: networking
|
||||||
description: WireGuard peer handshake event — track for timeout/retry patterns
|
description: WireGuard peer handshake event — track for timeout/retry patterns
|
||||||
|
|
||||||
- name: dns_degraded
|
- name: dns_degraded
|
||||||
pattern: "(degraded feature set|DNS.*fall.?back|resolver.*fail|NXDOMAIN|DNS.*timeout|SERVFAIL)"
|
pattern: "(degraded feature set|DNS.*fall.?back|resolver.*fail|NXDOMAIN|DNS.*timeout|SERVFAIL)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: networking
|
||||||
description: DNS resolver degradation or fallback — often precedes connectivity failures
|
description: DNS resolver degradation or fallback — often precedes connectivity failures
|
||||||
|
|
||||||
# ── GPU / NVIDIA driver patterns ───────────────────────────────────────────
|
# ── GPU / NVIDIA driver patterns ───────────────────────────────────────────
|
||||||
|
|
@ -202,16 +241,19 @@ patterns:
|
||||||
- name: nvidia_api_mismatch
|
- name: nvidia_api_mismatch
|
||||||
pattern: "(NVRM: API mismatch|nvidia.*version mismatch|driver.*mismatch|kernel module.*mismatch)"
|
pattern: "(NVRM: API mismatch|nvidia.*version mismatch|driver.*mismatch|kernel module.*mismatch)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: gpu
|
||||||
description: NVIDIA kernel module version does not match userspace driver — GPU ops will fail until driver reinstalled
|
description: NVIDIA kernel module version does not match userspace driver — GPU ops will fail until driver reinstalled
|
||||||
|
|
||||||
- name: nvidia_xid
|
- name: nvidia_xid
|
||||||
pattern: "(NVRM: Xid|Xid.*(error|critical)|GPU.*Xid)"
|
pattern: "(NVRM: Xid|Xid.*(error|critical)|GPU.*Xid)"
|
||||||
severity: CRITICAL
|
severity: CRITICAL
|
||||||
|
domain: gpu
|
||||||
description: NVIDIA Xid error — GPU hardware fault or driver crash (check nvidia-smi error code)
|
description: NVIDIA Xid error — GPU hardware fault or driver crash (check nvidia-smi error code)
|
||||||
|
|
||||||
- name: nvidia_gpu_reset
|
- name: nvidia_gpu_reset
|
||||||
pattern: "(nvidia.*reset|GPU.*reset|NVRM.*reset|nvml.*error|NVLink.*fail)"
|
pattern: "(nvidia.*reset|GPU.*reset|NVRM.*reset|nvml.*error|NVLink.*fail)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: gpu
|
||||||
description: NVIDIA GPU reset or NVLink fault — possible hardware instability
|
description: NVIDIA GPU reset or NVLink fault — possible hardware instability
|
||||||
|
|
||||||
# ── Power / thermal patterns ───────────────────────────────────────────────
|
# ── Power / thermal patterns ───────────────────────────────────────────────
|
||||||
|
|
@ -219,14 +261,17 @@ patterns:
|
||||||
- name: acpi_error
|
- name: acpi_error
|
||||||
pattern: "(ACPI.*failed|ACPI.*error|ACPI.*_DSM|acpi.*_PPC|ACPI BIOS Error)"
|
pattern: "(ACPI.*failed|ACPI.*error|ACPI.*_DSM|acpi.*_PPC|ACPI BIOS Error)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: kernel
|
||||||
description: ACPI firmware evaluation failure — often harmless but can indicate BIOS/power management issues
|
description: ACPI firmware evaluation failure — often harmless but can indicate BIOS/power management issues
|
||||||
|
|
||||||
- name: thermal_throttle
|
- name: thermal_throttle
|
||||||
pattern: "(CPU.*throttl|thermal throttl|Package temp|TjMax|temperature.*critical|No RAPL|RAPL.*not available)"
|
pattern: "(CPU.*throttl|thermal throttl|Package temp|TjMax|temperature.*critical|No RAPL|RAPL.*not available)"
|
||||||
severity: WARN
|
severity: WARN
|
||||||
|
domain: power
|
||||||
description: CPU/GPU thermal throttling or thermal management subsystem unavailable
|
description: CPU/GPU thermal throttling or thermal management subsystem unavailable
|
||||||
|
|
||||||
- name: undervoltage
|
- name: undervoltage
|
||||||
pattern: "(under.?voltage|brownout|voltage.*(low|critical)|power supply.*insufficient)"
|
pattern: "(under.?voltage|brownout|voltage.*(low|critical)|power supply.*insufficient)"
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
|
domain: power
|
||||||
description: Undervoltage event — instability risk, check PSU and cable connections
|
description: Undervoltage event — instability risk, check PSU and cable connections
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,8 @@ def client(tmp_path):
|
||||||
patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \
|
patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \
|
||||||
patch.object(rest_module, "INCIDENTS_DB_PATH", tmp_path / "incidents.db"), \
|
patch.object(rest_module, "INCIDENTS_DB_PATH", tmp_path / "incidents.db"), \
|
||||||
patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \
|
patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \
|
||||||
patch.object(rest_module, "_compiled_patterns", []):
|
patch.object(rest_module, "_compiled_patterns", []), \
|
||||||
|
patch.object(rest_module, "_pattern_domain", {}):
|
||||||
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
||||||
yield c
|
yield c
|
||||||
|
|
||||||
|
|
@ -46,7 +47,8 @@ def client_with_candidate(tmp_path):
|
||||||
patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \
|
patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \
|
||||||
patch.object(rest_module, "INCIDENTS_DB_PATH", tmp_path / "incidents.db"), \
|
patch.object(rest_module, "INCIDENTS_DB_PATH", tmp_path / "incidents.db"), \
|
||||||
patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \
|
patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \
|
||||||
patch.object(rest_module, "_compiled_patterns", []):
|
patch.object(rest_module, "_compiled_patterns", []), \
|
||||||
|
patch.object(rest_module, "_pattern_domain", {}):
|
||||||
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
||||||
yield c, cid
|
yield c, cid
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -260,8 +260,10 @@ class TestEndpoint:
|
||||||
ensure_schema(db)
|
ensure_schema(db)
|
||||||
|
|
||||||
with patch.object(rest_module, "DB_PATH", db), \
|
with patch.object(rest_module, "DB_PATH", db), \
|
||||||
|
patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \
|
||||||
patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \
|
patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \
|
||||||
patch.object(rest_module, "_compiled_patterns", []):
|
patch.object(rest_module, "_compiled_patterns", []), \
|
||||||
|
patch.object(rest_module, "_pattern_domain", {}):
|
||||||
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
||||||
yield c
|
yield c
|
||||||
|
|
||||||
|
|
@ -284,8 +286,10 @@ class TestEndpoint:
|
||||||
prefs_path.write_text(_json.dumps({"tautulli_token": "secret"}))
|
prefs_path.write_text(_json.dumps({"tautulli_token": "secret"}))
|
||||||
|
|
||||||
with patch.object(rest_module, "DB_PATH", db), \
|
with patch.object(rest_module, "DB_PATH", db), \
|
||||||
|
patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \
|
||||||
patch.object(rest_module, "PREFS_PATH", prefs_path), \
|
patch.object(rest_module, "PREFS_PATH", prefs_path), \
|
||||||
patch.object(rest_module, "_compiled_patterns", []):
|
patch.object(rest_module, "_compiled_patterns", []), \
|
||||||
|
patch.object(rest_module, "_pattern_domain", {}):
|
||||||
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
||||||
resp = c.post(
|
resp = c.post(
|
||||||
"/turnstone/api/glean/tautulli",
|
"/turnstone/api/glean/tautulli",
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue