fix(glean): add timeout=30s to all pipeline DB connections; add --force flag; new patterns
pipeline.py:
- Add timeout=30.0 to the four sqlite3.connect() calls that lacked it
(ensure_schema, _glean_files, glean_ssh_source, glean_sources), so all
5 connections now use it. Previously only ensure_context_schema() had
it. The main glean writers would fail immediately under lock contention
from the live watcher or concurrent manual glean runs.
glean_corpus.py:
- Add --force flag (passed through to glean_sources/glean_file/glean_dir).
Without it, files whose fingerprint is unchanged are silently skipped,
even after pattern updates, so new patterns are never applied to them.
Run with --force after editing patterns/default.yaml.
patterns/default.yaml:
- Add 9 new patterns for Muninn / cluster-wide coverage:
vpn_tunnel_fail — WireGuard/tunnel service failures
vpn_handshake — WireGuard peer handshake events
dns_degraded — systemd-resolved DNS fallback/degradation
nvidia_api_mismatch — NVIDIA kernel module vs userspace mismatch
nvidia_xid — NVIDIA Xid GPU hardware faults
nvidia_gpu_reset — NVIDIA GPU reset / NVLink faults
acpi_error — ACPI firmware _DSM evaluation failures
thermal_throttle — CPU/GPU thermal throttling / RAPL unavailable
undervoltage — PSU undervoltage / brownout events
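- Sync from /devl/turnstone-cluster/patterns/default.yaml (authoritative
live copy updated first; repo copy was stale). Note: the sync also
removes three patterns that existed only in the repo copy:
service_crash_loop, pkg_daemon_restart, and ssh_forward_conflict.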
This commit is contained in:
parent
27a1bea0f7
commit
ee39ffbd44
3 changed files with 71 additions and 28 deletions
|
|
@ -165,7 +165,7 @@ CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
|
||||||
|
|
||||||
def ensure_schema(db_path: Path) -> None:
|
def ensure_schema(db_path: Path) -> None:
|
||||||
"""Create all tables and apply additive migrations. Safe to call on every startup."""
|
"""Create all tables and apply additive migrations. Safe to call on every startup."""
|
||||||
conn = sqlite3.connect(str(db_path))
|
conn = sqlite3.connect(str(db_path), timeout=30.0)
|
||||||
conn.execute("PRAGMA journal_mode=WAL")
|
conn.execute("PRAGMA journal_mode=WAL")
|
||||||
conn.executescript(_SCHEMA)
|
conn.executescript(_SCHEMA)
|
||||||
# Additive column migrations — ALTER TABLE silently skips if column exists
|
# Additive column migrations — ALTER TABLE silently skips if column exists
|
||||||
|
|
@ -338,7 +338,7 @@ def _glean_files(
|
||||||
ingest_time = now_iso()
|
ingest_time = now_iso()
|
||||||
source_id_map = source_id_map or {}
|
source_id_map = source_id_map or {}
|
||||||
|
|
||||||
conn = sqlite3.connect(str(db_path))
|
conn = sqlite3.connect(str(db_path), timeout=30.0)
|
||||||
conn.execute("PRAGMA journal_mode=WAL")
|
conn.execute("PRAGMA journal_mode=WAL")
|
||||||
conn.executescript(_SCHEMA)
|
conn.executescript(_SCHEMA)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
@ -521,7 +521,7 @@ def glean_ssh_source(
|
||||||
compiled = _compile(load_patterns(effective_pattern_file))
|
compiled = _compile(load_patterns(effective_pattern_file))
|
||||||
ingest_time = now_iso()
|
ingest_time = now_iso()
|
||||||
|
|
||||||
conn = sqlite3.connect(str(db_path))
|
conn = sqlite3.connect(str(db_path), timeout=30.0)
|
||||||
conn.execute("PRAGMA journal_mode=WAL")
|
conn.execute("PRAGMA journal_mode=WAL")
|
||||||
conn.executescript(_SCHEMA)
|
conn.executescript(_SCHEMA)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
@ -643,7 +643,7 @@ def glean_sources(
|
||||||
compiled = _compile(load_patterns(effective_pattern_file))
|
compiled = _compile(load_patterns(effective_pattern_file))
|
||||||
ingest_time = now_iso()
|
ingest_time = now_iso()
|
||||||
|
|
||||||
conn = sqlite3.connect(str(db_path))
|
conn = sqlite3.connect(str(db_path), timeout=30.0)
|
||||||
conn.execute("PRAGMA journal_mode=WAL")
|
conn.execute("PRAGMA journal_mode=WAL")
|
||||||
conn.executescript(_SCHEMA)
|
conn.executescript(_SCHEMA)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
# Turnstone pattern library — named regex patterns for log tagging at glean time.
|
# Turnstone pattern library — named regex patterns for log tagging at ingest time.
|
||||||
# Each matched pattern name is stored on RetrievedEntry.matched_patterns and
|
# Each matched pattern name is stored on RetrievedEntry.matched_patterns and
|
||||||
# used to boost retrieval relevance for diagnostic queries.
|
# used to boost retrieval relevance for diagnostic queries.
|
||||||
#
|
#
|
||||||
|
|
@ -128,21 +128,6 @@ patterns:
|
||||||
severity: ERROR
|
severity: ERROR
|
||||||
description: NFS mount or RPC timeout
|
description: NFS mount or RPC timeout
|
||||||
|
|
||||||
- name: service_crash_loop
|
|
||||||
pattern: "(restart counter is at [0-9]|start request repeated too quickly|Restart limit hit)"
|
|
||||||
severity: WARN
|
|
||||||
description: systemd service crash-looping — restart counter incrementing or rate-limit hit; check for DNS resolution failures, missing dependencies, or bad config
|
|
||||||
|
|
||||||
- name: pkg_daemon_restart
|
|
||||||
pattern: "(invoke-rc\\.d|Unit process.*(apt-get|dpkg|preinst).*remains running after unit stopped|Stopped.*service.*openssh|Restarting.*OpenBSD Secure Shell)"
|
|
||||||
severity: WARN
|
|
||||||
description: Package manager restarted a system daemon — active SSH or service sessions may have been interrupted
|
|
||||||
|
|
||||||
- name: ssh_forward_conflict
|
|
||||||
pattern: "(channel_setup_fwd_listener_tcpip: cannot listen to port|error: bind.*Address already in use)"
|
|
||||||
severity: WARN
|
|
||||||
description: SSH port-forward conflict — previous session port still bound; stale sessions accumulating or rapid reconnects
|
|
||||||
|
|
||||||
# Add device/service-specific patterns below this line:
|
# Add device/service-specific patterns below this line:
|
||||||
|
|
||||||
- name: qbit_tracker_error
|
- name: qbit_tracker_error
|
||||||
|
|
@ -194,3 +179,54 @@ patterns:
|
||||||
# pattern: "ERR-\d{4}"
|
# pattern: "ERR-\d{4}"
|
||||||
# severity: ERROR
|
# severity: ERROR
|
||||||
# description: AVCX device error code
|
# description: AVCX device error code
|
||||||
|
|
||||||
|
# ── VPN / tunnel patterns ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
- name: vpn_tunnel_fail
|
||||||
|
pattern: "(wg-quick@|wireguard|spirit-city-tunnel|cf-orch-tunnel|cf-tunnel|openvpn|vpn).*(failed|error|exit.code|timeout|connection reset)"
|
||||||
|
severity: ERROR
|
||||||
|
description: VPN or WireGuard tunnel service failed — remote node may be unreachable
|
||||||
|
|
||||||
|
- name: vpn_handshake
|
||||||
|
pattern: "(handshake|peer.*allowed|WireGuard|wg-quick).*(initiating|complete|timeout|fail|retrying)"
|
||||||
|
severity: WARN
|
||||||
|
description: WireGuard peer handshake event — track for timeout/retry patterns
|
||||||
|
|
||||||
|
- name: dns_degraded
|
||||||
|
pattern: "(degraded feature set|DNS.*fall.?back|resolver.*fail|NXDOMAIN|DNS.*timeout|SERVFAIL)"
|
||||||
|
severity: WARN
|
||||||
|
description: DNS resolver degradation or fallback — often precedes connectivity failures
|
||||||
|
|
||||||
|
# ── GPU / NVIDIA driver patterns ───────────────────────────────────────────
|
||||||
|
|
||||||
|
- name: nvidia_api_mismatch
|
||||||
|
pattern: "(NVRM: API mismatch|nvidia.*version mismatch|driver.*mismatch|kernel module.*mismatch)"
|
||||||
|
severity: ERROR
|
||||||
|
description: NVIDIA kernel module version does not match userspace driver — GPU ops will fail until driver reinstalled
|
||||||
|
|
||||||
|
- name: nvidia_xid
|
||||||
|
pattern: "(NVRM: Xid|Xid.*(error|critical)|GPU.*Xid)"
|
||||||
|
severity: CRITICAL
|
||||||
|
description: NVIDIA Xid error — GPU hardware fault or driver crash (check nvidia-smi error code)
|
||||||
|
|
||||||
|
- name: nvidia_gpu_reset
|
||||||
|
pattern: "(nvidia.*reset|GPU.*reset|NVRM.*reset|nvml.*error|NVLink.*fail)"
|
||||||
|
severity: ERROR
|
||||||
|
description: NVIDIA GPU reset or NVLink fault — possible hardware instability
|
||||||
|
|
||||||
|
# ── Power / thermal patterns ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
- name: acpi_error
|
||||||
|
pattern: "(ACPI.*failed|ACPI.*error|ACPI.*_DSM|acpi.*_PPC|ACPI BIOS Error)"
|
||||||
|
severity: WARN
|
||||||
|
description: ACPI firmware evaluation failure — often harmless but can indicate BIOS/power management issues
|
||||||
|
|
||||||
|
- name: thermal_throttle
|
||||||
|
pattern: "(CPU.*throttl|thermal throttl|Package temp|TjMax|temperature.*critical|No RAPL|RAPL.*not available)"
|
||||||
|
severity: WARN
|
||||||
|
description: CPU/GPU thermal throttling or thermal management subsystem unavailable
|
||||||
|
|
||||||
|
- name: undervoltage
|
||||||
|
pattern: "(under.?voltage|brownout|voltage.*(low|critical)|power supply.*insufficient)"
|
||||||
|
severity: ERROR
|
||||||
|
description: Undervoltage event — instability risk, check PSU and cable connections
|
||||||
|
|
|
||||||
|
|
@ -2,10 +2,14 @@
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
# Single file or directory (legacy)
|
# Single file or directory (legacy)
|
||||||
python scripts/glean_corpus.py <file_or_dir> [db_path]
|
python scripts/glean_corpus.py <file_or_dir> [db_path] [--force]
|
||||||
|
|
||||||
# Sources config (multi-service)
|
# Sources config (multi-service)
|
||||||
python scripts/glean_corpus.py --sources <sources.yaml> [--db <db_path>]
|
python scripts/glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--force Bypass fingerprint checks and re-glean all files, re-applying
|
||||||
|
all patterns. Use after updating patterns/default.yaml.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
@ -33,15 +37,18 @@ if __name__ == "__main__":
|
||||||
if not args:
|
if not args:
|
||||||
print(
|
print(
|
||||||
"Usage:\n"
|
"Usage:\n"
|
||||||
" glean_corpus.py <file_or_dir> [db_path]\n"
|
" glean_corpus.py <file_or_dir> [db_path] [--force]\n"
|
||||||
" glean_corpus.py --sources <sources.yaml> [--db <db_path>]",
|
" glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]",
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
force = "--force" in args
|
||||||
|
args = [a for a in args if a != "--force"]
|
||||||
|
|
||||||
if args[0] == "--sources":
|
if args[0] == "--sources":
|
||||||
if len(args) < 2:
|
if len(args) < 2:
|
||||||
print("Usage: glean_corpus.py --sources <sources.yaml> [--db <db_path>]", file=sys.stderr)
|
print("Usage: glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
sources_file = Path(args[1])
|
sources_file = Path(args[1])
|
||||||
db_path = Path("data/turnstone.db")
|
db_path = Path("data/turnstone.db")
|
||||||
|
|
@ -49,7 +56,7 @@ if __name__ == "__main__":
|
||||||
db_path = Path(args[args.index("--db") + 1])
|
db_path = Path(args[args.index("--db") + 1])
|
||||||
db_path.parent.mkdir(parents=True, exist_ok=True)
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
print(f"Gleaning sources from {sources_file} → {db_path}")
|
print(f"Gleaning sources from {sources_file} → {db_path}")
|
||||||
stats = glean_sources(sources_file, db_path)
|
stats = glean_sources(sources_file, db_path, force=force)
|
||||||
_print_stats(stats)
|
_print_stats(stats)
|
||||||
else:
|
else:
|
||||||
target = Path(args[0])
|
target = Path(args[0])
|
||||||
|
|
@ -57,9 +64,9 @@ if __name__ == "__main__":
|
||||||
db_path.parent.mkdir(parents=True, exist_ok=True)
|
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
print(f"Gleaning {target} → {db_path}")
|
print(f"Gleaning {target} → {db_path}")
|
||||||
if target.is_file():
|
if target.is_file():
|
||||||
stats = glean_file(target, db_path)
|
stats = glean_file(target, db_path, force=force)
|
||||||
elif target.is_dir():
|
elif target.is_dir():
|
||||||
stats = glean_dir(target, db_path)
|
stats = glean_dir(target, db_path, force=force)
|
||||||
else:
|
else:
|
||||||
print(f"Error: {target} is not a file or directory", file=sys.stderr)
|
print(f"Error: {target} is not a file or directory", file=sys.stderr)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue