"""Environment auto-discovery for the onboarding wizard.

All checks are best-effort — every function returns an empty list on failure
so the wizard degrades gracefully in containers, VMs, and minimal environments.
"""

from __future__ import annotations

import json
import logging
import os
import re
import shutil
import subprocess
import time
from collections.abc import Iterator
from pathlib import Path
from typing import Any

logger = logging.getLogger(__name__)

# Common log file candidates: (id, path, description)
_KNOWN_PATHS: list[tuple[str, str, str]] = [
    ("syslog", "/var/log/syslog", "System syslog (Debian/Ubuntu)"),
    ("syslog", "/var/log/messages", "System messages (RHEL/Rocky)"),
    ("auth", "/var/log/auth.log", "Auth log"),
    ("kern", "/var/log/kern.log", "Kernel log"),
    ("nginx-access", "/var/log/nginx/access.log", "Nginx access log"),
    ("nginx-error", "/var/log/nginx/error.log", "Nginx error log"),
    ("apache", "/var/log/apache2/access.log", "Apache access log"),
    ("apache-error", "/var/log/apache2/error.log", "Apache error log"),
    ("caddy", "/var/log/caddy/access.log", "Caddy access log"),
    ("docker-daemon", "/var/log/docker.log", "Docker daemon log"),
    ("fail2ban", "/var/log/fail2ban.log", "Fail2ban log"),
    ("ufw", "/var/log/ufw.log", "UFW firewall log"),
]


def _run(cmd: list[str], timeout: float = 5.0) -> str | None:
    """Run *cmd* and return its stdout, or None on any failure.

    A failure is a missing/non-executable binary, a timeout, undecodable
    output, or a non-zero exit status. Failures are logged at debug level
    rather than raised so discovery stays best-effort.
    """
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout,
            check=False,
        )
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # OSError: binary missing / not executable; SubprocessError: timeout;
        # ValueError: bad arguments or undecodable output.
        logger.debug("Command %s failed: %s", cmd, exc)
        return None
    if result.returncode != 0:
        logger.debug("Command %s exited with status %s", cmd, result.returncode)
        return None
    return result.stdout


def discover_journald() -> list[dict[str, Any]]:
    """Return a journald source candidate if journalctl is available.

    The candidate is labelled with the output of the ``hostname`` command,
    falling back to ``localhost`` when that command fails or prints nothing.
    """
    if not shutil.which("journalctl"):
        return []
    # Strip before applying the fallback so whitespace-only output does not
    # produce an empty host name.
    hostname = (_run(["hostname"]) or "").strip() or "localhost"
    return [{
        "type": "journald",
        "id": f"journal:{hostname}",
        "label": f"System journal ({hostname})",
        "description": "All systemd journal output from this host",
        "available": True,
    }]


def _container_name(record: Any) -> str | None:
    """Extract a container name from one parsed ``ps`` JSON record.

    Returns None when *record* is not a JSON object (the line is skipped),
    and ``"unknown"`` when the object carries no usable name or ID.
    """
    if not isinstance(record, dict):
        return None
    name = record.get("Names") or record.get("Name") or record.get("ID")
    # podman returns a list for Names
    if isinstance(name, list):
        name = name[0] if name else None
    if not name:
        return "unknown"
    return str(name).lstrip("/") or "unknown"


def discover_docker() -> list[dict[str, Any]]:
    """Return container candidates from the first runtime that lists any.

    Tries ``docker`` and then ``podman``; a runtime is skipped when its
    binary is absent, its ``ps`` command fails, or it reports no containers.
    Output lines that are not JSON objects are ignored.
    """
    for runtime in ("docker", "podman"):
        if not shutil.which(runtime):
            continue
        out = _run([runtime, "ps", "--format", "{{json .}}"])
        if out is None:
            continue
        containers: list[dict[str, Any]] = []
        for line in out.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            name = _container_name(record)
            if name is None:
                continue
            containers.append({
                "type": "docker",
                "id": f"{runtime}:{name}",
                "label": f"{runtime.capitalize()} — {name}",
                "description": f"Container log stream for {name}",
                "container": name,
                "runtime": runtime,
                "available": True,
            })
        if containers:
            return containers
    return []


def discover_files() -> list[dict[str, Any]]:
    """Return file-based source candidates for well-known log paths.

    Only paths that exist are returned, and each source ID appears at most
    once: when several known paths share an ID, the first existing one wins.
    """
    found: list[dict[str, Any]] = []
    seen_ids: set[str] = set()
    for source_id, path, description in _KNOWN_PATHS:
        if not os.path.exists(path):
            continue
        # deduplicate when both syslog and messages exist — take first match
        if source_id in seen_ids:
            continue
        seen_ids.add(source_id)
        found.append({
            "type": "file",
            "id": source_id,
            "label": description,
            "path": path,
            "description": f"Read from {path}",
            "available": True,
        })
    return found


def discover_all() -> dict[str, Any]:
    """Run all discovery checks and return a structured candidate list.

    The result holds the combined ``candidates`` list (journald, then
    containers, then files) plus ``has_journald`` / ``has_docker`` /
    ``has_files`` flags telling whether each type produced a candidate.
    """
    candidates: list[dict[str, Any]] = []
    candidates.extend(discover_journald())
    candidates.extend(discover_docker())
    candidates.extend(discover_files())
    found_types = {c["type"] for c in candidates}
    return {
        "candidates": candidates,
        "has_journald": "journald" in found_types,
        "has_docker": "docker" in found_types,
        "has_files": "file" in found_types,
    }


# Values matching this pattern (and not reserved below) are safe to emit as
# unquoted YAML scalars: no spaces, no '#', no leading indicator characters.
_YAML_PLAIN_RE = re.compile(r"[A-Za-z_/][A-Za-z0-9_/.:@+-]*")

# Words a YAML parser may read as null/boolean instead of as a string.
_YAML_RESERVED = frozenset(
    {"null", "true", "false", "yes", "no", "on", "off", "y", "n"}
)


def _yaml_scalar(value: Any) -> str:
    """Render *value* as a YAML scalar, double-quoting it when needed.

    Ordinary identifiers and paths are emitted unquoted; anything else
    (spaces, ``#``, a trailing ``:``, leading digits, reserved words, the
    empty string) is emitted as a JSON-style double-quoted string so it
    round-trips as a string.
    """
    text = str(value)
    if (
        _YAML_PLAIN_RE.fullmatch(text)
        and not text.endswith(":")
        and text.lower() not in _YAML_RESERVED
    ):
        return text
    return json.dumps(text, ensure_ascii=False)


def build_sources_yaml(selected: list[dict[str, Any]]) -> str:
    """Generate sources.yaml content from a list of selected candidates.

    Each item must have: type, id, and type-specific fields (path, container,
    etc.). A missing type is treated as a file source; a missing id becomes
    ``unknown``. Journald sources emit an optional ``unit``; docker sources
    emit ``runtime`` (default ``docker``) and ``container`` (default: the
    part of the id after the last ``:``); every other type emits ``path``.
    """
    lines = [
        "# Turnstone log sources — generated by the setup wizard.",
        "# Edit this file to add, remove, or modify sources.",
        "sources:",
    ]
    for src in selected:
        src_type = src.get("type", "file")
        src_id = str(src.get("id") or "unknown")
        # Keys of a list item must be indented deeper than its dash.
        lines.append(f"  - id: {_yaml_scalar(src_id)}")
        if src_type == "journald":
            lines.append("    type: journald")
            unit = src.get("unit")
            if unit:
                lines.append(f"    unit: {_yaml_scalar(unit)}")
        elif src_type == "docker":
            runtime = src.get("runtime", "docker")
            container = src.get("container", src_id.split(":")[-1])
            lines.append("    type: docker")
            lines.append(f"    runtime: {_yaml_scalar(runtime)}")
            lines.append(f"    container: {_yaml_scalar(container)}")
        else:
            lines.append(f"    path: {_yaml_scalar(src.get('path', ''))}")
    return "\n".join(lines) + "\n"


def validate_source(src: dict[str, Any]) -> str | None:
    """Return an error string if the source definition is invalid, else None.

    Checks that ``id`` is set, that file sources (the default type) have a
    ``path``, and that docker sources have a ``container``.
    """
    if not src.get("id"):
        return "Source is missing 'id'"
    src_type = src.get("type", "file")
    if src_type == "file" and not src.get("path"):
        return f"File source '{src['id']}' is missing 'path'"
    if src_type == "docker" and not src.get("container"):
        return f"Docker source '{src['id']}' is missing 'container'"
    return None


# Extensions considered as log files in the filesystem scanner.
_LOG_EXTENSIONS = {"", ".log", ".txt", ".out", ".err"}

# Max file size to consider (500 MB).
_MAX_SIZE = 500 * 1024 * 1024

# Upper bound of the preferred size band (50 MB); larger files score lower.
_PREFERRED_MAX_SIZE = 50 * 1024 * 1024

# Recency window in days — the recency score falls linearly from 1 to 0 over
# this span, so files older than this score 0. (Despite the name, this is not
# an exponential half-life.)
_RECENCY_HALFLIFE_DAYS = 30


def _path_to_source_id(path: Path) -> str:
    """Convert an absolute path to a kebab-case source ID (max 64 chars)."""
    raw = re.sub(r"[^a-zA-Z0-9]+", "-", str(path)).strip("-").lower()
    # Truncation can land right after a separator; drop the dangling hyphen.
    return raw[:64].rstrip("-")


def _iter_log_files(
    root: Path, max_depth: int
) -> Iterator[tuple[Path, os.stat_result]]:
    """Yield ``(path, stat)`` for candidate log files under *root*.

    Skips dot-entries, symlinks, non-regular files, and files whose extension
    is not in ``_LOG_EXTENSIONS`` (which also excludes compressed archives
    such as ``.gz``). *root* is depth 0; directories deeper than *max_depth*
    are not entered. Unreadable directories and entries are skipped silently.
    """
    if max_depth < 0:
        return
    pending: list[tuple[Path, int]] = [(root, 0)]
    while pending:
        directory, depth = pending.pop()
        try:
            with os.scandir(directory) as scanner:
                entries = list(scanner)
        except OSError:
            continue
        for entry in entries:
            if entry.name.startswith("."):
                continue
            try:
                if entry.is_symlink():
                    continue
                is_dir = entry.is_dir(follow_symlinks=False)
                is_file = not is_dir and entry.is_file(follow_symlinks=False)
            except OSError:
                continue
            child = directory / entry.name
            if is_dir:
                if depth < max_depth:
                    pending.append((child, depth + 1))
                continue
            if not is_file:
                continue
            if child.suffix.lower() not in _LOG_EXTENSIONS:
                continue
            try:
                stat = entry.stat(follow_symlinks=False)
            except OSError:
                continue
            yield child, stat


def _score_log_file(
    size: int,
    mtime: float,
    now: float,
    path_lower: str,
    query_stems: list[str],
) -> float:
    """Combine recency, size, and keyword sub-scores into one 0-1 score."""
    age_days = (now - mtime) / 86400
    # Clamp both ends: a future mtime must not push the score above 1.
    recency = min(1.0, max(0.0, 1.0 - age_days / _RECENCY_HALFLIFE_DAYS))

    if size < 1024:
        size_score = 0.3
    elif size <= _PREFERRED_MAX_SIZE:
        size_score = 1.0
    else:
        # Large files: linear decay from 50 MB to 500 MB
        size_score = max(0.1, 1.0 - (size - _PREFERRED_MAX_SIZE) / _MAX_SIZE)

    if not query_stems:
        return recency * 0.7 + size_score * 0.3

    matches = sum(1 for stem in query_stems if stem in path_lower)
    keyword_score = min(1.0, matches / len(query_stems))
    return recency * 0.4 + size_score * 0.2 + keyword_score * 0.4


def scan_log_directories(
    query: str | None = None,
    dirs: list[str] | None = None,
    max_depth: int = 4,
    max_results: int = 25,
) -> list[dict[str, Any]]:
    """Scan filesystem directories for log files ranked by recency and keyword match.

    Scoring weights:
    - Recency (0-1): 1.0 for a file modified now, decaying linearly to 0 at
      30 days
    - Size (0-1): prefer 1 KB – 50 MB; tiny or huge files score low
    - Keyword (0-1): fraction of query words (3+ characters) found as
      substrings of the lower-cased path

    Without a query the score is 0.7 * recency + 0.3 * size; with one it is
    0.4 * recency + 0.2 * size + 0.4 * keyword.

    Args:
        query: Optional free-text hint; split on non-word characters.
        dirs: Directories to scan; defaults to ``/var/log`` and ``/opt``.
        max_depth: Deepest directory level to enter (each root is level 0).
        max_results: Maximum number of candidates returned; values below 1
            return an empty list.

    Returns up to *max_results* candidates sorted by descending score, ties
    broken by path. Empty files, files over 500 MB, and unreadable files are
    excluded, and a file reachable from several *dirs* is reported once.
    """
    if dirs is None:
        dirs = ["/var/log", "/opt"]

    now = time.time()
    query_stems: list[str] = []
    if query:
        query_stems = [w.lower() for w in re.split(r"\W+", query) if len(w) >= 3]

    candidates: list[dict[str, Any]] = []
    seen_paths: set[str] = set()
    for directory in dirs:
        for path, stat in _iter_log_files(Path(directory), max_depth):
            path_str = str(path)
            if path_str in seen_paths:
                continue
            seen_paths.add(path_str)
            if stat.st_size == 0 or stat.st_size > _MAX_SIZE:
                continue
            if not os.access(path, os.R_OK):
                continue
            total = _score_log_file(
                stat.st_size, stat.st_mtime, now, path_str.lower(), query_stems
            )
            candidates.append({
                "type": "file",
                "id": _path_to_source_id(path),
                "path": path_str,
                "label": path.name,
                "size_bytes": stat.st_size,
                "mtime": stat.st_mtime,
                "score": round(total, 3),
                "available": True,
            })

    # Secondary key keeps the order of equal scores deterministic.
    candidates.sort(key=lambda c: (-c["score"], c["path"]))
    return candidates[:max(0, max_results)]