@router.get("/api/setup/scan")
def setup_scan(
    query: str = "",
    dirs: str = "",
    max_results: int = 25,
) -> dict:
    """Scan the filesystem for log files ranked by recency and keyword match.

    Query parameters:
        query: Optional free-text problem description (e.g. 'nginx 502',
            'docker timeout', 'ssh refused'). When given, files whose path
            contains one of its words are ranked higher.
        dirs: Optional comma-separated list of directories that replaces the
            scanner's defaults (/var/log, /opt). Blank items are ignored.
        max_results: Maximum number of candidates to return. Clamped to the
            range 0-100.

    Returns:
        ``{"candidates": [...], "query": <query or None>}`` where
        ``candidates`` is the list produced by ``scan_log_directories``.
    """
    # NOTE(review): `dirs` is caller-controlled, so this endpoint lets any API
    # client enumerate readable files under arbitrary directories. Confirm the
    # route is only reachable by trusted users, or restrict it to an allow-list.
    scan_dirs = [d.strip() for d in dirs.split(",") if d.strip()] or None

    # Clamp on both sides: a negative value would otherwise reach the scanner's
    # final slice and drop results from the end instead of capping the list.
    limit = max(0, min(max_results, 100))

    candidates = scan_log_directories(
        query=query or None,
        dirs=scan_dirs,
        max_results=limit,
    )
    return {"candidates": candidates, "query": query or None}
# Extensions considered as log files in the filesystem scanner.
# The empty string admits files that have no extension at all.
_LOG_EXTENSIONS = {"", ".log", ".txt", ".out", ".err"}
# Name suffixes of compressed archives, which are never offered as sources.
_COMPRESSED_SUFFIXES = (".gz", ".bz2", ".xz", ".zst")
# Max file size to consider (500 MB).
_MAX_SIZE = 500 * 1024 * 1024
# Size band (inclusive upper bound) that receives the full size score.
_IDEAL_MIN_SIZE = 1024
_IDEAL_MAX_SIZE = 50 * 1024 * 1024
# Recency window in days: the recency score falls linearly from 1.0 for a file
# modified right now to 0.0 for a file this old or older. (The name is
# historical; the decay is linear, not an exponential half-life.)
_RECENCY_HALFLIFE_DAYS = 30


def _path_to_source_id(path: Path) -> str:
    """Convert an absolute path to a kebab-case source ID of at most 64 chars."""
    raw = re.sub(r"[^a-zA-Z0-9]+", "-", str(path)).strip("-").lower()
    # Truncation can cut right after a separator, so strip again to make sure
    # the ID never ends with a dangling "-".
    return raw[:64].rstrip("-")


def _recency_score(age_days: float) -> float:
    """Return a 0-1 score that falls linearly with age over the recency window."""
    # The upper clamp matters for files whose mtime lies in the future (clock
    # skew, restored backups): their age is negative and would score above 1.
    return min(1.0, max(0.0, 1.0 - age_days / _RECENCY_HALFLIFE_DAYS))


def _size_score(size: int) -> float:
    """Return a 0-1 score preferring files between 1 KB and 50 MB."""
    if size < _IDEAL_MIN_SIZE:
        return 0.3
    if size <= _IDEAL_MAX_SIZE:
        return 1.0
    # Large files: linear decay from 1.0 at 50 MB down to 0.1 at 500 MB.
    return max(0.1, 1.0 - (size - _IDEAL_MAX_SIZE) / _MAX_SIZE)


def _keyword_score(path_lower: str, query_stems: list[str]) -> float:
    """Return the fraction of query stems that occur in the lower-cased path."""
    if not query_stems:
        return 0.0
    matches = sum(1 for stem in query_stems if stem in path_lower)
    return min(1.0, matches / len(query_stems))


def scan_log_directories(
    query: str | None = None,
    dirs: list[str] | None = None,
    max_depth: int = 4,
    max_results: int = 25,
) -> list[dict[str, Any]]:
    """Scan filesystem directories for log files ranked by recency and keyword match.

    Scoring components (each in the range 0-1):
      - Recency: 1.0 for a file modified now, decaying linearly to 0.0 at
        30 days; future mtimes are clamped to 1.0.
      - Size: prefer 1 KB - 50 MB; tiny or huge files score low.
      - Keyword: fraction of query words (3+ characters) found in the path.

    With usable query words the total is 0.4 * recency + 0.2 * size +
    0.4 * keyword; otherwise it is 0.7 * recency + 0.3 * size.

    Args:
        query: Optional free text; its words are matched against file paths.
        dirs: Directories to scan. Defaults to ``/var/log`` and ``/opt``.
        max_depth: How many directory levels below each root are descended.
        max_results: Maximum number of candidates returned; values below
            zero are treated as zero.

    Returns:
        Up to *max_results* candidate dicts (keys: type, id, path, label,
        size_bytes, mtime, score, available) sorted by descending score, with
        ties broken by path. Hidden entries, symlinks, empty files, files over
        500 MB, compressed archives, unreadable files and files with a
        non-log extension are skipped. Each path appears at most once even if
        the scan roots overlap. Missing or unreadable directories are ignored.
    """
    if dirs is None:
        dirs = ["/var/log", "/opt"]

    now = time.time()
    query_stems: list[str] = []
    if query:
        query_stems = [w.lower() for w in re.split(r"\W+", query) if len(w) >= 3]

    candidates: list[dict[str, Any]] = []
    seen_paths: set[str] = set()

    def _consider(entry: Path) -> None:
        """Score one regular file and record it if it passes every filter."""
        path_str = str(entry)
        if path_str in seen_paths:
            return
        # Checked explicitly (and first) so archives stay excluded even if the
        # extension allow-list is widened later.
        if entry.name.endswith(_COMPRESSED_SUFFIXES):
            return
        if entry.suffix.lower() not in _LOG_EXTENSIONS:
            return
        try:
            stat = entry.stat()
        except OSError:
            return
        if stat.st_size == 0 or stat.st_size > _MAX_SIZE:
            return
        if not os.access(entry, os.R_OK):
            return

        recency = _recency_score((now - stat.st_mtime) / 86400)
        size_score = _size_score(stat.st_size)
        if query_stems:
            keyword_score = _keyword_score(path_str.lower(), query_stems)
            total = recency * 0.4 + size_score * 0.2 + keyword_score * 0.4
        else:
            total = recency * 0.7 + size_score * 0.3

        seen_paths.add(path_str)
        candidates.append({
            "type": "file",
            "id": _path_to_source_id(entry),
            "path": path_str,
            "label": entry.name,
            "size_bytes": stat.st_size,
            "mtime": stat.st_mtime,
            "score": round(total, 3),
            "available": True,
        })

    def _walk(root: Path, depth: int) -> None:
        """Recursively visit *root*, descending at most *max_depth* levels."""
        if depth > max_depth:
            return
        try:
            entries = list(root.iterdir())
        except OSError:
            return
        for entry in entries:
            if entry.name.startswith("."):
                continue
            # The type checks stat the entry and may raise (for example
            # PermissionError); one bad entry must not abort the whole scan.
            try:
                if entry.is_symlink():
                    continue
                is_dir = entry.is_dir()
                is_file = not is_dir and entry.is_file()
            except OSError:
                continue
            if is_dir:
                _walk(entry, depth + 1)
            elif is_file:
                _consider(entry)

    for d in dirs:
        _walk(Path(d), depth=0)

    # Secondary sort on path keeps the order deterministic when scores tie.
    candidates.sort(key=lambda c: (-c["score"], c["path"]))
    # Guard against negative limits, which would slice from the end instead.
    return candidates[:max(0, max_results)]
a/tests/test_discover_scan.py b/tests/test_discover_scan.py new file mode 100644 index 0000000..1d16624 --- /dev/null +++ b/tests/test_discover_scan.py @@ -0,0 +1,133 @@ +"""Tests for scan_log_directories in app.services.discover.""" +from __future__ import annotations + +import os +import time +from pathlib import Path + +import pytest + +from app.services.discover import scan_log_directories, _path_to_source_id + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_log(tmp_path: Path, name: str, content: str = "hello\n", age_days: float = 0) -> Path: + p = tmp_path / name + p.write_text(content) + mtime = time.time() - age_days * 86400 + os.utime(p, (mtime, mtime)) + return p + + +# --------------------------------------------------------------------------- +# _path_to_source_id +# --------------------------------------------------------------------------- + +def test_path_to_source_id_basic(): + result = _path_to_source_id(Path("/var/log/nginx/access.log")) + assert result.startswith("var-log-nginx-access") + assert "/" not in result + assert " " not in result + + +def test_path_to_source_id_max_length(): + long_path = Path("/" + "a" * 200 + ".log") + assert len(_path_to_source_id(long_path)) <= 64 + + +# --------------------------------------------------------------------------- +# scan_log_directories +# --------------------------------------------------------------------------- + +def test_scan_finds_log_files(tmp_path): + _make_log(tmp_path, "app.log", "error: something\n") + _make_log(tmp_path, "system.log", "kernel: ok\n") + results = scan_log_directories(dirs=[str(tmp_path)]) + paths = [r["path"] for r in results] + assert str(tmp_path / "app.log") in paths + assert str(tmp_path / "system.log") in paths + + +def test_scan_ignores_empty_files(tmp_path): + _make_log(tmp_path, "empty.log", "") + results = 
scan_log_directories(dirs=[str(tmp_path)]) + assert not any(r["label"] == "empty.log" for r in results) + + +def test_scan_ignores_non_log_extensions(tmp_path): + (tmp_path / "config.yaml").write_text("key: value\n") + (tmp_path / "data.json").write_text('{"a":1}\n') + results = scan_log_directories(dirs=[str(tmp_path)]) + names = [r["label"] for r in results] + assert "config.yaml" not in names + assert "data.json" not in names + + +def test_scan_ignores_compressed(tmp_path): + _make_log(tmp_path, "old.log.gz", "compressed content") + results = scan_log_directories(dirs=[str(tmp_path)]) + assert not any(r["label"].endswith(".gz") for r in results) + + +def test_scan_respects_max_results(tmp_path): + for i in range(20): + _make_log(tmp_path, f"app{i}.log", f"log line {i}\n") + results = scan_log_directories(dirs=[str(tmp_path)], max_results=5) + assert len(results) <= 5 + + +def test_scan_recent_files_score_higher(tmp_path): + recent = _make_log(tmp_path, "recent.log", "new stuff\n", age_days=0) + old = _make_log(tmp_path, "old.log", "old stuff\n", age_days=60) + results = scan_log_directories(dirs=[str(tmp_path)]) + scores = {r["path"]: r["score"] for r in results} + assert scores[str(recent)] > scores[str(old)] + + +def test_scan_keyword_match_boosts_score(tmp_path): + nginx_log = _make_log(tmp_path, "nginx.log", "GET / 200\n", age_days=5) + other_log = _make_log(tmp_path, "kernel.log", "boot ok\n", age_days=5) + results = scan_log_directories(query="nginx 502 error", dirs=[str(tmp_path)]) + scores = {r["path"]: r["score"] for r in results} + assert scores[str(nginx_log)] > scores[str(other_log)] + + +def test_scan_returns_required_fields(tmp_path): + _make_log(tmp_path, "test.log", "data\n") + results = scan_log_directories(dirs=[str(tmp_path)]) + assert results + r = results[0] + assert r["type"] == "file" + assert "id" in r + assert "path" in r + assert "label" in r + assert "size_bytes" in r + assert "mtime" in r + assert "score" in r + assert r["available"] 
is True + + +def test_scan_missing_dir_is_graceful(): + results = scan_log_directories(dirs=["/nonexistent/path/xyz"]) + assert results == [] + + +def test_scan_subdirectory_recursive(tmp_path): + subdir = tmp_path / "subapp" + subdir.mkdir() + _make_log(subdir, "subapp.log", "nested log\n") + results = scan_log_directories(dirs=[str(tmp_path)]) + paths = [r["path"] for r in results] + assert str(subdir / "subapp.log") in paths + + +def test_scan_no_query_weights_recency_heavily(tmp_path): + """Without a query, recency (0.7) dominates over size (0.3).""" + fresh = _make_log(tmp_path, "fresh.log", "x" * 100, age_days=0) + stale = _make_log(tmp_path, "stale.log", "x" * 10000, age_days=20) + results = scan_log_directories(query=None, dirs=[str(tmp_path)]) + scores = {r["path"]: r["score"] for r in results} + assert scores[str(fresh)] > scores[str(stale)] diff --git a/web/src/views/SourcesView.vue b/web/src/views/SourcesView.vue index d718874..8ff9c5d 100644 --- a/web/src/views/SourcesView.vue +++ b/web/src/views/SourcesView.vue @@ -6,6 +6,12 @@
All hosts and services in the gleaned corpus.
+ {{ scanCandidates.length }} file{{ scanCandidates.length === 1 ? '' : 's' }} found — ranked by recency{{ scanQuery ? ' and keyword match' : '' }}. + Select files to add as sources. +
+