feat(sources): context-aware filesystem log scanner (#23)

Add scan_log_directories() to discover.py that recursively walks
/var/log and /opt, filters to readable log files, and scores each
candidate by recency (mtime, 0.7 weight), file size (0.3), and
keyword match against an optional problem-context query (shifts
weights to 0.4/0.2/0.4 when a query is provided).

- GET /api/setup/scan?query=...&max_results=N — new API endpoint
- SourcesView: "Scan" button opens a panel with ranked candidates,
  checkboxes, and "Add selected" to write to sources.yaml
- 13 new unit tests, 466 passing total

Closes: #23
This commit is contained in:
pyr0ball 2026-06-14 14:01:45 -07:00
parent 7ed01fbd48
commit 600e5a9eac
4 changed files with 436 additions and 1 deletions

View file

@ -52,7 +52,7 @@ from app.services.blocklist import (
update_candidate_status, update_candidate_status,
) )
from app.services.pihole import PiholeClient from app.services.pihole import PiholeClient
from app.services.discover import discover_all, build_sources_yaml, validate_source from app.services.discover import discover_all, build_sources_yaml, validate_source, scan_log_directories
from app.services.nl_source import interpret as _nl_interpret from app.services.nl_source import interpret as _nl_interpret
from app.services.incidents import ( from app.services.incidents import (
build_bundle, build_bundle,
@ -820,6 +820,28 @@ def setup_discover() -> dict:
return discover_all() return discover_all()
@router.get("/api/setup/scan")
def setup_scan(
    query: str = "",
    dirs: str = "",
    max_results: int = 25,
) -> dict:
    """Scan the filesystem for log files ranked by recency and keyword match.

    Accepts an optional ?query= to weight results toward files matching the
    problem context (e.g. 'nginx 502', 'docker timeout', 'ssh refused').

    Accepts an optional ?dirs= comma-separated list to override default scan
    directories (/var/log, /opt).

    Accepts an optional ?max_results= which is clamped to the range 0-100.

    Returns a dict with ``candidates`` (the ranked list) and ``query`` (the
    query string, or None when it was empty).
    """
    # NOTE(review): ?dirs= lets any caller of this endpoint enumerate readable
    # files under arbitrary directories. Confirm the route is only reachable by
    # trusted users, or restrict it to an allow-list of roots.
    scan_dirs = [d.strip() for d in dirs.split(",") if d.strip()] or None
    # Clamp both ends: a negative value would otherwise reach the scanner's
    # final slice and be interpreted as "everything except the last N".
    limit = max(0, min(max_results, 100))
    candidates = scan_log_directories(
        query=query or None,
        dirs=scan_dirs,
        max_results=limit,
    )
    return {"candidates": candidates, "query": query or None}
@router.post("/api/setup/write") @router.post("/api/setup/write")
def setup_write(body: SetupWriteBody, background_tasks: BackgroundTasks) -> dict: def setup_write(body: SetupWriteBody, background_tasks: BackgroundTasks) -> dict:
"""Validate and write sources.yaml from a list of selected source definitions. """Validate and write sources.yaml from a list of selected source definitions.

View file

@ -8,8 +8,10 @@ from __future__ import annotations
import json import json
import logging import logging
import os import os
import re
import shutil import shutil
import subprocess import subprocess
import time
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@ -171,3 +173,113 @@ def validate_source(src: dict[str, Any]) -> str | None:
if src_type == "docker" and not src.get("container"): if src_type == "docker" and not src.get("container"):
return f"Docker source '{src['id']}' is missing 'container'" return f"Docker source '{src['id']}' is missing 'container'"
return None return None
# Extensions considered as log files in the filesystem scanner.
# The empty string admits files with no suffix at all (e.g. "syslog").
_LOG_EXTENSIONS = {"", ".log", ".txt", ".out", ".err"}
# Max file size to consider (500 MB).
_MAX_SIZE = 500 * 1024 * 1024
# Recency window in days. Despite the name this is not a half-life:
# scan_log_directories decays the recency score linearly from 1.0 (modified
# now) to 0.0 at this age, so files at least this old score exactly 0.
_RECENCY_HALFLIFE_DAYS = 30
def _path_to_source_id(path: Path) -> str:
"""Convert an absolute path to a kebab-case source ID."""
raw = re.sub(r"[^a-zA-Z0-9]+", "-", str(path)).strip("-").lower()
return raw[:64]
def scan_log_directories(
    query: str | None = None,
    dirs: list[str] | None = None,
    max_depth: int = 4,
    max_results: int = 25,
) -> list[dict[str, Any]]:
    """Scan filesystem directories for log files ranked by recency and keyword match.

    Walks every directory in *dirs* recursively, skipping hidden entries,
    symlinks, empty files, files larger than ``_MAX_SIZE``, unreadable files
    and files whose suffix is not in ``_LOG_EXTENSIONS``.

    Scoring components (each 0-1):
      - Recency: 1.0 for a file modified now, decaying linearly to 0.0 at
        ``_RECENCY_HALFLIFE_DAYS`` days old.
      - Size: 1.0 for 1 KB - 50 MB, 0.3 below 1 KB, and a linear decay
        (floored at 0.1) above 50 MB.
      - Keyword: fraction of query words (3+ characters) that occur as
        substrings of the lower-cased path.

    Without usable query words the total is ``0.7 * recency + 0.3 * size``;
    with them it is ``0.4 * recency + 0.2 * size + 0.4 * keyword``.

    Args:
        query: Optional free-text problem description used for keyword scoring.
        dirs: Directories to scan; defaults to ``/var/log`` and ``/opt``.
        max_depth: Deepest directory level to descend to (roots are level 0).
        max_results: Maximum number of candidates returned; negative values
            are treated as 0.

    Returns:
        Up to *max_results* candidate dicts sorted by descending score.
    """
    if dirs is None:
        dirs = ["/var/log", "/opt"]
    # A negative limit would otherwise slice from the end (candidates[:-1]).
    limit = max(0, max_results)

    now = time.time()
    query_stems: list[str] = []
    if query:
        query_stems = [w.lower() for w in re.split(r"\W+", query) if len(w) >= 3]

    candidates: list[dict[str, Any]] = []
    # Overlapping roots (e.g. /var/log and /var/log/nginx) must not yield the
    # same file twice.
    seen_paths: set[str] = set()

    def _score(size: int, mtime: float, path_lower: str) -> float:
        """Combine recency, size and keyword components into one score."""
        age_days = (now - mtime) / 86400
        # Clamp both ends: a future mtime (clock skew, restored backups) must
        # not push recency above 1.0.
        recency = min(1.0, max(0.0, 1.0 - age_days / _RECENCY_HALFLIFE_DAYS))

        if size < 1024:
            size_score = 0.3
        elif size <= 50 * 1024 * 1024:
            size_score = 1.0
        else:
            # Large files: linear decay from 50 MB to 500 MB
            size_score = max(0.1, 1.0 - (size - 50 * 1024 * 1024) / _MAX_SIZE)

        if not query_stems:
            return recency * 0.7 + size_score * 0.3

        matches = sum(1 for stem in query_stems if stem in path_lower)
        keyword_score = min(1.0, matches / len(query_stems))
        return recency * 0.4 + size_score * 0.2 + keyword_score * 0.4

    def _walk(root: Path, depth: int) -> None:
        if depth > max_depth:
            return
        try:
            entries = list(root.iterdir())
        except OSError:
            return
        for entry in entries:
            if entry.name.startswith("."):
                continue
            # The pathlib type checks can raise (e.g. PermissionError inside a
            # listable but non-searchable directory); skip that entry instead
            # of aborting the whole scan.
            try:
                is_link = entry.is_symlink()
                is_directory = not is_link and entry.is_dir()
                is_regular = not is_link and not is_directory and entry.is_file()
            except OSError:
                continue
            if is_directory:
                _walk(entry, depth + 1)
                continue
            if not is_regular:
                continue
            if entry.suffix.lower() not in _LOG_EXTENSIONS:
                continue
            # Skip compressed archives (defensive: these suffixes are already
            # excluded unless _LOG_EXTENSIONS is widened).
            if entry.name.endswith((".gz", ".bz2", ".xz", ".zst")):
                continue
            try:
                stat = entry.stat()
            except OSError:
                continue
            if stat.st_size == 0 or stat.st_size > _MAX_SIZE:
                continue
            if not os.access(entry, os.R_OK):
                continue

            path_str = str(entry)
            if path_str in seen_paths:
                continue
            seen_paths.add(path_str)

            total = _score(stat.st_size, stat.st_mtime, path_str.lower())
            candidates.append({
                "type": "file",
                "id": _path_to_source_id(entry),
                "path": path_str,
                "label": entry.name,
                "size_bytes": stat.st_size,
                "mtime": stat.st_mtime,
                "score": round(total, 3),
                "available": True,
            })

    for d in dirs:
        _walk(Path(d), depth=0)

    candidates.sort(key=lambda c: c["score"], reverse=True)
    return candidates[:limit]

133
tests/test_discover_scan.py Normal file
View file

@ -0,0 +1,133 @@
"""Tests for scan_log_directories in app.services.discover."""
from __future__ import annotations
import os
import time
from pathlib import Path
import pytest
from app.services.discover import scan_log_directories, _path_to_source_id
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_log(tmp_path: Path, name: str, content: str = "hello\n", age_days: float = 0) -> Path:
p = tmp_path / name
p.write_text(content)
mtime = time.time() - age_days * 86400
os.utime(p, (mtime, mtime))
return p
# ---------------------------------------------------------------------------
# _path_to_source_id
# ---------------------------------------------------------------------------
def test_path_to_source_id_basic():
    """A typical log path becomes a slug without slashes or spaces."""
    source_id = _path_to_source_id(Path("/var/log/nginx/access.log"))
    assert source_id.startswith("var-log-nginx-access")
    for forbidden in ("/", " "):
        assert forbidden not in source_id
def test_path_to_source_id_max_length():
    """Very long paths are capped at 64 characters."""
    oversized = Path("/" + "a" * 200 + ".log")
    source_id = _path_to_source_id(oversized)
    assert len(source_id) <= 64
# ---------------------------------------------------------------------------
# scan_log_directories
# ---------------------------------------------------------------------------
def test_scan_finds_log_files(tmp_path):
    """Every non-empty .log file in the scanned directory is reported."""
    _make_log(tmp_path, "app.log", "error: something\n")
    _make_log(tmp_path, "system.log", "kernel: ok\n")
    found = {item["path"] for item in scan_log_directories(dirs=[str(tmp_path)])}
    for expected in ("app.log", "system.log"):
        assert str(tmp_path / expected) in found
def test_scan_ignores_empty_files(tmp_path):
    """Zero-byte files are never offered as candidates."""
    _make_log(tmp_path, "empty.log", "")
    labels = [item["label"] for item in scan_log_directories(dirs=[str(tmp_path)])]
    assert "empty.log" not in labels
def test_scan_ignores_non_log_extensions(tmp_path):
    """Files with non-log suffixes (.yaml, .json) are filtered out."""
    fixtures = {"config.yaml": "key: value\n", "data.json": '{"a":1}\n'}
    for filename, text in fixtures.items():
        (tmp_path / filename).write_text(text)
    labels = {item["label"] for item in scan_log_directories(dirs=[str(tmp_path)])}
    assert "config.yaml" not in labels
    assert "data.json" not in labels
def test_scan_ignores_compressed(tmp_path):
    """Rotated, compressed logs (*.gz) are not returned."""
    _make_log(tmp_path, "old.log.gz", "compressed content")
    gz_hits = [
        item
        for item in scan_log_directories(dirs=[str(tmp_path)])
        if item["label"].endswith(".gz")
    ]
    assert not gz_hits
def test_scan_respects_max_results(tmp_path):
    """With more matches than the cap, exactly *max_results* are returned."""
    for i in range(20):
        _make_log(tmp_path, f"app{i}.log", f"log line {i}\n")
    results = scan_log_directories(dirs=[str(tmp_path)], max_results=5)
    # 20 eligible files exist, so the cap must be hit exactly; "<= 5" would
    # also pass if the scanner returned nothing at all.
    assert len(results) == 5
def test_scan_recent_files_score_higher(tmp_path):
    """A file modified now outranks one modified 60 days ago."""
    recent = _make_log(tmp_path, "recent.log", "new stuff\n", age_days=0)
    old = _make_log(tmp_path, "old.log", "old stuff\n", age_days=60)
    score_by_path = {}
    for item in scan_log_directories(dirs=[str(tmp_path)]):
        score_by_path[item["path"]] = item["score"]
    assert score_by_path[str(recent)] > score_by_path[str(old)]
def test_scan_keyword_match_boosts_score(tmp_path):
    """At equal age, a path containing a query word scores higher."""
    nginx_log = _make_log(tmp_path, "nginx.log", "GET / 200\n", age_days=5)
    other_log = _make_log(tmp_path, "kernel.log", "boot ok\n", age_days=5)
    ranked = scan_log_directories(query="nginx 502 error", dirs=[str(tmp_path)])
    score_by_path = dict((item["path"], item["score"]) for item in ranked)
    matched_score = score_by_path[str(nginx_log)]
    unmatched_score = score_by_path[str(other_log)]
    assert matched_score > unmatched_score
def test_scan_returns_required_fields(tmp_path):
    """Each candidate carries the keys the setup UI relies on."""
    _make_log(tmp_path, "test.log", "data\n")
    results = scan_log_directories(dirs=[str(tmp_path)])
    assert results
    first = results[0]
    assert first["type"] == "file"
    for field in ("id", "path", "label", "size_bytes", "mtime", "score"):
        assert field in first
    assert first["available"] is True
def test_scan_missing_dir_is_graceful():
    """A non-existent scan root yields an empty list rather than an error."""
    missing_root = "/nonexistent/path/xyz"
    assert scan_log_directories(dirs=[missing_root]) == []
def test_scan_subdirectory_recursive(tmp_path):
    """Log files in nested directories are discovered."""
    subdir = tmp_path / "subapp"
    subdir.mkdir()
    nested = _make_log(subdir, "subapp.log", "nested log\n")
    found = {item["path"] for item in scan_log_directories(dirs=[str(tmp_path)])}
    assert str(nested) in found
def test_scan_no_query_weights_recency_heavily(tmp_path):
    """Without a query, recency (0.7) dominates over size (0.3)."""
    # The fresh file is tiny (low size score); the stale one is well-sized.
    fresh = _make_log(tmp_path, "fresh.log", "x" * 100, age_days=0)
    stale = _make_log(tmp_path, "stale.log", "x" * 10000, age_days=20)
    ranked = scan_log_directories(query=None, dirs=[str(tmp_path)])
    score_by_path = {item["path"]: item["score"] for item in ranked}
    fresh_score = score_by_path[str(fresh)]
    stale_score = score_by_path[str(stale)]
    assert fresh_score > stale_score

View file

@ -6,6 +6,12 @@
<p class="text-text-dim text-sm">All hosts and services in the gleaned corpus.</p> <p class="text-text-dim text-sm">All hosts and services in the gleaned corpus.</p>
</div> </div>
<div class="flex items-center gap-2 shrink-0"> <div class="flex items-center gap-2 shrink-0">
<button
@click="toggleScanPanel"
class="btn-secondary text-sm"
>
Scan
</button>
<button <button
@click="showAddPanel = !showAddPanel" @click="showAddPanel = !showAddPanel"
class="btn-secondary text-sm" class="btn-secondary text-sm"
@ -27,6 +33,73 @@
/> />
</div> </div>
<!-- Filesystem scan panel -->
<div v-if="showScanPanel && !showWizard" class="mb-6 rounded border border-surface-border bg-surface-raised p-4">
<h2 class="text-text-primary font-medium text-sm mb-3">Scan for log files</h2>
<div class="flex gap-2 mb-4">
<input
v-model="scanQuery"
type="text"
placeholder="Optional: describe the problem (e.g. 'nginx 502 gateway error')"
class="input-field flex-1 text-sm"
@keydown.enter="runScan"
/>
<button @click="runScan" :disabled="scanning" class="btn-primary text-sm px-4">
{{ scanning ? 'Scanning…' : 'Scan' }}
</button>
</div>
<div v-if="scanError" class="text-sev-error text-sm mb-3">{{ scanError }}</div>
<div v-if="scanCandidates.length > 0">
<p class="text-text-dim text-xs mb-2">
{{ scanCandidates.length }} file{{ scanCandidates.length === 1 ? '' : 's' }} found ranked by recency{{ scanQuery ? ' and keyword match' : '' }}.
Select files to add as sources.
</p>
<div class="divide-y divide-surface-border border border-surface-border rounded overflow-hidden mb-3">
<label
v-for="c in scanCandidates"
:key="c.path"
class="flex items-start gap-3 px-3 py-2 hover:bg-surface cursor-pointer"
>
<input
type="checkbox"
:value="c"
v-model="scanSelected"
class="mt-0.5 shrink-0"
/>
<div class="min-w-0 flex-1">
<div class="flex items-center gap-2 flex-wrap">
<span class="font-mono text-xs text-accent truncate">{{ c.path }}</span>
<span class="text-text-dim text-xs shrink-0">{{ formatBytes(c.size_bytes) }}</span>
<span class="text-text-dim text-xs shrink-0">{{ formatAge(c.mtime) }}</span>
<span
v-if="scanQuery"
class="text-text-dim text-xs shrink-0"
:title="`Relevance score: ${c.score}`"
>score {{ (c.score * 100).toFixed(0) }}%</span>
</div>
</div>
</label>
</div>
<div class="flex items-center gap-3">
<button
:disabled="scanSelected.length === 0 || scanAdding"
@click="addScanSelected"
class="btn-primary text-sm"
>
{{ scanAdding ? 'Adding…' : `Add ${scanSelected.length || ''} selected` }}
</button>
<button @click="scanSelected = []" class="btn-secondary text-sm">Deselect all</button>
<button @click="scanSelected = [...scanCandidates]" class="btn-secondary text-sm">Select all</button>
</div>
</div>
<div v-else-if="scanRan && !scanning" class="text-text-dim text-sm">
No log files found in the scanned directories.
</div>
</div>
<!-- Post-setup Add Source panel (condensed wizard steps 1-2) --> <!-- Post-setup Add Source panel (condensed wizard steps 1-2) -->
<div v-else-if="showAddPanel" class="mb-6"> <div v-else-if="showAddPanel" class="mb-6">
<SetupWizard <SetupWizard
@ -184,6 +257,17 @@ interface DbSource {
latest: string | null latest: string | null
} }
/** One ranked log-file candidate as returned by GET /api/setup/scan. */
interface ScanCandidate {
  type: string
  id: string
  /** Filesystem path; used as the list key and shown in the scan panel. */
  path: string
  label: string
  /** Rendered through formatBytes(). */
  size_bytes: number
  /** Rendered through formatAge(), which treats it as Unix epoch seconds. */
  mtime: number
  /** Shown as a percentage (score * 100) when a query was entered. */
  score: number
  available: boolean
}
const sources = ref<SourceRow[]>([]) const sources = ref<SourceRow[]>([])
const loading = ref(true) const loading = ref(true)
const busy = ref(new Set<string>()) const busy = ref(new Set<string>())
@ -191,6 +275,14 @@ const actionMsg = ref('')
const actionError = ref(false) const actionError = ref(false)
const showWizard = ref(false) const showWizard = ref(false)
const showAddPanel = ref(false) const showAddPanel = ref(false)
// --- Filesystem scan panel state ---
// Whether the scan panel is open (flipped by toggleScanPanel).
const showScanPanel = ref(false)
// Optional problem description sent as ?query= to the scan endpoint.
const scanQuery = ref('')
// True while a scan request is in flight.
const scanning = ref(false)
// Set once a scan has completed successfully; drives the "no files" message.
const scanRan = ref(false)
// Error text from the last failed scan, or '' when there is none.
const scanError = ref('')
// Candidates returned by the last scan, and the subset the user has ticked.
const scanCandidates = ref<ScanCandidate[]>([])
const scanSelected = ref<ScanCandidate[]>([])
// True while the selected candidates are being posted to /api/setup/write.
const scanAdding = ref(false)
const BASE = import.meta.env.BASE_URL.replace(/\/$/, '') const BASE = import.meta.env.BASE_URL.replace(/\/$/, '')
@ -347,6 +439,82 @@ async function handleUpload(e: Event): Promise<void> {
;(e.target as HTMLInputElement).value = '' ;(e.target as HTMLInputElement).value = ''
} }
/** Open or close the scan panel; closing it discards the previous scan's state. */
function toggleScanPanel(): void {
  const nowOpen = !showScanPanel.value
  showScanPanel.value = nowOpen
  if (nowOpen) return

  // Panel was just closed: drop results, selection and any stale messages.
  scanCandidates.value = []
  scanSelected.value = []
  scanRan.value = false
  scanError.value = ''
}
/**
 * Query GET /api/setup/scan and populate the candidate list.
 *
 * Sends the trimmed scanQuery as ?query= when non-empty. On failure the error
 * is surfaced through scanError; scanRan is only set after a successful scan.
 */
async function runScan(): Promise<void> {
  // The Scan button is disabled while a scan is running, but pressing Enter in
  // the query input is not — ignore re-entry so overlapping requests cannot
  // overwrite each other's results.
  if (scanning.value) return
  scanning.value = true
  scanError.value = ''
  scanCandidates.value = []
  scanSelected.value = []
  try {
    const params = new URLSearchParams({ max_results: '30' })
    const trimmedQuery = scanQuery.value.trim()
    if (trimmedQuery) params.set('query', trimmedQuery)
    const res = await fetch(`${BASE}/api/setup/scan?${params}`)
    if (!res.ok) {
      // Error responses are not guaranteed to carry a JSON body.
      const data = await res.json().catch(() => ({}))
      scanError.value = data.detail ?? 'Scan failed'
      return
    }
    const data = await res.json()
    scanCandidates.value = data.candidates ?? []
    scanRan.value = true
  } catch (err) {
    scanError.value = String(err)
  } finally {
    scanning.value = false
  }
}
/**
 * POST the ticked scan candidates to /api/setup/write and refresh the list.
 *
 * On success the scan panel is closed and its state cleared. Any failure —
 * an error response, a non-JSON body, or a network error — is reported
 * through actionMsg / actionError instead of escaping as an unhandled
 * rejection.
 */
async function addScanSelected(): Promise<void> {
  const selected = scanSelected.value
  if (selected.length === 0 || scanAdding.value) return
  scanAdding.value = true
  actionMsg.value = ''
  try {
    const res = await fetch(`${BASE}/api/setup/write`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ sources: selected }),
    })
    // Error responses are not guaranteed to carry a JSON body.
    const data = await res.json().catch(() => ({}))
    if (!res.ok) {
      actionMsg.value = data.detail ?? 'Failed to add sources'
      actionError.value = true
      return
    }
    // Capture the count before the selection is cleared below.
    const count = selected.length
    actionMsg.value = `Added ${count} source${count === 1 ? '' : 's'} to sources.yaml`
    actionError.value = false
    showScanPanel.value = false
    scanCandidates.value = []
    scanSelected.value = []
    scanRan.value = false
    await loadSources()
  } catch (err) {
    actionMsg.value = String(err)
    actionError.value = true
  } finally {
    scanAdding.value = false
  }
}
/** Format a byte count as "N B", "N.N KB" or "N.N MB" (1024-based units). */
function formatBytes(bytes: number): string {
  const KIB = 1024
  const MIB = 1024 * 1024
  if (bytes < KIB) return `${bytes} B`
  const kib = bytes / KIB
  if (bytes < MIB) return `${kib.toFixed(1)} KB`
  const mib = kib / KIB
  return `${mib.toFixed(1)} MB`
}
/**
 * Describe how long ago *mtime* (Unix epoch seconds) was, relative to now:
 * "today", "yesterday", "Nd ago" below 30 days, otherwise "Nmo ago".
 */
function formatAge(mtime: number): string {
  const SECONDS_PER_DAY = 86400
  const nowSeconds = Date.now() / 1000
  const ageDays = (nowSeconds - mtime) / SECONDS_PER_DAY
  if (ageDays < 1) return 'today'
  if (ageDays < 2) return 'yesterday'
  return ageDays < 30
    ? `${Math.floor(ageDays)}d ago`
    : `${Math.floor(ageDays / 30)}mo ago`
}
function formatTs(iso: string | null): string { function formatTs(iso: string | null): string {
if (!iso) return '—' if (!iso) return '—'
try { try {