feat(sources): context-aware filesystem log scanner (#23)

Add scan_log_directories() to discover.py that recursively walks
/var/log and /opt, filters to readable log files, and scores each
candidate by recency (mtime, 0.7 weight), file size (0.3), and
keyword match against an optional problem-context query (shifts
weights to 0.4/0.2/0.4 when a query is provided).

- GET /api/setup/scan?query=...&max_results=N — new API endpoint
- SourcesView: "Scan" button opens a panel with ranked candidates,
  checkboxes, and "Add selected" to write to sources.yaml
- 13 new unit tests, 466 passing total

Closes: #23
This commit is contained in:
pyr0ball 2026-06-14 14:01:45 -07:00
parent 7ed01fbd48
commit 600e5a9eac
4 changed files with 436 additions and 1 deletions

View file

@ -52,7 +52,7 @@ from app.services.blocklist import (
update_candidate_status,
)
from app.services.pihole import PiholeClient
from app.services.discover import discover_all, build_sources_yaml, validate_source
from app.services.discover import discover_all, build_sources_yaml, validate_source, scan_log_directories
from app.services.nl_source import interpret as _nl_interpret
from app.services.incidents import (
build_bundle,
@ -820,6 +820,28 @@ def setup_discover() -> dict:
return discover_all()
@router.get("/api/setup/scan")
def setup_scan(
    query: str = "",
    dirs: str = "",
    max_results: int = 25,
) -> dict:
    """Scan the filesystem for log files ranked by recency and keyword match.

    Query parameters:
        query: Optional problem context (e.g. 'nginx 502', 'docker timeout',
            'ssh refused'). When present, results are weighted toward files
            whose path matches the query words.
        dirs: Optional comma-separated list of directories that overrides the
            scanner's default directories (/var/log, /opt). Blank items are
            ignored.
        max_results: Maximum number of candidates to return, clamped to the
            range 0-100.

    Returns:
        A dict with ``candidates`` (the ranked list produced by
        ``scan_log_directories``) and ``query`` (the query string, or ``None``
        when it was empty).
    """
    # NOTE(review): ``dirs`` is forwarded unchanged, so a caller can point the
    # scanner at any directory the server process can read. Confirm this
    # endpoint is only reachable by trusted users, or restrict it to an
    # allow-list.
    scan_dirs = [d.strip() for d in dirs.split(",") if d.strip()] or None
    # Clamp on both sides: the upper bound caps the response size, and the
    # lower bound keeps a negative value from reaching the list slice in the
    # scanner, where it would mean "all but the last N".
    limit = max(0, min(max_results, 100))
    candidates = scan_log_directories(
        query=query or None,
        dirs=scan_dirs,
        max_results=limit,
    )
    return {"candidates": candidates, "query": query or None}
@router.post("/api/setup/write")
def setup_write(body: SetupWriteBody, background_tasks: BackgroundTasks) -> dict:
"""Validate and write sources.yaml from a list of selected source definitions.

View file

@ -8,8 +8,10 @@ from __future__ import annotations
import json
import logging
import os
import re
import shutil
import subprocess
import time
from pathlib import Path
from typing import Any
@ -171,3 +173,113 @@ def validate_source(src: dict[str, Any]) -> str | None:
if src_type == "docker" and not src.get("container"):
return f"Docker source '{src['id']}' is missing 'container'"
return None
# Extensions considered as log files in the filesystem scanner.
_LOG_EXTENSIONS = {"", ".log", ".txt", ".out", ".err"}
# Max file size to consider (500 MB).
_MAX_SIZE = 500 * 1024 * 1024
# Recency half-life in days — files older than this are scored near 0.
_RECENCY_HALFLIFE_DAYS = 30
def _path_to_source_id(path: Path) -> str:
"""Convert an absolute path to a kebab-case source ID."""
raw = re.sub(r"[^a-zA-Z0-9]+", "-", str(path)).strip("-").lower()
return raw[:64]
def _score_log_file(
    size: int,
    age_days: float,
    path_lower: str,
    query_stems: list[str],
) -> float:
    """Return the unrounded relevance score for one log file candidate."""
    # Linear decay: 1.0 for a file modified right now, 0.0 once the file is
    # _RECENCY_HALFLIFE_DAYS old. The upper clamp keeps a future mtime (clock
    # skew) from scoring above 1.0.
    recency = min(1.0, max(0.0, 1.0 - age_days / _RECENCY_HALFLIFE_DAYS))

    if size < 1024:
        size_score = 0.3
    elif size <= 50 * 1024 * 1024:
        size_score = 1.0
    else:
        # Large files: linear decay above 50 MB, floored at 0.1.
        size_score = max(0.1, 1.0 - (size - 50 * 1024 * 1024) / _MAX_SIZE)

    if not query_stems:
        return recency * 0.7 + size_score * 0.3

    # Fraction of query words that occur anywhere in the lower-cased path.
    matches = sum(1 for stem in query_stems if stem in path_lower)
    keyword_score = min(1.0, matches / len(query_stems))
    return recency * 0.4 + size_score * 0.2 + keyword_score * 0.4


def scan_log_directories(
    query: str | None = None,
    dirs: list[str] | None = None,
    max_depth: int = 4,
    max_results: int = 25,
) -> list[dict[str, Any]]:
    """Scan filesystem directories for log files ranked by recency and keyword match.

    Scoring components (each 0-1):
      - Recency: 1.0 for a file modified now, decaying linearly to 0 at
        30 days old.
      - Size: prefer 1 KB - 50 MB; smaller files score 0.3 and larger files
        decay toward 0.1. Empty files and files over 500 MB are skipped.
      - Keyword: fraction of query words (3+ characters, case-insensitive)
        found as substrings of the file's full path.

    Without usable query words the total is ``0.7 * recency + 0.3 * size``;
    with them it is ``0.4 * recency + 0.2 * size + 0.4 * keyword``.

    Args:
        query: Optional free-text problem description used for keyword scoring.
        dirs: Directories to scan; defaults to ``/var/log`` and ``/opt``.
        max_depth: Maximum directory nesting below each root that is descended.
        max_results: Maximum number of candidates returned; values below zero
            are treated as zero.

    Returns:
        Up to *max_results* candidate dicts sorted by descending score, ties
        broken by path. Each path appears at most once even when *dirs*
        overlap. Unreadable or missing directories are skipped silently.
    """
    if dirs is None:
        dirs = ["/var/log", "/opt"]

    now = time.time()
    query_stems: list[str] = []
    if query:
        query_stems = [w.lower() for w in re.split(r"\W+", query) if len(w) >= 3]

    candidates: list[dict[str, Any]] = []
    seen_paths: set[str] = set()

    def _walk(root: Path, depth: int) -> None:
        if depth > max_depth:
            return
        try:
            entries = list(root.iterdir())
        except OSError:
            return
        for entry in entries:
            if entry.name.startswith("."):
                continue
            # The type probes stat the entry and can raise (for example
            # PermissionError when the parent is listable but not
            # searchable); one bad entry must not abort the whole scan.
            try:
                if entry.is_symlink():
                    continue
                is_dir = entry.is_dir()
                is_file = not is_dir and entry.is_file()
            except OSError:
                continue
            if is_dir:
                _walk(entry, depth + 1)
                continue
            if not is_file:
                continue
            if entry.suffix.lower() not in _LOG_EXTENSIONS:
                continue
            # Skip compressed archives
            if entry.name.endswith((".gz", ".bz2", ".xz", ".zst")):
                continue
            try:
                stat = entry.stat()
            except OSError:
                continue
            if stat.st_size == 0 or stat.st_size > _MAX_SIZE:
                continue
            if not os.access(entry, os.R_OK):
                continue

            path_str = str(entry)
            # Overlapping roots (e.g. /var/log and /var/log/nginx) would
            # otherwise report the same file twice.
            if path_str in seen_paths:
                continue
            seen_paths.add(path_str)

            age_days = (now - stat.st_mtime) / 86400
            total = _score_log_file(
                stat.st_size, age_days, path_str.lower(), query_stems
            )
            candidates.append({
                "type": "file",
                "id": _path_to_source_id(entry),
                "path": path_str,
                "label": entry.name,
                "size_bytes": stat.st_size,
                "mtime": stat.st_mtime,
                "score": round(total, 3),
                "available": True,
            })

    for d in dirs:
        _walk(Path(d), depth=0)

    # Secondary sort on path makes the order of equally scored files
    # independent of directory iteration order.
    candidates.sort(key=lambda c: (-c["score"], c["path"]))
    return candidates[:max(0, max_results)]

133
tests/test_discover_scan.py Normal file
View file

@ -0,0 +1,133 @@
"""Tests for scan_log_directories in app.services.discover."""
from __future__ import annotations
import os
import time
from pathlib import Path
import pytest
from app.services.discover import scan_log_directories, _path_to_source_id
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_log(tmp_path: Path, name: str, content: str = "hello\n", age_days: float = 0) -> Path:
p = tmp_path / name
p.write_text(content)
mtime = time.time() - age_days * 86400
os.utime(p, (mtime, mtime))
return p
# ---------------------------------------------------------------------------
# _path_to_source_id
# ---------------------------------------------------------------------------
def test_path_to_source_id_basic():
    """A typical log path becomes a slug with no slashes or spaces."""
    source_id = _path_to_source_id(Path("/var/log/nginx/access.log"))
    assert source_id.startswith("var-log-nginx-access")
    for forbidden in ("/", " "):
        assert forbidden not in source_id
def test_path_to_source_id_max_length():
    """IDs derived from very long paths are capped at 64 characters."""
    overlong = Path("/" + "a" * 200 + ".log")
    source_id = _path_to_source_id(overlong)
    assert len(source_id) <= 64
# ---------------------------------------------------------------------------
# scan_log_directories
# ---------------------------------------------------------------------------
def test_scan_finds_log_files(tmp_path):
    """Every non-empty .log file in the scanned directory is reported."""
    created = [
        _make_log(tmp_path, "app.log", "error: something\n"),
        _make_log(tmp_path, "system.log", "kernel: ok\n"),
    ]
    found = [c["path"] for c in scan_log_directories(dirs=[str(tmp_path)])]
    for log_file in created:
        assert str(log_file) in found
def test_scan_ignores_empty_files(tmp_path):
    """Zero-byte files never appear among the candidates."""
    _make_log(tmp_path, "empty.log", "")
    labels = [c["label"] for c in scan_log_directories(dirs=[str(tmp_path)])]
    assert "empty.log" not in labels
def test_scan_ignores_non_log_extensions(tmp_path):
    """Files whose suffix is not a recognised log extension are skipped."""
    fixtures = {"config.yaml": "key: value\n", "data.json": '{"a":1}\n'}
    for filename, text in fixtures.items():
        (tmp_path / filename).write_text(text)
    labels = [c["label"] for c in scan_log_directories(dirs=[str(tmp_path)])]
    for filename in fixtures:
        assert filename not in labels
def test_scan_ignores_compressed(tmp_path):
    """Rotated, gzip-suffixed logs are not offered as candidates."""
    _make_log(tmp_path, "old.log.gz", "compressed content")
    labels = [c["label"] for c in scan_log_directories(dirs=[str(tmp_path)])]
    assert all(not label.endswith(".gz") for label in labels)
def test_scan_respects_max_results(tmp_path):
    """With 20 eligible files and max_results=5, exactly 5 candidates return."""
    for i in range(20):
        _make_log(tmp_path, f"app{i}.log", f"log line {i}\n")
    results = scan_log_directories(dirs=[str(tmp_path)], max_results=5)
    # Equality rather than <=: an upper-bound-only check would also pass if
    # the scanner returned nothing at all.
    assert len(results) == 5
def test_scan_recent_files_score_higher(tmp_path):
    """A file modified now outranks an otherwise similar 60-day-old file."""
    recent = _make_log(tmp_path, "recent.log", "new stuff\n", age_days=0)
    old = _make_log(tmp_path, "old.log", "old stuff\n", age_days=60)
    score_by_path = {
        c["path"]: c["score"] for c in scan_log_directories(dirs=[str(tmp_path)])
    }
    assert score_by_path[str(recent)] > score_by_path[str(old)]
def test_scan_keyword_match_boosts_score(tmp_path):
    """Of two equally old files, the one whose name matches the query wins."""
    nginx_log = _make_log(tmp_path, "nginx.log", "GET / 200\n", age_days=5)
    other_log = _make_log(tmp_path, "kernel.log", "boot ok\n", age_days=5)
    ranked = scan_log_directories(query="nginx 502 error", dirs=[str(tmp_path)])
    score_by_path = {c["path"]: c["score"] for c in ranked}
    assert score_by_path[str(nginx_log)] > score_by_path[str(other_log)]
def test_scan_returns_required_fields(tmp_path):
    """Each candidate carries the keys the sources UI relies on."""
    _make_log(tmp_path, "test.log", "data\n")
    results = scan_log_directories(dirs=[str(tmp_path)])
    assert results
    first = results[0]
    assert first["type"] == "file"
    for key in ("id", "path", "label", "size_bytes", "mtime", "score"):
        assert key in first
    assert first["available"] is True
def test_scan_missing_dir_is_graceful():
    """A directory that does not exist yields an empty list, not an error."""
    assert scan_log_directories(dirs=["/nonexistent/path/xyz"]) == []
def test_scan_subdirectory_recursive(tmp_path):
    """Log files one directory level below the scan root are discovered."""
    nested_dir = tmp_path / "subapp"
    nested_dir.mkdir()
    _make_log(nested_dir, "subapp.log", "nested log\n")
    found = [c["path"] for c in scan_log_directories(dirs=[str(tmp_path)])]
    assert str(nested_dir / "subapp.log") in found
def test_scan_no_query_weights_recency_heavily(tmp_path):
    """Without a query, recency (0.7) dominates over size (0.3).

    The fresh file is under 1 KB (lower size score) and the stale one is
    larger, so the fresh file can only win on recency.
    """
    fresh = _make_log(tmp_path, "fresh.log", "x" * 100, age_days=0)
    stale = _make_log(tmp_path, "stale.log", "x" * 10000, age_days=20)
    ranked = scan_log_directories(query=None, dirs=[str(tmp_path)])
    score_by_path = {c["path"]: c["score"] for c in ranked}
    assert score_by_path[str(fresh)] > score_by_path[str(stale)]

View file

@ -6,6 +6,12 @@
<p class="text-text-dim text-sm">All hosts and services in the gleaned corpus.</p>
</div>
<div class="flex items-center gap-2 shrink-0">
<!-- Opens or closes the filesystem scan panel; closing also clears its results (see toggleScanPanel). -->
<button
@click="toggleScanPanel"
class="btn-secondary text-sm"
>
Scan
</button>
<button
@click="showAddPanel = !showAddPanel"
class="btn-secondary text-sm"
@ -27,6 +33,73 @@
/>
</div>
<!--
  Filesystem scan panel: optional problem-description input and Scan button,
  the ranked candidate list with checkboxes, and the add / select-all /
  deselect-all controls. Rendered while showScanPanel is true and showWizard
  is false.
  NOTE(review): this element opens a new v-if chain, so the
  v-else-if="showAddPanel" element that follows it now pairs with this panel
  instead of with whichever conditional element came before. Confirm that is
  intended.
-->
<div v-if="showScanPanel && !showWizard" class="mb-6 rounded border border-surface-border bg-surface-raised p-4">
<h2 class="text-text-primary font-medium text-sm mb-3">Scan for log files</h2>
<div class="flex gap-2 mb-4">
<input
v-model="scanQuery"
type="text"
placeholder="Optional: describe the problem (e.g. 'nginx 502 gateway error')"
class="input-field flex-1 text-sm"
@keydown.enter="runScan"
/>
<button @click="runScan" :disabled="scanning" class="btn-primary text-sm px-4">
{{ scanning ? 'Scanning…' : 'Scan' }}
</button>
</div>
<!-- Message from the most recent failed scan request, if any. -->
<div v-if="scanError" class="text-sev-error text-sm mb-3">{{ scanError }}</div>
<div v-if="scanCandidates.length > 0">
<p class="text-text-dim text-xs mb-2">
{{ scanCandidates.length }} file{{ scanCandidates.length === 1 ? '' : 's' }} found ranked by recency{{ scanQuery ? ' and keyword match' : '' }}.
Select files to add as sources.
</p>
<!-- One row per candidate; each checkbox binds the whole candidate object into scanSelected. -->
<div class="divide-y divide-surface-border border border-surface-border rounded overflow-hidden mb-3">
<label
v-for="c in scanCandidates"
:key="c.path"
class="flex items-start gap-3 px-3 py-2 hover:bg-surface cursor-pointer"
>
<input
type="checkbox"
:value="c"
v-model="scanSelected"
class="mt-0.5 shrink-0"
/>
<div class="min-w-0 flex-1">
<div class="flex items-center gap-2 flex-wrap">
<span class="font-mono text-xs text-accent truncate">{{ c.path }}</span>
<span class="text-text-dim text-xs shrink-0">{{ formatBytes(c.size_bytes) }}</span>
<span class="text-text-dim text-xs shrink-0">{{ formatAge(c.mtime) }}</span>
<!-- Score badge follows the current input text (scanQuery), which may differ from the query the listed results were fetched with. -->
<span
v-if="scanQuery"
class="text-text-dim text-xs shrink-0"
:title="`Relevance score: ${c.score}`"
>score {{ (c.score * 100).toFixed(0) }}%</span>
</div>
</div>
</label>
</div>
<div class="flex items-center gap-3">
<button
:disabled="scanSelected.length === 0 || scanAdding"
@click="addScanSelected"
class="btn-primary text-sm"
>
{{ scanAdding ? 'Adding…' : `Add ${scanSelected.length || ''} selected` }}
</button>
<button @click="scanSelected = []" class="btn-secondary text-sm">Deselect all</button>
<button @click="scanSelected = [...scanCandidates]" class="btn-secondary text-sm">Select all</button>
</div>
</div>
<div v-else-if="scanRan && !scanning" class="text-text-dim text-sm">
No log files found in the scanned directories.
</div>
</div>
<!-- Post-setup Add Source panel (condensed wizard steps 1-2) -->
<div v-else-if="showAddPanel" class="mb-6">
<SetupWizard
@ -184,6 +257,17 @@ interface DbSource {
latest: string | null
}
/** One ranked log-file candidate as returned in `candidates` by GET /api/setup/scan. */
interface ScanCandidate {
// Source kind; the scanner always emits "file".
type: string
// Kebab-case ID derived from the file path.
id: string
// Full path of the file on the scanned host.
path: string
// File name without its directory.
label: string
// File size in bytes.
size_bytes: number
// Modification time in seconds since the Unix epoch.
mtime: number
// Relevance score rounded to three decimals; higher ranks first.
score: number
// The scanner sets this to true for every candidate it returns.
available: boolean
}
const sources = ref<SourceRow[]>([])
const loading = ref(true)
const busy = ref(new Set<string>())
@ -191,6 +275,14 @@ const actionMsg = ref('')
const actionError = ref(false)
const showWizard = ref(false)
const showAddPanel = ref(false)
// --- Filesystem scan panel state ---
// Whether the scan panel is open.
const showScanPanel = ref(false)
// Text of the optional problem-description input.
const scanQuery = ref('')
// True while a scan request is in flight.
const scanning = ref(false)
// True once a scan has completed successfully; gates the "no files found" message.
const scanRan = ref(false)
// Error text from the last failed scan, or '' when there is none.
const scanError = ref('')
// Candidates returned by the last scan.
const scanCandidates = ref<ScanCandidate[]>([])
// Candidates the user has ticked.
const scanSelected = ref<ScanCandidate[]>([])
// True while the selected candidates are being submitted.
const scanAdding = ref(false)
const BASE = import.meta.env.BASE_URL.replace(/\/$/, '')
@ -347,6 +439,82 @@ async function handleUpload(e: Event): Promise<void> {
;(e.target as HTMLInputElement).value = ''
}
/**
 * Flip the scan panel between open and closed. Closing discards the previous
 * results, selection, "ran" flag and error so the panel reopens clean; the
 * query text is left as typed.
 */
function toggleScanPanel(): void {
  const nowOpen = !showScanPanel.value
  showScanPanel.value = nowOpen
  if (nowOpen) return

  scanCandidates.value = []
  scanSelected.value = []
  scanRan.value = false
  scanError.value = ''
}
/**
 * Request ranked log-file candidates from GET /api/setup/scan and store them
 * in `scanCandidates`.
 *
 * Sends `max_results=30` plus the trimmed query when one was typed. On a
 * failed response or a thrown error the message lands in `scanError` and the
 * candidate list stays empty. `scanRan` becomes true only after a successful
 * response.
 */
async function runScan(): Promise<void> {
  // Pressing Enter in the query field bypasses the button's :disabled state,
  // so guard here against overlapping scans.
  if (scanning.value) return
  scanning.value = true
  scanError.value = ''
  scanCandidates.value = []
  scanSelected.value = []
  // Reset so a failed re-scan does not show a stale "no files found" message
  // next to the error.
  scanRan.value = false
  try {
    const params = new URLSearchParams({ max_results: '30' })
    const query = scanQuery.value.trim()
    if (query) params.set('query', query)
    const res = await fetch(`${BASE}/api/setup/scan?${params}`)
    if (!res.ok) {
      // The error body may not be JSON, and `detail` may not be a string.
      const data = await res.json().catch(() => ({}))
      scanError.value = typeof data.detail === 'string' ? data.detail : 'Scan failed'
      return
    }
    const data = await res.json()
    scanCandidates.value = data.candidates ?? []
    scanRan.value = true
  } catch (err) {
    scanError.value = String(err)
  } finally {
    scanning.value = false
  }
}
/**
 * Submit the ticked scan candidates to POST /api/setup/write.
 *
 * On success the scan panel is closed and reset, a confirmation goes to
 * `actionMsg`, and the source list is reloaded. On a failed response or a
 * thrown error the message goes to `actionMsg` with `actionError` set.
 *
 * NOTE(review): the full candidate objects are posted, including the
 * scan-only fields (size_bytes, mtime, score). Whether the endpoint merges
 * them into the existing sources.yaml or replaces the file is not visible
 * here; confirm against the endpoint before relying on either behaviour.
 */
async function addScanSelected(): Promise<void> {
  if (scanSelected.value.length === 0 || scanAdding.value) return
  scanAdding.value = true
  actionMsg.value = ''
  try {
    const res = await fetch(`${BASE}/api/setup/write`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ sources: scanSelected.value }),
    })
    // The error body may not be JSON; fall back to an empty object the same
    // way runScan does.
    const data = await res.json().catch(() => ({}))
    if (res.ok) {
      actionMsg.value = `Added ${scanSelected.value.length} source${scanSelected.value.length === 1 ? '' : 's'} to sources.yaml`
      actionError.value = false
      showScanPanel.value = false
      scanCandidates.value = []
      scanSelected.value = []
      scanRan.value = false
      await loadSources()
    } else {
      actionMsg.value = typeof data.detail === 'string' ? data.detail : 'Failed to add sources'
      actionError.value = true
    }
  } catch (err) {
    // Network failures used to escape as unhandled rejections with no
    // feedback to the user.
    actionMsg.value = String(err)
    actionError.value = true
  } finally {
    scanAdding.value = false
  }
}
/**
 * Render a byte count for display: whole bytes below 1024, otherwise KB or MB
 * (1024-based) with one decimal place. MB is the largest unit used.
 */
function formatBytes(bytes: number): string {
  if (bytes < 1024) return `${bytes} B`
  const kib = bytes / 1024
  return kib < 1024 ? `${kib.toFixed(1)} KB` : `${(kib / 1024).toFixed(1)} MB`
}
/**
 * Describe how long ago `mtime` (seconds since the Unix epoch) was, relative
 * to the current time: "today", "yesterday", "Nd ago" for under 30 days, and
 * "Nmo ago" (30-day months) beyond that.
 */
function formatAge(mtime: number): string {
  const nowSeconds = Date.now() / 1000
  const ageDays = (nowSeconds - mtime) / 86400
  if (ageDays < 2) return ageDays < 1 ? 'today' : 'yesterday'
  return ageDays < 30
    ? `${Math.floor(ageDays)}d ago`
    : `${Math.floor(ageDays / 30)}mo ago`
}
function formatTs(iso: string | null): string {
if (!iso) return '—'
try {