feat(blocklist): extraction scan + candidate CRUD + full test suite

This commit is contained in:
pyr0ball 2026-05-15 21:05:49 -07:00
parent 7d213b8aca
commit 0695c42908
2 changed files with 392 additions and 0 deletions

View file

@ -2,6 +2,11 @@
from __future__ import annotations from __future__ import annotations
import dataclasses import dataclasses
import json
import re
import sqlite3
import uuid
from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
import yaml import yaml
@ -62,3 +67,222 @@ def matches_telemetry(domain: str, rules: list[TelemetryRule]) -> TelemetryRule
if d == rd or d.endswith("." + rd): if d == rd or d.endswith("." + rd):
return rule return rule
return None return None
# ---------------------------------------------------------------------------
# Regex extractors for router log entries
# ---------------------------------------------------------------------------
# Matches dnsmasq query log lines such as
#   "dnsmasq[123]: query[A] samsungads.com from 192.168.1.45"
# capturing the queried name as ``domain`` and the client address as ``src``.
# ``A{1,4}`` accepts one to four "A" characters, which covers the A and AAAA
# record types; ``src`` only matches digits and dots (dotted-decimal form).
_DNSMASQ_RE = re.compile(
    r"query\[A{1,4}\]\s+(?P<domain>\S+)\s+from\s+(?P<src>[\d.]+)"
)
# Matches firewall log lines carrying ``SRC=`` and ``DST=`` fields, such as
#   "kernel: FORWARD SRC=192.168.1.67 DST=52.11.243.144 PROTO=TCP DPT=443"
# capturing them as ``src`` and ``dst``. ``dst`` accepts digits, ASCII
# letters, dots and hyphens; ``src`` accepts digits and dots only.
_IPTABLES_RE = re.compile(
    r"SRC=(?P<src>[\d.]+).*?DST=(?P<dst>[\d.a-zA-Z.-]+)"
)
# Status values accepted by update_candidate_status(); anything else is
# rejected there with a ValueError.
_VALID_STATUSES = {"pending", "approved", "rejected", "pushed", "unblocked"}
# ---------------------------------------------------------------------------
# DB helpers
# ---------------------------------------------------------------------------
def _now_iso() -> str:
    """Return the current time in UTC as an ISO-8601 formatted string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
def _row_to_candidate(row: tuple) -> BlocklistCandidate:
    """Build a ``BlocklistCandidate`` from a positional database row.

    The row is read by index, in the column order produced by
    ``_CANDIDATE_SELECT``. Column 9 (``log_evidence``) holds JSON text and is
    decoded; a NULL or empty value decodes to an empty list.
    """
    evidence_json = row[9] or "[]"
    fields = {
        "id": row[0],
        "domain_or_ip": row[1],
        "source_device_ip": row[2],
        "source_device_name": row[3],
        "first_seen": row[4],
        "last_seen": row[5],
        "hit_count": row[6],
        "status": row[7],
        "pushed_at": row[8],
        "log_evidence": json.loads(evidence_json),
        "matched_rule": row[10],
        "llm_score": row[11],
        "llm_reason": row[12],
    }
    return BlocklistCandidate(**fields)
def _upsert_candidate(
    conn: sqlite3.Connection,
    domain_or_ip: str,
    source_device_ip: str | None,
    source_device_name: str | None,
    matched_rule: str | None,
    entry_id: str,
    now: str,
) -> bool:
    """Insert or update the candidate for a (destination, source device) pair.

    A candidate is identified by ``domain_or_ip`` together with
    ``source_device_ip``. The lookup uses SQL ``IS`` so that a ``None``
    device IP matches rows whose column is NULL.

    * No matching row: a new ``pending`` candidate is inserted with
      ``hit_count`` 1 and ``entry_id`` as its only evidence.
    * Matching row: ``last_seen`` is refreshed. ``hit_count`` is incremented
      and ``entry_id`` appended to the evidence list only when that entry id
      is not already recorded, so re-processing the same log entry does not
      inflate the counter. The evidence list keeps the 10 most recent ids.

    The caller is responsible for committing the transaction.

    Args:
        conn: Open SQLite connection to the database holding
            ``blocklist_candidates``.
        domain_or_ip: Destination domain or IP address.
        source_device_ip: IP of the originating device, or ``None``.
        source_device_name: Device name stored on insert (not changed on
            update).
        matched_rule: Name of the matched telemetry rule stored on insert,
            or ``None``.
        entry_id: Id of the log entry that produced this hit.
        now: Timestamp string written to ``first_seen`` / ``last_seen``.

    Returns:
        ``True`` if a new row was created, ``False`` if an existing row was
        updated.
    """
    max_evidence = 10  # number of most recent log entry ids retained per row

    existing = conn.execute(
        "SELECT id, hit_count, log_evidence FROM blocklist_candidates "
        "WHERE domain_or_ip = ? AND source_device_ip IS ?",
        (domain_or_ip, source_device_ip),
    ).fetchone()

    if existing is None:
        conn.execute(
            """INSERT INTO blocklist_candidates
            (id, domain_or_ip, source_device_ip, source_device_name,
            first_seen, last_seen, hit_count, status, pushed_at, log_evidence, matched_rule)
            VALUES (?, ?, ?, ?, ?, ?, 1, 'pending', NULL, ?, ?)""",
            (
                str(uuid.uuid4()), domain_or_ip, source_device_ip, source_device_name,
                now, now, json.dumps([entry_id]), matched_rule,
            ),
        )
        return True

    candidate_id, stored_hits, evidence_json = existing
    evidence = json.loads(evidence_json or "[]")
    hits = stored_hits or 0  # tolerate a NULL counter on pre-existing rows
    if entry_id not in evidence:
        # Only a log entry we have not recorded yet counts as a new hit.
        evidence.append(entry_id)
        hits += 1
    evidence = evidence[-max_evidence:]
    conn.execute(
        "UPDATE blocklist_candidates SET last_seen=?, hit_count=?, log_evidence=? WHERE id=?",
        (now, hits, json.dumps(evidence), candidate_id),
    )
    return False
# ---------------------------------------------------------------------------
# Extraction scan
# ---------------------------------------------------------------------------
def run_scan(
    db_path: Path,
    router_source_ids: list[str],
    device_map: dict[str, str],
    telemetry_rules: list[TelemetryRule],
) -> int:
    """Scan router log entries and upsert a blocklist candidate per match.

    Every ``log_entries`` row whose ``source_id`` is in ``router_source_ids``
    is tested against the dnsmasq pattern first and the iptables pattern
    second. Entries that match neither, or whose source IP is not a key of
    ``device_map``, are skipped. All upserts are committed together once the
    loop has finished.

    Args:
        db_path: Path of the SQLite database.
        router_source_ids: ``source_id`` values to read log entries from.
        device_map: Mapping of source IP to device name.
        telemetry_rules: Rules passed to ``matches_telemetry`` for each
            extracted destination.

    Returns:
        The number of log entries that resulted in an upsert; 0 when either
        ``router_source_ids`` or ``device_map`` is empty.
    """
    if not (router_source_ids and device_map):
        return 0

    marks = ",".join(["?"] * len(router_source_ids))
    scan_time = _now_iso()
    processed = 0

    conn = sqlite3.connect(str(db_path))
    try:
        entries = conn.execute(
            f"SELECT id, text FROM log_entries WHERE source_id IN ({marks})",
            router_source_ids,
        ).fetchall()

        for entry_id, text in entries:
            dns_hit = _DNSMASQ_RE.search(text)
            if dns_hit is not None:
                src_ip, dst = dns_hit.group("src"), dns_hit.group("domain")
            else:
                fw_hit = _IPTABLES_RE.search(text)
                if fw_hit is None:
                    continue  # neither log format recognised
                src_ip, dst = fw_hit.group("src"), fw_hit.group("dst")

            if src_ip not in device_map:
                continue  # traffic from a device we are not tracking

            rule = matches_telemetry(dst, telemetry_rules) if dst else None
            _upsert_candidate(
                conn,
                dst or "unknown",
                src_ip,
                device_map[src_ip],
                rule.name if rule else None,
                entry_id,
                scan_time,
            )
            processed += 1

        conn.commit()
    finally:
        conn.close()
    return processed
# ---------------------------------------------------------------------------
# Candidate CRUD
# ---------------------------------------------------------------------------
# Shared SELECT prefix for reading candidates. The column order here must stay
# in step with the positional indexes used by _row_to_candidate(); callers
# append their own WHERE / ORDER BY clauses.
_CANDIDATE_SELECT = (
    "SELECT id,domain_or_ip,source_device_ip,source_device_name,"
    "first_seen,last_seen,hit_count,status,pushed_at,log_evidence,"
    "matched_rule,llm_score,llm_reason FROM blocklist_candidates"
)
def list_candidates(
    db_path: Path,
    status: str | None = None,
    device_ip: str | None = None,
) -> list[BlocklistCandidate]:
    """Return candidates ordered by ``last_seen``, most recent first.

    Args:
        db_path: Path of the SQLite database.
        status: When given and not ``"all"``, only candidates with exactly
            this status are returned.
        device_ip: When given, only candidates whose ``source_device_ip``
            equals this value are returned.
    """
    filters: list[str] = []
    params: list = []
    if status and status != "all":
        filters.append(" AND status = ?")
        params.append(status)
    if device_ip:
        filters.append(" AND source_device_ip = ?")
        params.append(device_ip)
    query = f"{_CANDIDATE_SELECT} WHERE 1=1" + "".join(filters) + " ORDER BY last_seen DESC"

    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(query, params).fetchall()
    finally:
        conn.close()
    return list(map(_row_to_candidate, rows))
def _get_candidate(conn: sqlite3.Connection, candidate_id: str) -> BlocklistCandidate:
    """Fetch one candidate by id over an already-open connection.

    Raises:
        KeyError: If no candidate with ``candidate_id`` exists.
    """
    cursor = conn.execute(f"{_CANDIDATE_SELECT} WHERE id=?", (candidate_id,))
    found = cursor.fetchone()
    if found is not None:
        return _row_to_candidate(found)
    raise KeyError(f"Candidate {candidate_id!r} not found")
def update_candidate_status(db_path: Path, candidate_id: str, new_status: str) -> BlocklistCandidate:
    """Set a candidate's status and return the refreshed candidate.

    Only the ``status`` column is written; in particular ``pushed_at`` is not
    touched here, even when ``new_status`` is ``"pushed"``.

    Raises:
        ValueError: If ``new_status`` is not one of ``_VALID_STATUSES``.
        KeyError: If no candidate with ``candidate_id`` exists.
    """
    is_known_status = new_status in _VALID_STATUSES
    if not is_known_status:
        raise ValueError(f"Invalid status {new_status!r}. Must be one of {_VALID_STATUSES}")

    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute(
            "UPDATE blocklist_candidates SET status=? WHERE id=?",
            (new_status, candidate_id),
        )
        conn.commit()
        refreshed = _get_candidate(conn, candidate_id)
    finally:
        conn.close()
    return refreshed
def mark_pushed(db_path: Path, candidate_id: str) -> BlocklistCandidate:
    """Mark a candidate as ``pushed`` and stamp ``pushed_at`` with the current UTC time.

    Returns the refreshed candidate.

    Raises:
        KeyError: If no candidate with ``candidate_id`` exists.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        pushed_at = _now_iso()
        conn.execute(
            "UPDATE blocklist_candidates SET status='pushed', pushed_at=? WHERE id=?",
            (pushed_at, candidate_id),
        )
        conn.commit()
        refreshed = _get_candidate(conn, candidate_id)
    finally:
        conn.close()
    return refreshed
def mark_unblocked(db_path: Path, candidate_id: str) -> BlocklistCandidate:
    """Mark a candidate as ``unblocked`` and return the refreshed candidate.

    Raises:
        KeyError: If no candidate with ``candidate_id`` exists.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute(
            "UPDATE blocklist_candidates SET status='unblocked' WHERE id=?",
            (candidate_id,),
        )
        conn.commit()
        refreshed = _get_candidate(conn, candidate_id)
    finally:
        conn.close()
    return refreshed

View file

@ -84,3 +84,171 @@ class TestTelemetry:
result = matches_telemetry("api.xbcs.net", rules) result = matches_telemetry("api.xbcs.net", rules)
assert result is not None assert result is not None
assert result.category == "belkin" assert result.category == "belkin"
class TestExtraction:
    """Exercise ``run_scan`` against a temporary database seeded with router log lines."""

    # Fixed ingest timestamp used for every seeded log entry.
    _INGEST_TIME = "2026-05-14T00:00:00+00:00"

    @pytest.fixture
    def db(self, tmp_path):
        """Return the path of a fresh database with the application schema applied."""
        from app.ingest.pipeline import ensure_schema
        p = tmp_path / "test.db"
        ensure_schema(p)
        return p

    @pytest.fixture
    def rules(self):
        """Load the telemetry rules shipped in ``patterns/telemetry.yaml``."""
        from app.services.blocklist import load_telemetry_rules
        return load_telemetry_rules(
            Path(__file__).parent.parent / "patterns" / "telemetry.yaml"
        )

    @classmethod
    def _insert_log_entries(cls, db, entries):
        """Insert ``(id, source_id, sequence, text)`` tuples into ``log_entries``.

        Values are bound as parameters and the connection is always closed,
        even if the insert fails.
        """
        import sqlite3
        conn = sqlite3.connect(str(db))
        try:
            conn.executemany(
                "INSERT INTO log_entries (id, source_id, sequence, ingest_time, text) "
                "VALUES (?, ?, ?, ?, ?)",
                [
                    (entry_id, source_id, sequence, cls._INGEST_TIME, text)
                    for entry_id, source_id, sequence, text in entries
                ],
            )
            conn.commit()
        finally:
            conn.close()

    @staticmethod
    def _fetch_candidates(db, columns):
        """Return all ``blocklist_candidates`` rows, selecting ``columns``."""
        import sqlite3
        conn = sqlite3.connect(str(db))
        try:
            return conn.execute(f"SELECT {columns} FROM blocklist_candidates").fetchall()
        finally:
            conn.close()

    def test_dnsmasq_entry_extracted(self, db, rules):
        """A dnsmasq query line from a known device becomes a pending candidate."""
        from app.services.blocklist import run_scan
        self._insert_log_entries(db, [
            ("e1", "router:syslog", 1,
             "dnsmasq[123]: query[A] samsungads.com from 192.168.1.45"),
        ])
        count = run_scan(
            db,
            router_source_ids=["router:syslog"],
            device_map={"192.168.1.45": "Samsung Projector"},
            telemetry_rules=rules,
        )
        assert count >= 1
        row = self._fetch_candidates(
            db, "domain_or_ip, source_device_name, matched_rule, status"
        )[0]
        assert row[0] == "samsungads.com"
        assert row[1] == "Samsung Projector"
        assert row[2] == "samsung_ads"
        assert row[3] == "pending"

    def test_iptables_entry_extracted(self, db, rules):
        """A firewall SRC/DST line from a known device records the destination IP."""
        from app.services.blocklist import run_scan
        self._insert_log_entries(db, [
            ("e2", "router:fw", 1,
             "kernel: FORWARD SRC=192.168.1.67 DST=52.11.243.144 PROTO=TCP DPT=443"),
        ])
        count = run_scan(
            db,
            router_source_ids=["router:fw"],
            device_map={"192.168.1.67": "Belkin Switch 1"},
            telemetry_rules=rules,
        )
        assert count >= 1
        row = self._fetch_candidates(db, "domain_or_ip, source_device_name")[0]
        assert row[0] == "52.11.243.144"
        assert row[1] == "Belkin Switch 1"

    def test_unknown_device_skipped(self, db, rules):
        """Entries whose source IP is absent from ``device_map`` are ignored."""
        from app.services.blocklist import run_scan
        self._insert_log_entries(db, [
            ("e3", "router:syslog", 1,
             "dnsmasq[123]: query[A] samsungads.com from 10.0.0.99"),
        ])
        count = run_scan(
            db,
            router_source_ids=["router:syslog"],
            device_map={"192.168.1.45": "Samsung Projector"},
            telemetry_rules=rules,
        )
        assert count == 0
        # Nothing should have been written for the untracked device either.
        assert self._fetch_candidates(db, "id") == []

    def test_dedup_upsert_increments_hit_count(self, db, rules):
        """Three distinct entries for the same pair collapse into one row with three hits."""
        from app.services.blocklist import run_scan
        self._insert_log_entries(db, [
            (f"e{i}", "router:syslog", i,
             "dnsmasq[123]: query[A] samsungads.com from 192.168.1.45")
            for i in range(3)
        ])
        run_scan(db, ["router:syslog"], {"192.168.1.45": "Projector"}, rules)
        rows = self._fetch_candidates(db, "hit_count")
        assert len(rows) == 1  # one row, not three
        assert rows[0][0] == 3
class TestCandidateManagement:
    """Cover listing and status transitions for a single seeded candidate."""

    @pytest.fixture
    def db_with_candidate(self, tmp_path):
        """Return ``(db_path, candidate_id)`` for a database holding one candidate.

        The seeded row sets only id, domain and the two timestamps; every
        other column is left to whatever the schema supplies.
        """
        from app.ingest.pipeline import ensure_schema
        import sqlite3
        import uuid

        db_path = tmp_path / "test.db"
        ensure_schema(db_path)
        candidate_id = str(uuid.uuid4())
        connection = sqlite3.connect(str(db_path))
        connection.execute(
            """INSERT INTO blocklist_candidates
            (id, domain_or_ip, first_seen, last_seen)
            VALUES (?, 'samsungads.com', '2026-05-14T00:00:00+00:00', '2026-05-14T00:00:00+00:00')""",
            (candidate_id,),
        )
        connection.commit()
        connection.close()
        return db_path, candidate_id

    def test_list_candidates_returns_all(self, db_with_candidate):
        """Listing without filters returns the seeded candidate."""
        from app.services.blocklist import list_candidates

        db_path, _unused_id = db_with_candidate
        found = list_candidates(db_path)
        assert len(found) == 1
        assert found[0].domain_or_ip == "samsungads.com"

    def test_list_candidates_filter_by_status(self, db_with_candidate):
        """The status filter includes matching rows and excludes the rest."""
        from app.services.blocklist import list_candidates

        db_path, _unused_id = db_with_candidate
        # The seeded row sets no status, so this relies on the schema
        # supplying 'pending' for new rows.
        pending = list_candidates(db_path, status="pending")
        pushed = list_candidates(db_path, status="pushed")
        assert len(pending) == 1
        assert len(pushed) == 0

    def test_update_status_to_approved(self, db_with_candidate):
        """An approved status is returned and is visible to a later listing."""
        from app.services.blocklist import update_candidate_status, list_candidates

        db_path, candidate_id = db_with_candidate
        updated = update_candidate_status(db_path, candidate_id, "approved")
        assert updated.status == "approved"
        approved = list_candidates(db_path, status="approved")
        assert approved[0].status == "approved"

    def test_update_status_invalid_raises(self, db_with_candidate):
        """An unknown status value is rejected with ``ValueError``."""
        from app.services.blocklist import update_candidate_status

        db_path, candidate_id = db_with_candidate
        with pytest.raises(ValueError, match="Invalid status"):
            update_candidate_status(db_path, candidate_id, "hacked")

    def test_mark_pushed_sets_status_and_timestamp(self, db_with_candidate):
        """Pushing an approved candidate sets both status and ``pushed_at``."""
        from app.services.blocklist import update_candidate_status, mark_pushed

        db_path, candidate_id = db_with_candidate
        update_candidate_status(db_path, candidate_id, "approved")
        pushed = mark_pushed(db_path, candidate_id)
        assert pushed.status == "pushed"
        assert pushed.pushed_at is not None

    def test_mark_unblocked(self, db_with_candidate):
        """A pushed candidate can be moved to ``unblocked``."""
        from app.services.blocklist import update_candidate_status, mark_pushed, mark_unblocked

        db_path, candidate_id = db_with_candidate
        update_candidate_status(db_path, candidate_id, "approved")
        mark_pushed(db_path, candidate_id)
        unblocked = mark_unblocked(db_path, candidate_id)
        assert unblocked.status == "unblocked"