feat(blocklist): extraction scan + candidate CRUD + full test suite
This commit is contained in:
parent
7d213b8aca
commit
0695c42908
2 changed files with 392 additions and 0 deletions
|
|
@ -2,6 +2,11 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
|
@ -62,3 +67,222 @@ def matches_telemetry(domain: str, rules: list[TelemetryRule]) -> TelemetryRule
|
|||
if d == rd or d.endswith("." + rd):
|
||||
return rule
|
||||
return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Regex extractors for router log entries
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_DNSMASQ_RE = re.compile(
|
||||
r"query\[A{1,4}\]\s+(?P<domain>\S+)\s+from\s+(?P<src>[\d.]+)"
|
||||
)
|
||||
_IPTABLES_RE = re.compile(
|
||||
r"SRC=(?P<src>[\d.]+).*?DST=(?P<dst>[\d.a-zA-Z.-]+)"
|
||||
)
|
||||
|
||||
_VALID_STATUSES = {"pending", "approved", "rejected", "pushed", "unblocked"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DB helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _row_to_candidate(row: tuple) -> BlocklistCandidate:
|
||||
return BlocklistCandidate(
|
||||
id=row[0],
|
||||
domain_or_ip=row[1],
|
||||
source_device_ip=row[2],
|
||||
source_device_name=row[3],
|
||||
first_seen=row[4],
|
||||
last_seen=row[5],
|
||||
hit_count=row[6],
|
||||
status=row[7],
|
||||
pushed_at=row[8],
|
||||
log_evidence=json.loads(row[9] or "[]"),
|
||||
matched_rule=row[10],
|
||||
llm_score=row[11],
|
||||
llm_reason=row[12],
|
||||
)
|
||||
|
||||
|
||||
def _upsert_candidate(
|
||||
conn: sqlite3.Connection,
|
||||
domain_or_ip: str,
|
||||
source_device_ip: str | None,
|
||||
source_device_name: str | None,
|
||||
matched_rule: str | None,
|
||||
entry_id: str,
|
||||
now: str,
|
||||
) -> bool:
|
||||
"""Insert or update a candidate. Returns True if a new row was created."""
|
||||
row = conn.execute(
|
||||
"SELECT id, hit_count, log_evidence FROM blocklist_candidates "
|
||||
"WHERE domain_or_ip = ? AND source_device_ip IS ?",
|
||||
(domain_or_ip, source_device_ip),
|
||||
).fetchone()
|
||||
|
||||
if row is None:
|
||||
conn.execute(
|
||||
"""INSERT INTO blocklist_candidates
|
||||
(id, domain_or_ip, source_device_ip, source_device_name,
|
||||
first_seen, last_seen, hit_count, status, pushed_at, log_evidence, matched_rule)
|
||||
VALUES (?, ?, ?, ?, ?, ?, 1, 'pending', NULL, ?, ?)""",
|
||||
(
|
||||
str(uuid.uuid4()), domain_or_ip, source_device_ip, source_device_name,
|
||||
now, now, json.dumps([entry_id]), matched_rule,
|
||||
),
|
||||
)
|
||||
return True
|
||||
|
||||
existing_id, hit_count, existing_evidence = row
|
||||
evidence = json.loads(existing_evidence or "[]")
|
||||
if entry_id not in evidence:
|
||||
evidence.append(entry_id)
|
||||
evidence = evidence[-10:] # cap at 10
|
||||
conn.execute(
|
||||
"UPDATE blocklist_candidates SET last_seen=?, hit_count=?, log_evidence=? WHERE id=?",
|
||||
(now, hit_count + 1, json.dumps(evidence), existing_id),
|
||||
)
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Extraction scan
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_scan(
|
||||
db_path: Path,
|
||||
router_source_ids: list[str],
|
||||
device_map: dict[str, str],
|
||||
telemetry_rules: list[TelemetryRule],
|
||||
) -> int:
|
||||
"""Scan log_entries from router sources, upsert blocklist candidates.
|
||||
|
||||
Only entries whose source IP is in device_map are recorded.
|
||||
Returns the total number of rows created or updated.
|
||||
"""
|
||||
if not router_source_ids or not device_map:
|
||||
return 0
|
||||
|
||||
placeholders = ",".join("?" for _ in router_source_ids)
|
||||
now = _now_iso()
|
||||
count = 0
|
||||
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
try:
|
||||
rows = conn.execute(
|
||||
f"SELECT id, text FROM log_entries WHERE source_id IN ({placeholders})",
|
||||
router_source_ids,
|
||||
).fetchall()
|
||||
|
||||
for entry_id, text in rows:
|
||||
src_ip: str | None = None
|
||||
dst: str | None = None
|
||||
|
||||
m = _DNSMASQ_RE.search(text)
|
||||
if m:
|
||||
src_ip = m.group("src")
|
||||
dst = m.group("domain")
|
||||
else:
|
||||
m = _IPTABLES_RE.search(text)
|
||||
if m:
|
||||
src_ip = m.group("src")
|
||||
dst = m.group("dst")
|
||||
|
||||
if src_ip is None or src_ip not in device_map:
|
||||
continue
|
||||
|
||||
device_name = device_map[src_ip]
|
||||
rule = matches_telemetry(dst, telemetry_rules) if dst else None
|
||||
matched_rule_name = rule.name if rule else None
|
||||
|
||||
_upsert_candidate(conn, dst or "unknown", src_ip, device_name, matched_rule_name, entry_id, now)
|
||||
count += 1
|
||||
|
||||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
return count
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Candidate CRUD
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_CANDIDATE_SELECT = (
|
||||
"SELECT id,domain_or_ip,source_device_ip,source_device_name,"
|
||||
"first_seen,last_seen,hit_count,status,pushed_at,log_evidence,"
|
||||
"matched_rule,llm_score,llm_reason FROM blocklist_candidates"
|
||||
)
|
||||
|
||||
|
||||
def list_candidates(
|
||||
db_path: Path,
|
||||
status: str | None = None,
|
||||
device_ip: str | None = None,
|
||||
) -> list[BlocklistCandidate]:
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
try:
|
||||
query = f"{_CANDIDATE_SELECT} WHERE 1=1"
|
||||
params: list = []
|
||||
if status and status != "all":
|
||||
query += " AND status = ?"
|
||||
params.append(status)
|
||||
if device_ip:
|
||||
query += " AND source_device_ip = ?"
|
||||
params.append(device_ip)
|
||||
query += " ORDER BY last_seen DESC"
|
||||
rows = conn.execute(query, params).fetchall()
|
||||
finally:
|
||||
conn.close()
|
||||
return [_row_to_candidate(r) for r in rows]
|
||||
|
||||
|
||||
def _get_candidate(conn: sqlite3.Connection, candidate_id: str) -> BlocklistCandidate:
|
||||
row = conn.execute(
|
||||
f"{_CANDIDATE_SELECT} WHERE id=?",
|
||||
(candidate_id,),
|
||||
).fetchone()
|
||||
if row is None:
|
||||
raise KeyError(f"Candidate {candidate_id!r} not found")
|
||||
return _row_to_candidate(row)
|
||||
|
||||
|
||||
def update_candidate_status(db_path: Path, candidate_id: str, new_status: str) -> BlocklistCandidate:
|
||||
if new_status not in _VALID_STATUSES:
|
||||
raise ValueError(f"Invalid status {new_status!r}. Must be one of {_VALID_STATUSES}")
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
try:
|
||||
conn.execute("UPDATE blocklist_candidates SET status=? WHERE id=?", (new_status, candidate_id))
|
||||
conn.commit()
|
||||
return _get_candidate(conn, candidate_id)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def mark_pushed(db_path: Path, candidate_id: str) -> BlocklistCandidate:
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
try:
|
||||
conn.execute(
|
||||
"UPDATE blocklist_candidates SET status='pushed', pushed_at=? WHERE id=?",
|
||||
(_now_iso(), candidate_id),
|
||||
)
|
||||
conn.commit()
|
||||
return _get_candidate(conn, candidate_id)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
|
||||
def mark_unblocked(db_path: Path, candidate_id: str) -> BlocklistCandidate:
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
try:
|
||||
conn.execute("UPDATE blocklist_candidates SET status='unblocked' WHERE id=?", (candidate_id,))
|
||||
conn.commit()
|
||||
return _get_candidate(conn, candidate_id)
|
||||
finally:
|
||||
conn.close()
|
||||
|
|
|
|||
|
|
@ -84,3 +84,171 @@ class TestTelemetry:
|
|||
result = matches_telemetry("api.xbcs.net", rules)
|
||||
assert result is not None
|
||||
assert result.category == "belkin"
|
||||
|
||||
|
||||
class TestExtraction:
|
||||
@pytest.fixture
|
||||
def db(self, tmp_path):
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
p = tmp_path / "test.db"
|
||||
ensure_schema(p)
|
||||
return p
|
||||
|
||||
@pytest.fixture
|
||||
def rules(self):
|
||||
from app.services.blocklist import load_telemetry_rules
|
||||
return load_telemetry_rules(
|
||||
Path(__file__).parent.parent / "patterns" / "telemetry.yaml"
|
||||
)
|
||||
|
||||
def test_dnsmasq_entry_extracted(self, db, rules):
|
||||
import sqlite3
|
||||
from app.services.blocklist import run_scan
|
||||
conn = sqlite3.connect(str(db))
|
||||
conn.execute(
|
||||
"""INSERT INTO log_entries (id, source_id, sequence, ingest_time, text)
|
||||
VALUES ('e1', 'router:syslog', 1, '2026-05-14T00:00:00+00:00',
|
||||
'dnsmasq[123]: query[A] samsungads.com from 192.168.1.45')"""
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
count = run_scan(
|
||||
db,
|
||||
router_source_ids=["router:syslog"],
|
||||
device_map={"192.168.1.45": "Samsung Projector"},
|
||||
telemetry_rules=rules,
|
||||
)
|
||||
assert count >= 1
|
||||
conn = sqlite3.connect(str(db))
|
||||
row = conn.execute(
|
||||
"SELECT domain_or_ip, source_device_name, matched_rule, status FROM blocklist_candidates"
|
||||
).fetchone()
|
||||
conn.close()
|
||||
assert row[0] == "samsungads.com"
|
||||
assert row[1] == "Samsung Projector"
|
||||
assert row[2] == "samsung_ads"
|
||||
assert row[3] == "pending"
|
||||
|
||||
def test_iptables_entry_extracted(self, db, rules):
|
||||
import sqlite3
|
||||
from app.services.blocklist import run_scan
|
||||
conn = sqlite3.connect(str(db))
|
||||
conn.execute(
|
||||
"""INSERT INTO log_entries (id, source_id, sequence, ingest_time, text)
|
||||
VALUES ('e2', 'router:fw', 1, '2026-05-14T00:00:00+00:00',
|
||||
'kernel: FORWARD SRC=192.168.1.67 DST=52.11.243.144 PROTO=TCP DPT=443')"""
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
count = run_scan(
|
||||
db,
|
||||
router_source_ids=["router:fw"],
|
||||
device_map={"192.168.1.67": "Belkin Switch 1"},
|
||||
telemetry_rules=rules,
|
||||
)
|
||||
assert count >= 1
|
||||
conn = sqlite3.connect(str(db))
|
||||
row = conn.execute("SELECT domain_or_ip, source_device_name FROM blocklist_candidates").fetchone()
|
||||
conn.close()
|
||||
assert row[0] == "52.11.243.144"
|
||||
assert row[1] == "Belkin Switch 1"
|
||||
|
||||
def test_unknown_device_skipped(self, db, rules):
|
||||
import sqlite3
|
||||
from app.services.blocklist import run_scan
|
||||
conn = sqlite3.connect(str(db))
|
||||
conn.execute(
|
||||
"""INSERT INTO log_entries (id, source_id, sequence, ingest_time, text)
|
||||
VALUES ('e3', 'router:syslog', 1, '2026-05-14T00:00:00+00:00',
|
||||
'dnsmasq[123]: query[A] samsungads.com from 10.0.0.99')"""
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
count = run_scan(
|
||||
db,
|
||||
router_source_ids=["router:syslog"],
|
||||
device_map={"192.168.1.45": "Samsung Projector"},
|
||||
telemetry_rules=rules,
|
||||
)
|
||||
assert count == 0
|
||||
|
||||
def test_dedup_upsert_increments_hit_count(self, db, rules):
|
||||
import sqlite3
|
||||
from app.services.blocklist import run_scan
|
||||
conn = sqlite3.connect(str(db))
|
||||
for i in range(3):
|
||||
conn.execute(
|
||||
f"""INSERT INTO log_entries (id, source_id, sequence, ingest_time, text)
|
||||
VALUES ('e{i}', 'router:syslog', {i}, '2026-05-14T00:00:00+00:00',
|
||||
'dnsmasq[123]: query[A] samsungads.com from 192.168.1.45')"""
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
run_scan(db, ["router:syslog"], {"192.168.1.45": "Projector"}, rules)
|
||||
conn = sqlite3.connect(str(db))
|
||||
rows = conn.execute("SELECT hit_count FROM blocklist_candidates").fetchall()
|
||||
conn.close()
|
||||
assert len(rows) == 1 # one row, not three
|
||||
assert rows[0][0] == 3
|
||||
|
||||
|
||||
class TestCandidateManagement:
|
||||
@pytest.fixture
|
||||
def db_with_candidate(self, tmp_path):
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
import sqlite3, uuid
|
||||
db = tmp_path / "test.db"
|
||||
ensure_schema(db)
|
||||
conn = sqlite3.connect(str(db))
|
||||
cid = str(uuid.uuid4())
|
||||
conn.execute(
|
||||
"""INSERT INTO blocklist_candidates
|
||||
(id, domain_or_ip, first_seen, last_seen)
|
||||
VALUES (?, 'samsungads.com', '2026-05-14T00:00:00+00:00', '2026-05-14T00:00:00+00:00')""",
|
||||
(cid,),
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
return db, cid
|
||||
|
||||
def test_list_candidates_returns_all(self, db_with_candidate):
|
||||
from app.services.blocklist import list_candidates
|
||||
db, _ = db_with_candidate
|
||||
results = list_candidates(db)
|
||||
assert len(results) == 1
|
||||
assert results[0].domain_or_ip == "samsungads.com"
|
||||
|
||||
def test_list_candidates_filter_by_status(self, db_with_candidate):
|
||||
from app.services.blocklist import list_candidates
|
||||
db, _ = db_with_candidate
|
||||
assert len(list_candidates(db, status="pending")) == 1
|
||||
assert len(list_candidates(db, status="pushed")) == 0
|
||||
|
||||
def test_update_status_to_approved(self, db_with_candidate):
|
||||
from app.services.blocklist import update_candidate_status, list_candidates
|
||||
db, cid = db_with_candidate
|
||||
candidate = update_candidate_status(db, cid, "approved")
|
||||
assert candidate.status == "approved"
|
||||
assert list_candidates(db, status="approved")[0].status == "approved"
|
||||
|
||||
def test_update_status_invalid_raises(self, db_with_candidate):
|
||||
from app.services.blocklist import update_candidate_status
|
||||
db, cid = db_with_candidate
|
||||
with pytest.raises(ValueError, match="Invalid status"):
|
||||
update_candidate_status(db, cid, "hacked")
|
||||
|
||||
def test_mark_pushed_sets_status_and_timestamp(self, db_with_candidate):
|
||||
from app.services.blocklist import update_candidate_status, mark_pushed
|
||||
db, cid = db_with_candidate
|
||||
update_candidate_status(db, cid, "approved")
|
||||
candidate = mark_pushed(db, cid)
|
||||
assert candidate.status == "pushed"
|
||||
assert candidate.pushed_at is not None
|
||||
|
||||
def test_mark_unblocked(self, db_with_candidate):
|
||||
from app.services.blocklist import update_candidate_status, mark_pushed, mark_unblocked
|
||||
db, cid = db_with_candidate
|
||||
update_candidate_status(db, cid, "approved")
|
||||
mark_pushed(db, cid)
|
||||
candidate = mark_unblocked(db, cid)
|
||||
assert candidate.status == "unblocked"
|
||||
|
|
|
|||
Loading…
Reference in a new issue