feat(blocklist): extraction scan + candidate CRUD + full test suite
This commit is contained in:
parent
7d213b8aca
commit
0695c42908
2 changed files with 392 additions and 0 deletions
|
|
@ -2,6 +2,11 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import dataclasses
|
import dataclasses
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
@ -62,3 +67,222 @@ def matches_telemetry(domain: str, rules: list[TelemetryRule]) -> TelemetryRule
|
||||||
if d == rd or d.endswith("." + rd):
|
if d == rd or d.endswith("." + rd):
|
||||||
return rule
|
return rule
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Regex extractors for router log entries
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_DNSMASQ_RE = re.compile(
|
||||||
|
r"query\[A{1,4}\]\s+(?P<domain>\S+)\s+from\s+(?P<src>[\d.]+)"
|
||||||
|
)
|
||||||
|
_IPTABLES_RE = re.compile(
|
||||||
|
r"SRC=(?P<src>[\d.]+).*?DST=(?P<dst>[\d.a-zA-Z.-]+)"
|
||||||
|
)
|
||||||
|
|
||||||
|
_VALID_STATUSES = {"pending", "approved", "rejected", "pushed", "unblocked"}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# DB helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def _now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def _row_to_candidate(row: tuple) -> BlocklistCandidate:
|
||||||
|
return BlocklistCandidate(
|
||||||
|
id=row[0],
|
||||||
|
domain_or_ip=row[1],
|
||||||
|
source_device_ip=row[2],
|
||||||
|
source_device_name=row[3],
|
||||||
|
first_seen=row[4],
|
||||||
|
last_seen=row[5],
|
||||||
|
hit_count=row[6],
|
||||||
|
status=row[7],
|
||||||
|
pushed_at=row[8],
|
||||||
|
log_evidence=json.loads(row[9] or "[]"),
|
||||||
|
matched_rule=row[10],
|
||||||
|
llm_score=row[11],
|
||||||
|
llm_reason=row[12],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _upsert_candidate(
|
||||||
|
conn: sqlite3.Connection,
|
||||||
|
domain_or_ip: str,
|
||||||
|
source_device_ip: str | None,
|
||||||
|
source_device_name: str | None,
|
||||||
|
matched_rule: str | None,
|
||||||
|
entry_id: str,
|
||||||
|
now: str,
|
||||||
|
) -> bool:
|
||||||
|
"""Insert or update a candidate. Returns True if a new row was created."""
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT id, hit_count, log_evidence FROM blocklist_candidates "
|
||||||
|
"WHERE domain_or_ip = ? AND source_device_ip IS ?",
|
||||||
|
(domain_or_ip, source_device_ip),
|
||||||
|
).fetchone()
|
||||||
|
|
||||||
|
if row is None:
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT INTO blocklist_candidates
|
||||||
|
(id, domain_or_ip, source_device_ip, source_device_name,
|
||||||
|
first_seen, last_seen, hit_count, status, pushed_at, log_evidence, matched_rule)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, 1, 'pending', NULL, ?, ?)""",
|
||||||
|
(
|
||||||
|
str(uuid.uuid4()), domain_or_ip, source_device_ip, source_device_name,
|
||||||
|
now, now, json.dumps([entry_id]), matched_rule,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return True
|
||||||
|
|
||||||
|
existing_id, hit_count, existing_evidence = row
|
||||||
|
evidence = json.loads(existing_evidence or "[]")
|
||||||
|
if entry_id not in evidence:
|
||||||
|
evidence.append(entry_id)
|
||||||
|
evidence = evidence[-10:] # cap at 10
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE blocklist_candidates SET last_seen=?, hit_count=?, log_evidence=? WHERE id=?",
|
||||||
|
(now, hit_count + 1, json.dumps(evidence), existing_id),
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Extraction scan
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def run_scan(
|
||||||
|
db_path: Path,
|
||||||
|
router_source_ids: list[str],
|
||||||
|
device_map: dict[str, str],
|
||||||
|
telemetry_rules: list[TelemetryRule],
|
||||||
|
) -> int:
|
||||||
|
"""Scan log_entries from router sources, upsert blocklist candidates.
|
||||||
|
|
||||||
|
Only entries whose source IP is in device_map are recorded.
|
||||||
|
Returns the total number of rows created or updated.
|
||||||
|
"""
|
||||||
|
if not router_source_ids or not device_map:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
placeholders = ",".join("?" for _ in router_source_ids)
|
||||||
|
now = _now_iso()
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
try:
|
||||||
|
rows = conn.execute(
|
||||||
|
f"SELECT id, text FROM log_entries WHERE source_id IN ({placeholders})",
|
||||||
|
router_source_ids,
|
||||||
|
).fetchall()
|
||||||
|
|
||||||
|
for entry_id, text in rows:
|
||||||
|
src_ip: str | None = None
|
||||||
|
dst: str | None = None
|
||||||
|
|
||||||
|
m = _DNSMASQ_RE.search(text)
|
||||||
|
if m:
|
||||||
|
src_ip = m.group("src")
|
||||||
|
dst = m.group("domain")
|
||||||
|
else:
|
||||||
|
m = _IPTABLES_RE.search(text)
|
||||||
|
if m:
|
||||||
|
src_ip = m.group("src")
|
||||||
|
dst = m.group("dst")
|
||||||
|
|
||||||
|
if src_ip is None or src_ip not in device_map:
|
||||||
|
continue
|
||||||
|
|
||||||
|
device_name = device_map[src_ip]
|
||||||
|
rule = matches_telemetry(dst, telemetry_rules) if dst else None
|
||||||
|
matched_rule_name = rule.name if rule else None
|
||||||
|
|
||||||
|
_upsert_candidate(conn, dst or "unknown", src_ip, device_name, matched_rule_name, entry_id, now)
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
return count
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Candidate CRUD
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_CANDIDATE_SELECT = (
|
||||||
|
"SELECT id,domain_or_ip,source_device_ip,source_device_name,"
|
||||||
|
"first_seen,last_seen,hit_count,status,pushed_at,log_evidence,"
|
||||||
|
"matched_rule,llm_score,llm_reason FROM blocklist_candidates"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def list_candidates(
|
||||||
|
db_path: Path,
|
||||||
|
status: str | None = None,
|
||||||
|
device_ip: str | None = None,
|
||||||
|
) -> list[BlocklistCandidate]:
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
try:
|
||||||
|
query = f"{_CANDIDATE_SELECT} WHERE 1=1"
|
||||||
|
params: list = []
|
||||||
|
if status and status != "all":
|
||||||
|
query += " AND status = ?"
|
||||||
|
params.append(status)
|
||||||
|
if device_ip:
|
||||||
|
query += " AND source_device_ip = ?"
|
||||||
|
params.append(device_ip)
|
||||||
|
query += " ORDER BY last_seen DESC"
|
||||||
|
rows = conn.execute(query, params).fetchall()
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
return [_row_to_candidate(r) for r in rows]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_candidate(conn: sqlite3.Connection, candidate_id: str) -> BlocklistCandidate:
|
||||||
|
row = conn.execute(
|
||||||
|
f"{_CANDIDATE_SELECT} WHERE id=?",
|
||||||
|
(candidate_id,),
|
||||||
|
).fetchone()
|
||||||
|
if row is None:
|
||||||
|
raise KeyError(f"Candidate {candidate_id!r} not found")
|
||||||
|
return _row_to_candidate(row)
|
||||||
|
|
||||||
|
|
||||||
|
def update_candidate_status(db_path: Path, candidate_id: str, new_status: str) -> BlocklistCandidate:
|
||||||
|
if new_status not in _VALID_STATUSES:
|
||||||
|
raise ValueError(f"Invalid status {new_status!r}. Must be one of {_VALID_STATUSES}")
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
try:
|
||||||
|
conn.execute("UPDATE blocklist_candidates SET status=? WHERE id=?", (new_status, candidate_id))
|
||||||
|
conn.commit()
|
||||||
|
return _get_candidate(conn, candidate_id)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def mark_pushed(db_path: Path, candidate_id: str) -> BlocklistCandidate:
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
try:
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE blocklist_candidates SET status='pushed', pushed_at=? WHERE id=?",
|
||||||
|
(_now_iso(), candidate_id),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
return _get_candidate(conn, candidate_id)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def mark_unblocked(db_path: Path, candidate_id: str) -> BlocklistCandidate:
|
||||||
|
conn = sqlite3.connect(str(db_path))
|
||||||
|
try:
|
||||||
|
conn.execute("UPDATE blocklist_candidates SET status='unblocked' WHERE id=?", (candidate_id,))
|
||||||
|
conn.commit()
|
||||||
|
return _get_candidate(conn, candidate_id)
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
|
||||||
|
|
@ -84,3 +84,171 @@ class TestTelemetry:
|
||||||
result = matches_telemetry("api.xbcs.net", rules)
|
result = matches_telemetry("api.xbcs.net", rules)
|
||||||
assert result is not None
|
assert result is not None
|
||||||
assert result.category == "belkin"
|
assert result.category == "belkin"
|
||||||
|
|
||||||
|
|
||||||
|
class TestExtraction:
|
||||||
|
@pytest.fixture
|
||||||
|
def db(self, tmp_path):
|
||||||
|
from app.ingest.pipeline import ensure_schema
|
||||||
|
p = tmp_path / "test.db"
|
||||||
|
ensure_schema(p)
|
||||||
|
return p
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def rules(self):
|
||||||
|
from app.services.blocklist import load_telemetry_rules
|
||||||
|
return load_telemetry_rules(
|
||||||
|
Path(__file__).parent.parent / "patterns" / "telemetry.yaml"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_dnsmasq_entry_extracted(self, db, rules):
|
||||||
|
import sqlite3
|
||||||
|
from app.services.blocklist import run_scan
|
||||||
|
conn = sqlite3.connect(str(db))
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT INTO log_entries (id, source_id, sequence, ingest_time, text)
|
||||||
|
VALUES ('e1', 'router:syslog', 1, '2026-05-14T00:00:00+00:00',
|
||||||
|
'dnsmasq[123]: query[A] samsungads.com from 192.168.1.45')"""
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
count = run_scan(
|
||||||
|
db,
|
||||||
|
router_source_ids=["router:syslog"],
|
||||||
|
device_map={"192.168.1.45": "Samsung Projector"},
|
||||||
|
telemetry_rules=rules,
|
||||||
|
)
|
||||||
|
assert count >= 1
|
||||||
|
conn = sqlite3.connect(str(db))
|
||||||
|
row = conn.execute(
|
||||||
|
"SELECT domain_or_ip, source_device_name, matched_rule, status FROM blocklist_candidates"
|
||||||
|
).fetchone()
|
||||||
|
conn.close()
|
||||||
|
assert row[0] == "samsungads.com"
|
||||||
|
assert row[1] == "Samsung Projector"
|
||||||
|
assert row[2] == "samsung_ads"
|
||||||
|
assert row[3] == "pending"
|
||||||
|
|
||||||
|
def test_iptables_entry_extracted(self, db, rules):
|
||||||
|
import sqlite3
|
||||||
|
from app.services.blocklist import run_scan
|
||||||
|
conn = sqlite3.connect(str(db))
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT INTO log_entries (id, source_id, sequence, ingest_time, text)
|
||||||
|
VALUES ('e2', 'router:fw', 1, '2026-05-14T00:00:00+00:00',
|
||||||
|
'kernel: FORWARD SRC=192.168.1.67 DST=52.11.243.144 PROTO=TCP DPT=443')"""
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
count = run_scan(
|
||||||
|
db,
|
||||||
|
router_source_ids=["router:fw"],
|
||||||
|
device_map={"192.168.1.67": "Belkin Switch 1"},
|
||||||
|
telemetry_rules=rules,
|
||||||
|
)
|
||||||
|
assert count >= 1
|
||||||
|
conn = sqlite3.connect(str(db))
|
||||||
|
row = conn.execute("SELECT domain_or_ip, source_device_name FROM blocklist_candidates").fetchone()
|
||||||
|
conn.close()
|
||||||
|
assert row[0] == "52.11.243.144"
|
||||||
|
assert row[1] == "Belkin Switch 1"
|
||||||
|
|
||||||
|
def test_unknown_device_skipped(self, db, rules):
|
||||||
|
import sqlite3
|
||||||
|
from app.services.blocklist import run_scan
|
||||||
|
conn = sqlite3.connect(str(db))
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT INTO log_entries (id, source_id, sequence, ingest_time, text)
|
||||||
|
VALUES ('e3', 'router:syslog', 1, '2026-05-14T00:00:00+00:00',
|
||||||
|
'dnsmasq[123]: query[A] samsungads.com from 10.0.0.99')"""
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
count = run_scan(
|
||||||
|
db,
|
||||||
|
router_source_ids=["router:syslog"],
|
||||||
|
device_map={"192.168.1.45": "Samsung Projector"},
|
||||||
|
telemetry_rules=rules,
|
||||||
|
)
|
||||||
|
assert count == 0
|
||||||
|
|
||||||
|
def test_dedup_upsert_increments_hit_count(self, db, rules):
|
||||||
|
import sqlite3
|
||||||
|
from app.services.blocklist import run_scan
|
||||||
|
conn = sqlite3.connect(str(db))
|
||||||
|
for i in range(3):
|
||||||
|
conn.execute(
|
||||||
|
f"""INSERT INTO log_entries (id, source_id, sequence, ingest_time, text)
|
||||||
|
VALUES ('e{i}', 'router:syslog', {i}, '2026-05-14T00:00:00+00:00',
|
||||||
|
'dnsmasq[123]: query[A] samsungads.com from 192.168.1.45')"""
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
run_scan(db, ["router:syslog"], {"192.168.1.45": "Projector"}, rules)
|
||||||
|
conn = sqlite3.connect(str(db))
|
||||||
|
rows = conn.execute("SELECT hit_count FROM blocklist_candidates").fetchall()
|
||||||
|
conn.close()
|
||||||
|
assert len(rows) == 1 # one row, not three
|
||||||
|
assert rows[0][0] == 3
|
||||||
|
|
||||||
|
|
||||||
|
class TestCandidateManagement:
|
||||||
|
@pytest.fixture
|
||||||
|
def db_with_candidate(self, tmp_path):
|
||||||
|
from app.ingest.pipeline import ensure_schema
|
||||||
|
import sqlite3, uuid
|
||||||
|
db = tmp_path / "test.db"
|
||||||
|
ensure_schema(db)
|
||||||
|
conn = sqlite3.connect(str(db))
|
||||||
|
cid = str(uuid.uuid4())
|
||||||
|
conn.execute(
|
||||||
|
"""INSERT INTO blocklist_candidates
|
||||||
|
(id, domain_or_ip, first_seen, last_seen)
|
||||||
|
VALUES (?, 'samsungads.com', '2026-05-14T00:00:00+00:00', '2026-05-14T00:00:00+00:00')""",
|
||||||
|
(cid,),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
return db, cid
|
||||||
|
|
||||||
|
def test_list_candidates_returns_all(self, db_with_candidate):
|
||||||
|
from app.services.blocklist import list_candidates
|
||||||
|
db, _ = db_with_candidate
|
||||||
|
results = list_candidates(db)
|
||||||
|
assert len(results) == 1
|
||||||
|
assert results[0].domain_or_ip == "samsungads.com"
|
||||||
|
|
||||||
|
def test_list_candidates_filter_by_status(self, db_with_candidate):
|
||||||
|
from app.services.blocklist import list_candidates
|
||||||
|
db, _ = db_with_candidate
|
||||||
|
assert len(list_candidates(db, status="pending")) == 1
|
||||||
|
assert len(list_candidates(db, status="pushed")) == 0
|
||||||
|
|
||||||
|
def test_update_status_to_approved(self, db_with_candidate):
|
||||||
|
from app.services.blocklist import update_candidate_status, list_candidates
|
||||||
|
db, cid = db_with_candidate
|
||||||
|
candidate = update_candidate_status(db, cid, "approved")
|
||||||
|
assert candidate.status == "approved"
|
||||||
|
assert list_candidates(db, status="approved")[0].status == "approved"
|
||||||
|
|
||||||
|
def test_update_status_invalid_raises(self, db_with_candidate):
|
||||||
|
from app.services.blocklist import update_candidate_status
|
||||||
|
db, cid = db_with_candidate
|
||||||
|
with pytest.raises(ValueError, match="Invalid status"):
|
||||||
|
update_candidate_status(db, cid, "hacked")
|
||||||
|
|
||||||
|
def test_mark_pushed_sets_status_and_timestamp(self, db_with_candidate):
|
||||||
|
from app.services.blocklist import update_candidate_status, mark_pushed
|
||||||
|
db, cid = db_with_candidate
|
||||||
|
update_candidate_status(db, cid, "approved")
|
||||||
|
candidate = mark_pushed(db, cid)
|
||||||
|
assert candidate.status == "pushed"
|
||||||
|
assert candidate.pushed_at is not None
|
||||||
|
|
||||||
|
def test_mark_unblocked(self, db_with_candidate):
|
||||||
|
from app.services.blocklist import update_candidate_status, mark_pushed, mark_unblocked
|
||||||
|
db, cid = db_with_candidate
|
||||||
|
update_candidate_status(db, cid, "approved")
|
||||||
|
mark_pushed(db, cid)
|
||||||
|
candidate = mark_unblocked(db, cid)
|
||||||
|
assert candidate.status == "unblocked"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue