Implements the Orchard branch grafting system for harvest.circuitforge.tech:
- POST /api/orchard/graft: provisions data dir, starts a new
turnstone-submissions-<slug> Docker container on the next free port
(ORCHARD_PORT_BASE=8538+), injects a handle_path block into the
Caddyfile dynamic-branches marker section, restarts caddy-proxy,
returns {submit_endpoint, api_key}
- GET /api/orchard/branches: list active/inactive branches (admin-only)
- DELETE /api/orchard/branches/<slug>: deactivate branch + stop container
- POST /api/orchard/branches/<slug>/anonymize: HMAC-based IP/username
pseudonymization worker over a branch DB
- POST /api/glean/batch: optional TURNSTONE_BRANCH_KEY auth guard
- anonymized column added to log_entries schema (migration-safe)
- Updated Caddyfile with /huginn/* route (port 8536), /node2/* (8537),
and dynamic-branch marker section
- All endpoints admin-gated via TURNSTONE_ORCHARD_ADMIN_KEY
Closes: #27
327 lines
12 KiB
Python
327 lines
12 KiB
Python
"""The Orchard — auto-enrollment of new Turnstone branch nodes.
|
|
|
|
A "branch" is an external Turnstone instance that submits pattern-matched log
|
|
entries to a central harvest receiver (harvest.circuitforge.tech). Grafting
|
|
provisions the receiving infrastructure for a new branch:
|
|
|
|
1. Creates a data dir at ORCHARD_DATA_ROOT/<slug>/
|
|
2. Starts a new turnstone-submissions-<slug> Docker container
|
|
3. Injects a handle_path block into the Caddyfile marker section
|
|
4. Restarts caddy-proxy to activate the route
|
|
5. Persists the branch registry to orchard-branches.yaml
|
|
|
|
Admin auth: the graft/deactivate endpoints require
|
|
Authorization: Bearer <TURNSTONE_ORCHARD_ADMIN_KEY>
|
|
|
|
Set TURNSTONE_ORCHARD_ADMIN_KEY in the environment on the harvest instance.
|
|
If unset, the endpoints return 501 Not Implemented (feature is off).
|
|
|
|
Anonymization: a separate pass (run_anonymization) replaces IPv4 addresses and
usernames in the log text of a branch DB with stable pseudonyms before Avocet
reads them. Hostnames are not rewritten by this pass.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import hmac
|
|
import ipaddress
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import secrets
|
|
import sqlite3
|
|
import subprocess
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Config (read from env on the harvest instance)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Directory that holds one <slug>/ data dir per branch plus the registry file.
ORCHARD_DATA_ROOT = Path(os.environ.get("TURNSTONE_ORCHARD_DATA_ROOT", "/devl/docker/turnstone-submissions"))
# Caddyfile whose ORCHARD BRANCHES marker section is rewritten on graft/deactivate.
ORCHARD_CADDYFILE = Path(os.environ.get("TURNSTONE_ORCHARD_CADDYFILE", "/devl/caddy-proxy/Caddyfile"))
# Name of the Docker container restarted after the Caddyfile changes.
ORCHARD_CADDY_CONTAINER = os.environ.get("TURNSTONE_ORCHARD_CADDY_CONTAINER", "caddy-proxy")
# Base URL used to build the submit_endpoint returned by graft().
ORCHARD_HARVEST_HOST = os.environ.get("TURNSTONE_ORCHARD_HARVEST_HOST", "https://harvest.circuitforge.tech")
# Image used for every turnstone-submissions-<slug> container.
ORCHARD_IMAGE = os.environ.get("TURNSTONE_ORCHARD_IMAGE", "localhost/turnstone:latest")

# Ports for submission containers start here and scan upward.
ORCHARD_PORT_BASE = int(os.environ.get("TURNSTONE_ORCHARD_PORT_BASE", "8538"))

# YAML file holding {"branches": [...]}; read and written by the registry helpers.
_REGISTRY_FILE = ORCHARD_DATA_ROOT / "orchard-branches.yaml"

# Literal marker lines delimiting the auto-managed region of the Caddyfile.
_CADDY_BRANCH_START = "# --- ORCHARD BRANCHES: auto-managed by POST /api/orchard/graft, do not edit manually ---"
_CADDY_BRANCH_END = "# --- END ORCHARD BRANCHES ---"

# Slug shape: one alphanumeric, 1-30 alphanumerics/hyphens, one alphanumeric
# (i.e. 3-32 characters, never starting or ending with a hyphen).
_SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9-]{1,30}[a-z0-9]$")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Branch registry
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _load_registry() -> list[dict[str, Any]]:
    """Return the branch records stored in the registry file.

    Reading is best-effort: a missing, unreadable, or malformed registry
    yields an empty list. Unlike a silent fallback, every failure other than
    "file does not exist" is logged, because callers that subsequently save
    the registry would otherwise overwrite the damaged file without a trace.

    Returns:
        The list stored under the ``branches`` key, or ``[]`` when the file is
        absent, cannot be parsed, or does not contain a list under that key.
    """
    if not _REGISTRY_FILE.exists():
        return []

    import yaml as _yaml

    try:
        raw = _REGISTRY_FILE.read_text(encoding="utf-8")
        data = _yaml.safe_load(raw)
    except (OSError, UnicodeDecodeError, _yaml.YAMLError):
        logger.exception("Could not read branch registry %s; treating it as empty", _REGISTRY_FILE)
        return []

    # An empty file parses to None; that is a valid "no branches yet" state.
    if data is None:
        return []
    if not isinstance(data, dict):
        logger.warning("Branch registry %s is not a mapping; treating it as empty", _REGISTRY_FILE)
        return []

    branches = data.get("branches")
    if branches is None:
        return []
    if not isinstance(branches, list):
        logger.warning("Branch registry %s has a non-list 'branches' entry; treating it as empty", _REGISTRY_FILE)
        return []
    return branches
|
|
|
|
|
|
def _save_registry(branches: list[dict[str, Any]]) -> None:
    """Persist *branches* to the registry file as ``{"branches": [...]}``.

    The YAML is written to a sibling temporary file and then renamed over the
    registry, so an interrupted write cannot leave a truncated registry
    behind (a truncated file would later be read back as "no branches").

    Args:
        branches: The full list of branch records to store.

    Raises:
        OSError: if the directory or file cannot be written.
    """
    import yaml as _yaml

    _REGISTRY_FILE.parent.mkdir(parents=True, exist_ok=True)
    payload = _yaml.dump({"branches": branches}, default_flow_style=False)

    tmp_path = _REGISTRY_FILE.with_name(_REGISTRY_FILE.name + ".tmp")
    try:
        tmp_path.write_text(payload)
        # Same-directory rename: replaces the registry in a single step.
        tmp_path.replace(_REGISTRY_FILE)
    except OSError:
        # Do not leave a stray temp file next to the registry.
        try:
            tmp_path.unlink()
        except FileNotFoundError:
            pass
        raise
|
|
|
|
|
|
def list_branches() -> list[dict[str, Any]]:
    """Return every branch record currently in the registry (active or not)."""
    registry = _load_registry()
    return registry
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Port allocation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _next_free_port() -> int:
    """Return the lowest port >= ORCHARD_PORT_BASE not recorded in the registry.

    Only ports stored on registry entries are considered taken; entries
    without a ``port`` key are ignored.
    """
    taken = set()
    for entry in _load_registry():
        if "port" in entry:
            taken.add(entry["port"])

    candidate = ORCHARD_PORT_BASE
    while True:
        if candidate not in taken:
            return candidate
        candidate += 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Caddy route injection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _build_branch_block(slug: str, port: int) -> str:
|
|
return (
|
|
f" handle_path /{slug}/* {{\n"
|
|
f" reverse_proxy http://host.docker.internal:{port} {{\n"
|
|
f" header_up X-Real-IP {{remote_host}}\n"
|
|
f" header_up X-Forwarded-Proto {{scheme}}\n"
|
|
f" flush_interval -1\n"
|
|
f" transport http {{\n"
|
|
f" response_header_timeout 0\n"
|
|
f" read_timeout 0\n"
|
|
f" }}\n"
|
|
f" }}\n"
|
|
f" }}"
|
|
)
|
|
|
|
|
|
def _rewrite_caddy_branches(branches: list[dict[str, Any]]) -> None:
    """Replace the auto-managed section in the Caddyfile with current branches.

    Everything from the start marker through the end marker is regenerated
    from the branches whose ``active`` flag is true (or absent).

    Args:
        branches: All branch records; inactive ones are skipped.

    Raises:
        RuntimeError: if the Caddyfile does not exist, or if it lacks a start
            marker followed by an end marker.
    """
    if not ORCHARD_CADDYFILE.exists():
        raise RuntimeError(f"Caddyfile not found at {ORCHARD_CADDYFILE}")

    text = ORCHARD_CADDYFILE.read_text()
    start_idx = text.find(_CADDY_BRANCH_START)
    # Only accept an end marker that comes after the start marker; splicing
    # with an earlier end marker would duplicate or drop parts of the file.
    if start_idx == -1:
        end_idx = -1
    else:
        end_idx = text.find(_CADDY_BRANCH_END, start_idx + len(_CADDY_BRANCH_START))
    if start_idx == -1 or end_idx == -1:
        raise RuntimeError("Caddyfile is missing the ORCHARD BRANCHES marker section")

    active = [b for b in branches if b.get("active", True)]
    blocks = "\n".join(_build_branch_block(b["slug"], b["port"]) for b in active)
    replacement = f"{_CADDY_BRANCH_START}\n{blocks}\n {_CADDY_BRANCH_END}"

    new_text = text[:start_idx] + replacement + text[end_idx + len(_CADDY_BRANCH_END):]
    ORCHARD_CADDYFILE.write_text(new_text)
    logger.info("Caddyfile updated with %d active branch routes", len(active))
|
|
|
|
|
|
def _reload_caddy() -> None:
    """Restart the Caddy container so it picks up the rewritten Caddyfile.

    Raises:
        RuntimeError: if ``docker restart`` exits with a non-zero status.
    """
    restart_cmd = ["docker", "restart", ORCHARD_CADDY_CONTAINER]
    proc = subprocess.run(restart_cmd, capture_output=True, text=True, timeout=30)
    if proc.returncode == 0:
        logger.info("Restarted %s", ORCHARD_CADDY_CONTAINER)
        return
    raise RuntimeError(f"docker restart {ORCHARD_CADDY_CONTAINER} failed: {proc.stderr}")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Container provisioning
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _start_branch_container(slug: str, port: int, data_dir: Path) -> None:
    """Prepare a branch's data directory and start its submission container.

    Creates ``data_dir`` and ``data_dir/patterns``, copies the repository's
    default pattern files into the patterns dir when they are not already
    there, force-removes any container with the same name, and runs a fresh
    ``turnstone-submissions-<slug>`` container.

    Args:
        slug: Branch slug; used in the container name and TURNSTONE_SOURCE_HOST.
        port: Host port mapped to container port 8534.
        data_dir: Host directory mounted at ``/data`` in the container.

    Raises:
        RuntimeError: if ``docker run`` exits with a non-zero status.
        subprocess.TimeoutExpired: if a docker command exceeds its timeout.
    """
    patterns_dir = data_dir / "patterns"
    # parents=True creates data_dir as well.
    patterns_dir.mkdir(parents=True, exist_ok=True)

    # Seed default patterns if not already present
    repo_patterns = Path(__file__).parent.parent.parent / "patterns"
    for yaml_file in ("default.yaml", "sources-example.yaml"):
        src = repo_patterns / yaml_file
        dst = patterns_dir / yaml_file
        if src.exists() and not dst.exists():
            dst.write_text(src.read_text())

    container_name = f"turnstone-submissions-{slug}"
    # NOTE(review): "-p <port>:8534" publishes on every host interface, so the
    # container may be reachable without going through Caddy — confirm that a
    # firewall or bind address restricts this as intended.
    cmd = [
        "docker", "run", "-d",
        "--name", container_name,
        "--restart", "unless-stopped",
        "-p", f"{port}:8534",
        "-v", f"{data_dir}:/data",
        "-v", f"{patterns_dir}:/patterns",
        "-e", "TURNSTONE_DB=/data/turnstone.db",
        "-e", f"TURNSTONE_SOURCE_HOST={slug}",
        "-e", "PYTHONUNBUFFERED=1",
        "-e", "TZ=America/Los_Angeles",
        ORCHARD_IMAGE,
    ]
    # Remove any stale container with the same name first. The exit status is
    # deliberately ignored (there may be nothing to remove), but the call is
    # bounded so a wedged Docker daemon cannot hang the request forever.
    subprocess.run(["docker", "rm", "-f", container_name], capture_output=True, timeout=30)
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if result.returncode != 0:
        raise RuntimeError(f"docker run for {container_name} failed: {result.stderr}")
    logger.info("Started container %s on port %d", container_name, port)
|
|
|
|
|
|
def _stop_branch_container(slug: str) -> None:
    """Force-remove the ``turnstone-submissions-<slug>`` container.

    The exit status of ``docker rm -f`` is not inspected.
    """
    container_name = f"turnstone-submissions-{slug}"
    remove_cmd = ["docker", "rm", "-f", container_name]
    subprocess.run(remove_cmd, capture_output=True, timeout=30)
    logger.info("Removed container %s", container_name)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Public API
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def graft(slug: str, contact_email: str, agreed_to_terms: bool) -> dict[str, Any]:
    """Provision a new Orchard branch and return connection details.

    Steps, in order: validate input, allocate a port, start the branch
    container, append the branch to the registry, rewrite the Caddyfile
    branch section, and restart Caddy.

    Args:
        slug: Branch identifier; must match ``_SLUG_RE`` in full.
        contact_email: Contact address stored on the registry entry.
        agreed_to_terms: Must be true for the graft to proceed.

    Returns:
        A dict with ``slug``, ``submit_endpoint``, ``api_key`` (returned in
        clear text here; only its SHA-256 hash is stored) and ``port``.

    Raises:
        ValueError: if terms were not agreed to, the slug is invalid, or a
            branch with this slug already exists.
        RuntimeError: propagated from container start or Caddy handling.
    """
    if not agreed_to_terms:
        raise ValueError("agreed_to_terms must be true")
    # fullmatch, not match: with match(), the pattern's "$" also accepts a
    # trailing newline, which would then reach the container name and the
    # Caddyfile.
    if not _SLUG_RE.fullmatch(slug):
        raise ValueError(
            f"Invalid slug {slug!r}: must be 3-32 lowercase alphanumeric/hyphen, "
            "cannot start or end with a hyphen"
        )

    branches = _load_registry()
    if any(b["slug"] == slug for b in branches):
        raise ValueError(f"Branch {slug!r} already exists")

    port = _next_free_port()
    data_dir = ORCHARD_DATA_ROOT / slug
    api_key = secrets.token_urlsafe(32)

    branch: dict[str, Any] = {
        "slug": slug,
        "port": port,
        "contact_email": contact_email,
        "api_key_hash": hashlib.sha256(api_key.encode()).hexdigest(),
        "grafted_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "active": True,
    }

    _start_branch_container(slug, port, data_dir)
    branches.append(branch)
    _save_registry(branches)

    # NOTE: the registry is saved before the Caddy steps; if either of them
    # raises, the branch stays registered and its container keeps running.
    _rewrite_caddy_branches(branches)
    _reload_caddy()

    submit_endpoint = f"{ORCHARD_HARVEST_HOST}/{slug}"
    logger.info("Grafted branch %r at %s", slug, submit_endpoint)
    return {
        "slug": slug,
        "submit_endpoint": submit_endpoint,
        "api_key": api_key,
        "port": port,
    }
|
|
|
|
|
|
def deactivate(slug: str) -> dict[str, Any]:
    """Deactivate a branch: remove its container and drop its Caddy route.

    The registry entry is kept and flagged ``active: False``.

    Raises:
        KeyError: if no branch with this slug is registered.
    """
    registry = _load_registry()

    target = None
    for entry in registry:
        if entry["slug"] == slug:
            target = entry
            break
    if target is None:
        raise KeyError(f"Branch {slug!r} not found")

    _stop_branch_container(slug)
    target["active"] = False
    _save_registry(registry)
    _rewrite_caddy_branches(registry)
    _reload_caddy()
    return {"slug": slug, "deactivated": True}
|
|
|
|
|
|
def verify_api_key(slug: str, key: str) -> bool:
    """Check whether *key* is valid for the given branch slug.

    Only the first registry entry that has this slug and a truthy ``active``
    flag is considered; the SHA-256 of *key* is compared against its stored
    hash with a constant-time comparison.
    """
    for candidate in _load_registry():
        if candidate["slug"] == slug and candidate.get("active"):
            stored_hash = candidate.get("api_key_hash", "")
            supplied_hash = hashlib.sha256(key.encode()).hexdigest()
            return hmac.compare_digest(stored_hash, supplied_hash)
    return False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Anonymization worker
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_IP_RE = re.compile(
|
|
r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
|
|
)
|
|
_USERNAME_RE = re.compile(r"\bfor\s+(\w+)\b|\buser\s+(\w+)\b|\bsession\s+opened\s+for\s+(\w+)\b", re.IGNORECASE)
|
|
|
|
|
|
def _pseudonym(value: str, salt: bytes, prefix: str) -> str:
|
|
digest = hmac.new(salt, value.encode(), "sha256").hexdigest()[:10]
|
|
return f"{prefix}-{digest}"
|
|
|
|
|
|
def _anonymize_text(text: str, salt: bytes) -> str:
    """Replace IPv4 addresses and usernames in *text* with pseudonyms.

    IPv4 addresses matched by ``_IP_RE`` are replaced first, then the name
    captured by ``_USERNAME_RE``. Only the captured name is substituted; the
    surrounding keyword text of the match is left untouched.

    Args:
        text: Log text to scrub.
        salt: HMAC key passed to ``_pseudonym``.

    Returns:
        The scrubbed text.
    """

    def replace_ip(m: re.Match) -> str:
        return _pseudonym(m.group(), salt, "ip")

    def replace_user(m: re.Match) -> str:
        # Find which alternative's capture group took part in the match.
        group_idx = next(
            (i for i, g in enumerate(m.groups(), start=1) if g),
            None,
        )
        if group_idx is None:
            return m.group()
        # Splice by position rather than str.replace(): replace() would also
        # rewrite the name wherever it occurs inside the keyword itself
        # (e.g. user "s" inside "user s", or user "user" inside "for user").
        whole = m.group()
        offset = m.start()
        name_start, name_end = m.span(group_idx)
        alias = _pseudonym(m.group(group_idx), salt, "user")
        return whole[: name_start - offset] + alias + whole[name_end - offset:]

    text = _IP_RE.sub(replace_ip, text)
    text = _USERNAME_RE.sub(replace_user, text)
    return text
|
|
|
|
|
|
def run_anonymization(slug: str) -> dict[str, Any]:
    """Anonymize IPs and usernames in a branch DB in-place.

    Uses a stable per-branch salt so pseudonyms are consistent across runs
    but not reversible without the salt. Every row of ``log_entries`` whose
    ``anonymized`` column is NULL or 0 is processed and flagged
    ``anonymized = 1``; its ``text`` is rewritten only when scrubbing changed
    it. All updates are committed together; on error nothing is committed and
    the connection is still closed.

    Args:
        slug: Slug of a registered branch.

    Returns:
        ``{"slug", "anonymized", "total_processed"}`` where ``anonymized`` is
        the number of rows whose text changed. Both counts are 0 when the
        branch has no database file yet.

    Raises:
        KeyError: if no branch with this slug is registered.
    """
    branch = next((b for b in _load_registry() if b["slug"] == slug), None)
    if branch is None:
        raise KeyError(f"Branch {slug!r} not found")

    db_path = ORCHARD_DATA_ROOT / slug / "turnstone.db"
    if not db_path.exists():
        return {"slug": slug, "anonymized": 0, "total_processed": 0}

    # Per-branch salt derived from api_key_hash for stability
    salt = branch["api_key_hash"].encode()[:32].ljust(32, b"0")

    conn = sqlite3.connect(str(db_path), timeout=30)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        rows = conn.execute(
            "SELECT id, text FROM log_entries WHERE anonymized IS NULL OR anonymized = 0"
        ).fetchall()

        updated = 0
        for row_id, text in rows:
            # Leave NULL/empty text as it is: there is nothing to scrub, and
            # a NULL must not be turned into "" or counted as a change.
            clean = _anonymize_text(text, salt) if text else text
            if clean != text:
                conn.execute(
                    "UPDATE log_entries SET text = ?, anonymized = 1 WHERE id = ?",
                    (clean, row_id),
                )
                updated += 1
            else:
                conn.execute("UPDATE log_entries SET anonymized = 1 WHERE id = ?", (row_id,))

        conn.commit()
    finally:
        # Always release the database handle, including on failure.
        conn.close()

    logger.info("Anonymized %d/%d entries in branch %r", updated, len(rows), slug)
    return {"slug": slug, "anonymized": updated, "total_processed": len(rows)}
|