turnstone/scripts/harvest_docs.py
pyr0ball 3fd9b6d5a2 feat(diagnose): tech-level post-processor, offline mode, API auth, context harvest
- synthesizer: 3 system prompts (sysadmin/homelab/executive) selected by tech_level pref
- settings: tech_level selector (UI + backend) persisted in preferences.json
- QuickCapture: shows active level label in diagnosis card header
- TURNSTONE_OFFLINE_MODE=1: sets HF_HUB_OFFLINE + TRANSFORMERS_OFFLINE before lib load
- TURNSTONE_API_KEY: bearer token auth on all /api/ routes (hmac.compare_digest)
- /health always open; unset key = no auth (backward compatible)
- docs/air-gapped-deployment.md: full offline deployment guide
- scripts/harvest_docs.py: generalized context doc bulk-uploader with manifest support
- scripts/manifests/: heimdall-devops.yaml (10 docs ingested) + example.yaml template
- fix: _ingest_upload -> _glean_upload in context doc upload endpoint (was 500)

Closes: #56
Closes: #45
Closes: #47
Closes: #49
Closes: #21
2026-05-28 08:51:05 -07:00

266 lines
8.5 KiB
Python

#!/usr/bin/env python3
"""harvest_docs.py — Bulk-upload documentation into Turnstone's context RAG.
Reads a YAML manifest that describes which files or directories to upload,
then POSTs each file to the Turnstone /api/context/docs endpoint.
Usage:
# From a manifest file
python harvest_docs.py --manifest manifests/my-cluster.yaml
# Explicit files (no manifest needed)
python harvest_docs.py --base-url http://localhost:8534 file1.md dir/file2.yaml
# Dry run — show what would be uploaded without sending
python harvest_docs.py --manifest manifests/my-cluster.yaml --dry-run
Manifest format (YAML):
base_url: http://localhost:8534 # optional; overridden by --base-url
sources:
- path: /absolute/path/to/file.md
label: friendly-name # optional; overrides filename in DB
- path: /absolute/path/to/dir/
include: ["*.md", "*.yaml"] # glob patterns; default: see INCLUDE_EXTS
exclude: ["CLAUDE*", "SESSION_*", "*_keys*"]
recursive: false # default false
"""
from __future__ import annotations
import argparse
import fnmatch
import sys
import urllib.request
import urllib.error
from pathlib import Path
try:
import yaml
_HAS_YAML = True
except ImportError:
_HAS_YAML = False
# File extensions included when walking a directory with no explicit `include`.
# Compared against the lower-cased suffix of each candidate file.
INCLUDE_EXTS = {".md", ".yaml", ".yml", ".txt", ".conf", ".rst"}
# Default exclude patterns applied to every directory source (unless overridden).
# These are fnmatch-style globs; a manifest entry's own `exclude` list replaces
# this list rather than extending it.
DEFAULT_EXCLUDES = [
    "CLAUDE*",
    "SESSION_*",
    "HANDOFF_*",
    "*.key",
    "*.pem",
    "*.crt",
    "node_modules",
    ".git",
    "__pycache__",
]
# Path appended to the base URL when uploading a document.
# NOTE(review): the module docstring names the endpoint as /api/context/docs
# (no /turnstone prefix) — confirm which of the two is current.
UPLOAD_PATH = "/turnstone/api/context/docs"
# ---------------------------------------------------------------------------
# File collection
# ---------------------------------------------------------------------------
def _matches_any(name: str, patterns: list[str]) -> bool:
return any(fnmatch.fnmatch(name, p) for p in patterns)
def _collect_from_dir(
    root: Path,
    include: list[str],
    exclude: list[str],
    recursive: bool,
) -> list[Path]:
    """Return the sorted files under *root* that pass the include/exclude filters.

    Args:
        root: Directory to scan.
        include: Glob patterns matched against each file name. When empty,
            a file is kept if its lower-cased suffix is in ``INCLUDE_EXTS``.
        exclude: Glob patterns matched against every path component below
            ``root`` (sub-directory names and the file name itself).
        recursive: Walk sub-directories when true; otherwise only the direct
            children of ``root`` are considered.

    Returns:
        The matching file paths, sorted.
    """
    pattern = "**/*" if recursive else "*"
    candidates: list[Path] = []
    for p in root.glob(pattern):
        if not p.is_file():
            continue
        # Only components *below* root are tested against the exclude globs.
        # Testing the full path would let an ancestor directory of root that
        # happens to match a pattern (e.g. ~/SESSION_notes/docs) silently
        # exclude every file of a source the user listed explicitly.
        rel_parts = p.relative_to(root).parts
        if any(_matches_any(part, exclude) for part in rel_parts):
            continue
        if include:
            if not _matches_any(p.name, include):
                continue
        elif p.suffix.lower() not in INCLUDE_EXTS:
            continue
        candidates.append(p)
    return sorted(candidates)
def resolve_sources(sources: list[dict]) -> list[tuple[Path, str]]:
    """Return list of (path, label) pairs from a manifest sources list.

    Each entry is a mapping with a required ``path`` and optional ``label``,
    ``include``, ``exclude`` and ``recursive`` keys. A file entry yields one
    pair labelled with ``label`` (or the file name). A directory entry yields
    one pair per collected file, each labelled with its file name.

    Entries that are malformed, have no ``path``, or point at something that
    does not exist are reported on stderr and skipped.

    Args:
        sources: The manifest's ``sources`` list; ``None`` is treated as empty.

    Returns:
        Resolved ``(absolute_path, label)`` pairs in manifest order.
    """
    results: list[tuple[Path, str]] = []
    # An empty `sources:` key in YAML parses to None rather than [].
    for entry in sources or []:
        if not isinstance(entry, dict):
            print(f" [WARN] source entry is not a mapping, skipping: {entry!r}", file=sys.stderr)
            continue
        raw_path = entry.get("path")
        if not raw_path:
            # Path("") resolves to the current working directory, so an entry
            # without a path would otherwise harvest whatever is in cwd.
            print(f" [WARN] source entry has no path, skipping: {entry!r}", file=sys.stderr)
            continue
        p = Path(str(raw_path)).expanduser().resolve()
        if not p.exists():
            print(f" [WARN] path not found, skipping: {p}", file=sys.stderr)
            continue
        if p.is_file():
            results.append((p, entry.get("label") or p.name))
        elif p.is_dir():
            include: list[str] = entry.get("include") or []
            # Only a missing/null `exclude` falls back to the defaults; an
            # explicit empty list means "exclude nothing".
            exclude = entry.get("exclude")
            if exclude is None:
                exclude = DEFAULT_EXCLUDES
            recursive = bool(entry.get("recursive", False))
            found = _collect_from_dir(p, include, exclude, recursive)
            results.extend((f, f.name) for f in found)
        else:
            print(f" [WARN] not a file or directory, skipping: {p}", file=sys.stderr)
    return results
# ---------------------------------------------------------------------------
# Upload
# ---------------------------------------------------------------------------
def _build_multipart(boundary: bytes, filename: str, content: bytes) -> bytes:
"""Build a minimal multipart/form-data body for a single file field."""
lines: list[bytes] = [
b"--" + boundary,
f'Content-Disposition: form-data; name="file"; filename="{filename}"'.encode(),
b"Content-Type: application/octet-stream",
b"",
content,
b"--" + boundary + b"--",
b"",
]
return b"\r\n".join(lines)
def upload_file(base_url: str, path: Path, label: str) -> dict:
    """POST a file to Turnstone's context doc endpoint. Returns response dict.

    Failures never raise: an unreadable file, a malformed URL, a network
    problem or an HTTP error status all come back as ``{"error": "..."}`` so
    that a batch caller can report the failure and carry on.

    Args:
        base_url: Turnstone base URL; ``UPLOAD_PATH`` is appended to it.
        path: File whose bytes are uploaded.
        label: File name to send; falls back to ``path.name`` when empty.

    Returns:
        The decoded JSON object from the server. A successful response whose
        JSON is not an object is wrapped as ``{"response": <value>}``. On
        failure, a dict with a single ``"error"`` key.
    """
    import json
    import uuid

    try:
        content = path.read_bytes()
    except OSError as exc:
        return {"error": f"cannot read {path}: {exc}"}

    url = base_url.rstrip("/") + UPLOAD_PATH
    filename = label or path.name
    # A per-request random suffix makes it practically impossible for the
    # boundary to also appear inside the uploaded file's bytes.
    boundary = b"----TurnstoneHarvest" + uuid.uuid4().hex.encode("ascii")
    body = _build_multipart(boundary, filename, content)
    content_type = f"multipart/form-data; boundary={boundary.decode('ascii')}"
    try:
        # Request() itself raises ValueError for a URL without a scheme, so it
        # belongs inside the try as well.
        req = urllib.request.Request(
            url,
            data=body,
            headers={"Content-Type": content_type},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        body_text = e.read().decode(errors="replace")
        return {"error": f"HTTP {e.code}: {body_text[:200]}"}
    except Exception as exc:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch, and every failure is surfaced to the caller via "error".
        return {"error": str(exc)}
    if not isinstance(payload, dict):
        return {"response": payload}
    return payload
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the command line, collect the files to upload, and upload them.

    Files come from the ``--manifest`` sources and/or explicit positional
    paths. The base URL is taken from ``--base-url`` when given, otherwise
    from the manifest's ``base_url``, otherwise ``http://localhost:8534``.
    With ``--dry-run`` the files are only listed.

    Exits with status 1 when the manifest cannot be used or when at least one
    upload failed, and with status 0 when there is nothing to upload.
    """
    default_base_url = "http://localhost:8534"

    parser = argparse.ArgumentParser(
        description="Bulk-upload docs into Turnstone context RAG.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--manifest", "-m",
        metavar="FILE",
        help="YAML manifest describing sources to upload",
    )
    # The default is None so "not given" is distinguishable from an explicit
    # value equal to the built-in default; an explicit --base-url must always
    # win over the manifest.
    parser.add_argument(
        "--base-url", "-u",
        default=None,
        metavar="URL",
        help="Turnstone base URL (default: http://localhost:8534)",
    )
    parser.add_argument(
        "--dry-run", "-n",
        action="store_true",
        help="Show files that would be uploaded without actually uploading",
    )
    parser.add_argument(
        "files",
        nargs="*",
        metavar="FILE",
        help="Explicit files to upload (alternative to --manifest)",
    )
    args = parser.parse_args()

    manifest_base_url = None
    sources: list[tuple[Path, str]] = []
    if args.manifest:
        if not _HAS_YAML:
            print("ERROR: PyYAML is required for --manifest. Run: pip install pyyaml", file=sys.stderr)
            sys.exit(1)
        manifest_path = Path(args.manifest).expanduser().resolve()
        if not manifest_path.is_file():
            print(f"ERROR: manifest not found: {manifest_path}", file=sys.stderr)
            sys.exit(1)
        try:
            # An empty manifest parses to None; treat it as an empty mapping.
            data = yaml.safe_load(manifest_path.read_text(encoding="utf-8")) or {}
        except yaml.YAMLError as exc:
            print(f"ERROR: cannot parse manifest {manifest_path}: {exc}", file=sys.stderr)
            sys.exit(1)
        if not isinstance(data, dict):
            print(f"ERROR: manifest must be a YAML mapping: {manifest_path}", file=sys.stderr)
            sys.exit(1)
        manifest_base_url = data.get("base_url")
        sources = resolve_sources(data.get("sources") or [])

    # Precedence: command line, then manifest, then built-in default.
    base_url = args.base_url or manifest_base_url or default_base_url

    for raw in args.files:
        p = Path(raw).expanduser().resolve()
        if not p.exists():
            print(f" [WARN] not found, skipping: {p}", file=sys.stderr)
            continue
        if p.is_file():
            sources.append((p, p.name))
        else:
            print(f" [WARN] {p} is a directory; use a manifest with recursive:true for directory sources", file=sys.stderr)

    if not sources:
        print("No files to upload. Pass --manifest or explicit file paths.")
        sys.exit(0)

    print(f"Turnstone: {base_url}")
    print(f"Files to upload: {len(sources)}")
    if args.dry_run:
        print("\n[DRY RUN] Would upload:")
    print()

    ok = 0
    failed = 0
    for path, label in sources:
        size_kb = path.stat().st_size / 1024
        if args.dry_run:
            print(f" {label} ({size_kb:.1f} KB) ← {path}")
            ok += 1
            continue
        print(f" Uploading {label} ({size_kb:.1f} KB)…", end=" ", flush=True)
        result = upload_file(base_url, path, label)
        if "error" in result:
            print(f"FAILED — {result['error']}")
            failed += 1
        else:
            # The server's chunk-count key is read under either name.
            chunks = result.get("chunks_written", result.get("chunks_created", "?"))
            facts = result.get("facts_written", 0)
            extra = f", {facts} facts" if facts else ""
            print(f"OK ({chunks} chunks{extra})")
            ok += 1

    print()
    if args.dry_run:
        print(f"Dry run complete. {ok} file(s) would be uploaded.")
    else:
        print(f"Done. {ok} uploaded, {failed} failed.")
    if failed:
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()