- synthesizer: 3 system prompts (sysadmin/homelab/executive) selected by tech_level pref - settings: tech_level selector (UI + backend) persisted in preferences.json - QuickCapture: shows active level label in diagnosis card header - TURNSTONE_OFFLINE_MODE=1: sets HF_HUB_OFFLINE + TRANSFORMERS_OFFLINE before lib load - TURNSTONE_API_KEY: bearer token auth on all /api/ routes (hmac.compare_digest) - /health always open; unset key = no auth (backward compatible) - docs/air-gapped-deployment.md: full offline deployment guide - scripts/harvest_docs.py: generalized context doc bulk-uploader with manifest support - scripts/manifests/: heimdall-devops.yaml (10 docs ingested) + example.yaml template - fix: _ingest_upload -> _glean_upload in context doc upload endpoint (was 500) Closes: #56 Closes: #45 Closes: #47 Closes: #49 Closes: #21
266 lines
8.5 KiB
Python
266 lines
8.5 KiB
Python
#!/usr/bin/env python3
|
|
"""harvest_docs.py — Bulk-upload documentation into Turnstone's context RAG.
|
|
|
|
Reads a YAML manifest that describes which files or directories to upload,
|
|
then POSTs each file to the Turnstone /api/context/docs endpoint.
|
|
|
|
Usage:
|
|
# From a manifest file
|
|
python harvest_docs.py --manifest manifests/my-cluster.yaml
|
|
|
|
# Explicit files (no manifest needed)
|
|
python harvest_docs.py --base-url http://localhost:8534 file1.md dir/file2.yaml
|
|
|
|
# Dry run — show what would be uploaded without sending
|
|
python harvest_docs.py --manifest manifests/my-cluster.yaml --dry-run
|
|
|
|
Manifest format (YAML):
|
|
base_url: http://localhost:8534 # optional; overridden by --base-url
|
|
sources:
|
|
- path: /absolute/path/to/file.md
|
|
label: friendly-name # optional; overrides filename in DB
|
|
|
|
- path: /absolute/path/to/dir/
|
|
include: ["*.md", "*.yaml"] # glob patterns; default: see INCLUDE_EXTS
|
|
exclude: ["CLAUDE*", "SESSION_*", "*_keys*"]
|
|
recursive: false # default false
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import fnmatch
|
|
import sys
|
|
import urllib.request
|
|
import urllib.error
|
|
from pathlib import Path
|
|
|
|
try:
|
|
import yaml
|
|
_HAS_YAML = True
|
|
except ImportError:
|
|
_HAS_YAML = False
|
|
|
|
# File extensions included when walking a directory with no explicit `include`.
# Compared against the lower-cased file suffix, so entries here must be
# lower-case and include the leading dot.
INCLUDE_EXTS = {".md", ".yaml", ".yml", ".txt", ".conf", ".rst"}

# Default exclude patterns applied to every directory source (unless overridden).
# These are fnmatch-style globs tested against individual path components of
# each candidate file, so a bare directory name such as ".git" excludes
# everything beneath that directory.
DEFAULT_EXCLUDES = [
    "CLAUDE*",
    "SESSION_*",
    "HANDOFF_*",
    "*.key",
    "*.pem",
    "*.crt",
    "node_modules",
    ".git",
    "__pycache__",
]

# URL path appended to the (slash-stripped) base URL to form the upload endpoint.
UPLOAD_PATH = "/turnstone/api/context/docs"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# File collection
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _matches_any(name: str, patterns: list[str]) -> bool:
    """Return True if *name* matches at least one glob in *patterns*.

    Matching is delegated to ``fnmatch.fnmatch``. An empty *patterns* list
    never matches.
    """
    return any(fnmatch.fnmatch(name, pattern) for pattern in patterns)


def _collect_from_dir(
    root: Path,
    include: list[str],
    exclude: list[str],
    recursive: bool,
) -> list[Path]:
    """Collect the files under *root* that should be uploaded.

    Args:
        root: Directory to scan.
        include: Glob patterns tested against each file name. When empty,
            files are selected by suffix using ``INCLUDE_EXTS`` instead.
        exclude: Glob patterns tested against every path component *below*
            ``root`` (sub-directory names and the file name). A match on any
            component drops the file.
        recursive: Descend into sub-directories when True; otherwise only the
            direct children of ``root`` are considered.

    Returns:
        The selected file paths, sorted.
    """
    pattern = "**/*" if recursive else "*"
    candidates: list[Path] = []
    for p in root.glob(pattern):
        if not p.is_file():
            continue
        # Only components beneath ``root`` take part in exclusion. Testing the
        # full absolute path would let an ancestor directory of ``root`` that
        # happens to match an exclude glob (e.g. ".../SESSION_archive/docs")
        # silently discard every file of the source.
        relative_parts = p.relative_to(root).parts
        if any(_matches_any(part, exclude) for part in relative_parts):
            continue
        if include:
            if not _matches_any(p.name, include):
                continue
        elif p.suffix.lower() not in INCLUDE_EXTS:
            continue
        candidates.append(p)
    return sorted(candidates)
|
|
|
|
|
|
def resolve_sources(sources: list[dict]) -> list[tuple[Path, str]]:
    """Return list of (path, label) pairs from a manifest sources list.

    Each entry is a mapping with a required ``path`` key. A file entry yields
    one pair labelled with the entry's ``label`` (falling back to the file
    name). A directory entry is expanded with ``_collect_from_dir`` using the
    entry's ``include`` / ``exclude`` / ``recursive`` options, and every file
    found is labelled with its own file name.

    Entries that are malformed, have no ``path``, or point at something that
    does not exist are skipped with a warning on stderr rather than aborting
    the run. ``None`` (a bare ``sources:`` key in YAML) is treated as empty.
    """
    results: list[tuple[Path, str]] = []
    for entry in sources or []:
        raw_path = entry.get("path") if isinstance(entry, dict) else None
        if not raw_path:
            # An empty path would resolve to the current working directory and
            # upload whatever happens to be there, so refuse it explicitly.
            print(f" [WARN] source entry has no path, skipping: {entry!r}", file=sys.stderr)
            continue

        p = Path(raw_path).expanduser().resolve()
        if not p.exists():
            print(f" [WARN] path not found, skipping: {p}", file=sys.stderr)
            continue

        if p.is_file():
            label = entry.get("label", "")
            results.append((p, label or p.name))
        elif p.is_dir():
            # These options only make sense for directory sources.
            include: list[str] = entry.get("include", [])
            exclude: list[str] = entry.get("exclude", DEFAULT_EXCLUDES)
            recursive: bool = entry.get("recursive", False)
            found = _collect_from_dir(p, include, exclude, recursive)
            results.extend((f, f.name) for f in found)
        else:
            print(f" [WARN] not a file or directory, skipping: {p}", file=sys.stderr)

    return results
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Upload
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _build_multipart(boundary: bytes, filename: str, content: bytes) -> bytes:
    """Build a minimal multipart/form-data body for a single file field.

    Args:
        boundary: Boundary token, without the leading ``--``.
        filename: Name advertised in the part's Content-Disposition header.
        content: Raw file bytes, embedded unmodified.

    Returns:
        The encoded body: one part for the form field ``file`` followed by the
        closing boundary, with CRLF line endings.
    """
    # The filename sits inside a quoted header parameter. A double quote would
    # end the parameter early and a CR/LF would start a new header line, so
    # percent-encode those three characters (the same escaping browsers apply
    # to form-data file names).
    safe_name = (
        filename.replace("\r", "%0D").replace("\n", "%0A").replace('"', "%22")
    )
    lines: list[bytes] = [
        b"--" + boundary,
        f'Content-Disposition: form-data; name="file"; filename="{safe_name}"'.encode(),
        b"Content-Type: application/octet-stream",
        b"",
        content,
        b"--" + boundary + b"--",
        b"",
    ]
    return b"\r\n".join(lines)
|
|
|
|
|
|
def upload_file(base_url: str, path: Path, label: str) -> dict:
    """POST a file to Turnstone's context doc endpoint. Returns response dict.

    Args:
        base_url: Turnstone base URL; a trailing slash is tolerated.
        path: File whose bytes are uploaded.
        label: File name reported to the server; falls back to ``path.name``
            when empty.

    Returns:
        The decoded JSON object from the server on success. On any failure
        (unreadable file, HTTP error, network error, non-JSON or non-object
        response) a dict with a single ``"error"`` key describing the problem.
        This function does not raise for those cases so that one bad file
        cannot abort a batch.

    If the ``TURNSTONE_API_KEY`` environment variable is set, it is sent as a
    bearer token in the ``Authorization`` header.
    """
    import json
    import os
    import uuid

    url = base_url.rstrip("/") + UPLOAD_PATH
    filename = label or path.name

    try:
        content = path.read_bytes()
    except OSError as exc:
        return {"error": str(exc)}

    # A random suffix keeps the boundary from colliding with text inside the
    # uploaded document, which would truncate the part on the server side.
    boundary = f"----TurnstoneHarvest{uuid.uuid4().hex}".encode()
    body = _build_multipart(boundary, filename, content)
    headers = {"Content-Type": f"multipart/form-data; boundary={boundary.decode()}"}

    # NOTE(review): assumes the server expects "Authorization: Bearer <key>"
    # when API-key auth is enabled -- confirm against the server's auth check.
    api_key = os.environ.get("TURNSTONE_API_KEY")
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    req = urllib.request.Request(url, data=body, headers=headers, method="POST")
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        body_text = e.read().decode(errors="replace")
        return {"error": f"HTTP {e.code}: {body_text[:200]}"}
    except Exception as exc:
        # Best-effort boundary: network failures and malformed JSON are
        # reported to the caller instead of propagating.
        return {"error": str(exc)}

    if not isinstance(payload, dict):
        # Callers use dict methods on the result; surface anything else as an error.
        return {"error": f"unexpected response: {str(payload)[:200]}"}
    return payload
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: resolve the files to upload and upload them.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    The base URL is chosen with this precedence: ``--base-url`` on the command
    line, then ``base_url`` in the manifest, then ``http://localhost:8534``.

    Exits with status 1 when the manifest cannot be used or any upload fails,
    and with status 0 when there is nothing to upload.
    """
    default_base_url = "http://localhost:8534"

    parser = argparse.ArgumentParser(
        description="Bulk-upload docs into Turnstone context RAG.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--manifest", "-m",
        metavar="FILE",
        help="YAML manifest describing sources to upload",
    )
    parser.add_argument(
        "--base-url", "-u",
        # None (rather than the default URL) lets us tell "not given" apart
        # from "explicitly set to the default", so an explicit flag always
        # overrides the manifest.
        default=None,
        metavar="URL",
        help="Turnstone base URL (default: http://localhost:8534)",
    )
    parser.add_argument(
        "--dry-run", "-n",
        action="store_true",
        help="Show files that would be uploaded without actually uploading",
    )
    parser.add_argument(
        "files",
        nargs="*",
        metavar="FILE",
        help="Explicit files to upload (alternative to --manifest)",
    )
    args = parser.parse_args(argv)

    manifest_base_url = None
    sources: list[tuple[Path, str]] = []

    if args.manifest:
        if not _HAS_YAML:
            print("ERROR: PyYAML is required for --manifest. Run: pip install pyyaml", file=sys.stderr)
            sys.exit(1)
        manifest_path = Path(args.manifest).expanduser().resolve()
        if not manifest_path.exists():
            print(f"ERROR: manifest not found: {manifest_path}", file=sys.stderr)
            sys.exit(1)
        # An empty manifest parses to None; treat it as an empty mapping.
        data = yaml.safe_load(manifest_path.read_text(encoding="utf-8")) or {}
        if not isinstance(data, dict):
            print(f"ERROR: manifest must be a YAML mapping: {manifest_path}", file=sys.stderr)
            sys.exit(1)
        manifest_base_url = data.get("base_url")
        # A bare "sources:" key parses to None as well.
        sources = resolve_sources(data.get("sources") or [])

    base_url = args.base_url or manifest_base_url or default_base_url

    for raw in args.files:
        p = Path(raw).expanduser().resolve()
        if not p.exists():
            print(f" [WARN] not found, skipping: {p}", file=sys.stderr)
            continue
        if p.is_file():
            sources.append((p, p.name))
        else:
            print(f" [WARN] {p} is a directory; use a manifest with recursive:true for directory sources", file=sys.stderr)

    if not sources:
        print("No files to upload. Pass --manifest or explicit file paths.")
        sys.exit(0)

    print(f"Turnstone: {base_url}")
    print(f"Files to upload: {len(sources)}")
    if args.dry_run:
        print("\n[DRY RUN] Would upload:")
    print()

    ok = 0
    failed = 0
    for path, label in sources:
        size_kb = path.stat().st_size / 1024
        if args.dry_run:
            print(f" {label} ({size_kb:.1f} KB) ← {path}")
            ok += 1
            continue

        print(f" Uploading {label} ({size_kb:.1f} KB)…", end=" ", flush=True)
        result = upload_file(base_url, path, label)
        if "error" in result:
            print(f"FAILED — {result['error']}")
            failed += 1
        else:
            # The chunk count is read from either key; "?" when neither is present.
            chunks = result.get("chunks_written", result.get("chunks_created", "?"))
            facts = result.get("facts_written", 0)
            extra = f", {facts} facts" if facts else ""
            print(f"OK ({chunks} chunks{extra})")
            ok += 1

    print()
    if args.dry_run:
        print(f"Dry run complete. {ok} file(s) would be uploaded.")
    else:
        print(f"Done. {ok} uploaded, {failed} failed.")
        if failed:
            sys.exit(1)
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|