#!/usr/bin/env python3 """harvest_docs.py — Bulk-upload documentation into Turnstone's context RAG. Reads a YAML manifest that describes which files or directories to upload, then POSTs each file to the Turnstone /api/context/docs endpoint. Usage: # From a manifest file python harvest_docs.py --manifest manifests/my-cluster.yaml # Explicit files (no manifest needed) python harvest_docs.py --base-url http://localhost:8534 file1.md dir/file2.yaml # Dry run — show what would be uploaded without sending python harvest_docs.py --manifest manifests/my-cluster.yaml --dry-run Manifest format (YAML): base_url: http://localhost:8534 # optional; overridden by --base-url sources: - path: /absolute/path/to/file.md label: friendly-name # optional; overrides filename in DB - path: /absolute/path/to/dir/ include: ["*.md", "*.yaml"] # glob patterns; default: see INCLUDE_EXTS exclude: ["CLAUDE*", "SESSION_*", "*_keys*"] recursive: false # default false """ from __future__ import annotations import argparse import fnmatch import sys import urllib.request import urllib.error from pathlib import Path try: import yaml _HAS_YAML = True except ImportError: _HAS_YAML = False # File extensions included when walking a directory with no explicit `include`. INCLUDE_EXTS = {".md", ".yaml", ".yml", ".txt", ".conf", ".rst"} # Default exclude patterns applied to every directory source (unless overridden). DEFAULT_EXCLUDES = [ "CLAUDE*", "SESSION_*", "HANDOFF_*", "*.key", "*.pem", "*.crt", "node_modules", ".git", "__pycache__", ] UPLOAD_PATH = "/turnstone/api/context/docs" # --------------------------------------------------------------------------- # File collection # --------------------------------------------------------------------------- def _matches_any(name: str, patterns: list[str]) -> bool: return any(fnmatch.fnmatch(name, p) for p in patterns) def _collect_from_dir( root: Path, include: list[str], exclude: list[str], recursive: bool, ) -> list[Path]: pattern = "**/*" if recursive else "*" candidates: list[Path] = [] for p in root.glob(pattern): if not p.is_file(): continue # Exclude any path component that matches an exclude pattern if any(_matches_any(part, exclude) for part in p.parts): continue if include: if not _matches_any(p.name, include): continue else: if p.suffix.lower() not in INCLUDE_EXTS: continue candidates.append(p) return sorted(candidates) def resolve_sources(sources: list[dict]) -> list[tuple[Path, str]]: """Return list of (path, label) pairs from a manifest sources list.""" results: list[tuple[Path, str]] = [] for entry in sources: raw_path = entry.get("path", "") p = Path(raw_path).expanduser().resolve() label: str = entry.get("label", "") include: list[str] = entry.get("include", []) exclude: list[str] = entry.get("exclude", DEFAULT_EXCLUDES) recursive: bool = entry.get("recursive", False) if not p.exists(): print(f" [WARN] path not found, skipping: {p}", file=sys.stderr) continue if p.is_file(): results.append((p, label or p.name)) elif p.is_dir(): found = _collect_from_dir(p, include, exclude, recursive) for f in found: results.append((f, f.name)) else: print(f" [WARN] not a file or directory, skipping: {p}", file=sys.stderr) return results # --------------------------------------------------------------------------- # Upload # --------------------------------------------------------------------------- def _build_multipart(boundary: bytes, filename: str, content: bytes) -> bytes: """Build a minimal multipart/form-data body for a single file field.""" lines: list[bytes] = [ b"--" + boundary, f'Content-Disposition: form-data; name="file"; filename="{filename}"'.encode(), b"Content-Type: application/octet-stream", b"", content, b"--" + boundary + b"--", b"", ] return b"\r\n".join(lines) def upload_file(base_url: str, path: Path, label: str) -> dict: """POST a file to Turnstone's context doc endpoint. Returns response dict.""" url = base_url.rstrip("/") + UPLOAD_PATH content = path.read_bytes() filename = label or path.name boundary = b"----TurnstoneHarvest" body = _build_multipart(boundary, filename, content) content_type = f"multipart/form-data; boundary={boundary.decode()}" req = urllib.request.Request( url, data=body, headers={"Content-Type": content_type}, method="POST", ) try: with urllib.request.urlopen(req, timeout=30) as resp: import json return json.loads(resp.read()) except urllib.error.HTTPError as e: body_text = e.read().decode(errors="replace") return {"error": f"HTTP {e.code}: {body_text[:200]}"} except Exception as exc: return {"error": str(exc)} # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def main() -> None: parser = argparse.ArgumentParser( description="Bulk-upload docs into Turnstone context RAG.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=__doc__, ) parser.add_argument( "--manifest", "-m", metavar="FILE", help="YAML manifest describing sources to upload", ) parser.add_argument( "--base-url", "-u", default="http://localhost:8534", metavar="URL", help="Turnstone base URL (default: http://localhost:8534)", ) parser.add_argument( "--dry-run", "-n", action="store_true", help="Show files that would be uploaded without actually uploading", ) parser.add_argument( "files", nargs="*", metavar="FILE", help="Explicit files to upload (alternative to --manifest)", ) args = parser.parse_args() base_url = args.base_url sources: list[tuple[Path, str]] = [] if args.manifest: if not _HAS_YAML: print("ERROR: PyYAML is required for --manifest. Run: pip install pyyaml", file=sys.stderr) sys.exit(1) manifest_path = Path(args.manifest).expanduser().resolve() if not manifest_path.exists(): print(f"ERROR: manifest not found: {manifest_path}", file=sys.stderr) sys.exit(1) data = yaml.safe_load(manifest_path.read_text()) base_url = args.base_url if args.base_url != "http://localhost:8534" else data.get("base_url", base_url) sources = resolve_sources(data.get("sources", [])) for raw in args.files: p = Path(raw).expanduser().resolve() if not p.exists(): print(f" [WARN] not found, skipping: {p}", file=sys.stderr) continue if p.is_file(): sources.append((p, p.name)) else: print(f" [WARN] {p} is a directory; use a manifest with recursive:true for directory sources", file=sys.stderr) if not sources: print("No files to upload. Pass --manifest or explicit file paths.") sys.exit(0) print(f"Turnstone: {base_url}") print(f"Files to upload: {len(sources)}") if args.dry_run: print("\n[DRY RUN] Would upload:") print() ok = 0 failed = 0 for path, label in sources: size_kb = path.stat().st_size / 1024 if args.dry_run: print(f" {label} ({size_kb:.1f} KB) ← {path}") ok += 1 continue print(f" Uploading {label} ({size_kb:.1f} KB)…", end=" ", flush=True) result = upload_file(base_url, path, label) if "error" in result: print(f"FAILED — {result['error']}") failed += 1 else: chunks = result.get("chunks_written", result.get("chunks_created", "?")) facts = result.get("facts_written", 0) extra = f", {facts} facts" if facts else "" print(f"OK ({chunks} chunks{extra})") ok += 1 print() if args.dry_run: print(f"Dry run complete. {ok} file(s) would be uploaded.") else: print(f"Done. {ok} uploaded, {failed} failed.") if failed: sys.exit(1) if __name__ == "__main__": main()