From b484e27004ed3db0443e86858a83718653f5109d Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Wed, 10 Jun 2026 15:01:19 -0700 Subject: [PATCH] chore(corpus): preserve watermark files across updates; document corpus env vars update.sh now backs up data/corpus_watermark.txt and data/incident_watermark.txt before git pull and restores them after, mirroring the existing watch.yaml pattern. Without this, an update would reset watermarks to zero and re-push all corpus entries from the beginning on the next export run. .env.example adds a corpus export section documenting the three env vars needed to opt a node into the Avocet training pipeline. Closes: https://git.opensourcesolarpunk.com/Circuit-Forge/turnstone/issues/6 --- .env.example | 9 +++++++++ scripts/update.sh | 33 ++++++++++++++++++++++++++++++--- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/.env.example b/.env.example index d7687f9..b1c81de 100644 --- a/.env.example +++ b/.env.example @@ -23,6 +23,15 @@ # Remote endpoint to push diagnostic bundles for escalation. # TURNSTONE_BUNDLE_ENDPOINT=https://example.com/api/bundles +# --- Log corpus export to Avocet (optional) --- +# Push ERROR/CRITICAL entries and labeled incidents to the Avocet corpus endpoint +# for logreading fine-tune training. Requires a consent token issued by CircuitForge (CF). +# Contact alan@circuitforge.tech to register your node and receive a token. +# Watermarks are stored at data/corpus_watermark.txt and data/incident_watermark.txt. +# AVOCET_CORPUS_ENDPOINT=https://avocet.circuitforge.tech/api/corpus/log-batch +# AVOCET_CONSENT_TOKEN=your-uuid-token-here +# TURNSTONE_SOURCE_HOST=my-server-name # defaults to system hostname if unset + # --- Periodic batch glean --- # Seconds between automatic glean runs from sources.yaml. Set to 0 to disable. 
# TURNSTONE_GLEAN_INTERVAL=900 diff --git a/scripts/update.sh b/scripts/update.sh index 50c7e7d..5db724c 100644 --- a/scripts/update.sh +++ b/scripts/update.sh @@ -6,8 +6,10 @@ # sudo bash /opt/turnstone/scripts/update.sh feat/live-watch # test a branch # # Local files preserved across updates: -# patterns/watch.yaml — site-specific watch source config -# data/ — database and live journal files (bind-mounted, untouched) +# patterns/watch.yaml — site-specific watch source config +# data/corpus_watermark.txt — corpus export watermark (last exported rowid) +# data/incident_watermark.txt — incident export watermark (last exported timestamp) +# data/ — database and live journal files (bind-mounted, untouched) set -euo pipefail @@ -21,7 +23,9 @@ echo "==> Turnstone update: branch=$BRANCH" # ── Preserve site-local config ──────────────────────────────────────────────── # watch.yaml is tracked in git as a template but overridden per-host. -# Back it up before the pull and restore it after. +# Corpus watermarks track the last exported entry/incident — must survive updates +# or the next export run will re-push everything from the beginning. +# Back them up before the pull and restore after. 
WATCH_YAML="$REPO_DIR/patterns/watch.yaml" WATCH_BACKUP="" if [ -f "$WATCH_YAML" ]; then @@ -29,6 +33,19 @@ if [ -f "$WATCH_YAML" ]; then cp "$WATCH_YAML" "$WATCH_BACKUP" fi +CORPUS_WM="$REPO_DIR/data/corpus_watermark.txt" +INCIDENT_WM="$REPO_DIR/data/incident_watermark.txt" +CORPUS_WM_BACKUP="" +INCIDENT_WM_BACKUP="" +if [ -f "$CORPUS_WM" ]; then + CORPUS_WM_BACKUP=$(mktemp /tmp/corpus-wm.XXXXXX) + cp "$CORPUS_WM" "$CORPUS_WM_BACKUP" +fi +if [ -f "$INCIDENT_WM" ]; then + INCIDENT_WM_BACKUP=$(mktemp /tmp/incident-wm.XXXXXX) + cp "$INCIDENT_WM" "$INCIDENT_WM_BACKUP" +fi + # ── Pull ────────────────────────────────────────────────────────────────────── git fetch --all --tags --quiet @@ -50,6 +67,16 @@ if [ -n "$WATCH_BACKUP" ]; then rm -f "$WATCH_BACKUP" echo "==> Restored patterns/watch.yaml" fi +if [ -n "$CORPUS_WM_BACKUP" ]; then + cp "$CORPUS_WM_BACKUP" "$CORPUS_WM" + rm -f "$CORPUS_WM_BACKUP" + echo "==> Restored data/corpus_watermark.txt" +fi +if [ -n "$INCIDENT_WM_BACKUP" ]; then + cp "$INCIDENT_WM_BACKUP" "$INCIDENT_WM" + rm -f "$INCIDENT_WM_BACKUP" + echo "==> Restored data/incident_watermark.txt" +fi # ── Build ───────────────────────────────────────────────────────────────────── echo "==> Building $IMAGE ..."