chore(corpus): preserve watermark files across updates; document corpus env vars

update.sh now backs up data/corpus_watermark.txt and data/incident_watermark.txt
before git pull and restores them after, mirroring the existing watch.yaml pattern.
Without this, an update would reset watermarks to zero and re-push all corpus
entries from the beginning on the next export run.

.env.example adds a corpus export section documenting the three env vars
needed to opt a node into the Avocet training pipeline.

Closes: #6
This commit is contained in:
pyr0ball 2026-06-10 15:01:19 -07:00
parent e2a78d45ef
commit 674e945004
2 changed files with 39 additions and 3 deletions

View file

@ -23,6 +23,15 @@
# Remote endpoint to push diagnostic bundles for escalation. # Remote endpoint to push diagnostic bundles for escalation.
# TURNSTONE_BUNDLE_ENDPOINT=https://example.com/api/bundles # TURNSTONE_BUNDLE_ENDPOINT=https://example.com/api/bundles
# --- Log corpus export to Avocet (optional) ---
# Push ERROR/CRITICAL entries and labeled incidents to the Avocet corpus endpoint
# for log-reading fine-tune training. Requires a consent token issued by CircuitForge (CF).
# Contact alan@circuitforge.tech to register your node and receive a token.
# Watermarks are stored at data/corpus_watermark.txt and data/incident_watermark.txt.
# AVOCET_CORPUS_ENDPOINT=https://avocet.circuitforge.tech/api/corpus/log-batch
# AVOCET_CONSENT_TOKEN=your-uuid-token-here
# TURNSTONE_SOURCE_HOST=my-server-name # defaults to system hostname if unset
# --- Periodic batch glean --- # --- Periodic batch glean ---
# Seconds between automatic glean runs from sources.yaml. Set to 0 to disable. # Seconds between automatic glean runs from sources.yaml. Set to 0 to disable.
# TURNSTONE_GLEAN_INTERVAL=900 # TURNSTONE_GLEAN_INTERVAL=900

View file

@ -6,8 +6,10 @@
# sudo bash /opt/turnstone/scripts/update.sh feat/live-watch # test a branch # sudo bash /opt/turnstone/scripts/update.sh feat/live-watch # test a branch
# #
# Local files preserved across updates: # Local files preserved across updates:
# patterns/watch.yaml — site-specific watch source config # patterns/watch.yaml — site-specific watch source config
# data/ — database and live journal files (bind-mounted, untouched) # data/corpus_watermark.txt — corpus export watermark (last exported rowid)
# data/incident_watermark.txt — incident export watermark (last exported timestamp)
# data/ — database and live journal files (bind-mounted, untouched)
set -euo pipefail set -euo pipefail
@ -21,7 +23,9 @@ echo "==> Turnstone update: branch=$BRANCH"
# ── Preserve site-local config ──────────────────────────────────────────────── # ── Preserve site-local config ────────────────────────────────────────────────
# watch.yaml is tracked in git as a template but overridden per-host. # watch.yaml is tracked in git as a template but overridden per-host.
# Back it up before the pull and restore it after. # Corpus watermarks track the last exported entry/incident — must survive updates
# or the next export run will re-push everything from the beginning.
# Back them up before the pull and restore after.
WATCH_YAML="$REPO_DIR/patterns/watch.yaml" WATCH_YAML="$REPO_DIR/patterns/watch.yaml"
WATCH_BACKUP="" WATCH_BACKUP=""
if [ -f "$WATCH_YAML" ]; then if [ -f "$WATCH_YAML" ]; then
@ -29,6 +33,19 @@ if [ -f "$WATCH_YAML" ]; then
cp "$WATCH_YAML" "$WATCH_BACKUP" cp "$WATCH_YAML" "$WATCH_BACKUP"
fi fi
# Corpus export watermarks: data/corpus_watermark.txt and
# data/incident_watermark.txt are snapshotted to /tmp here, before the pull.
# Each *_WM_BACKUP variable stays empty when its file is absent; a non-empty
# value is the path of the snapshot that has to be copied back afterwards.

# backup_watermark FILE PREFIX
#   Snapshot FILE into a fresh /tmp/PREFIX.XXXXXX file and print that path on
#   stdout. Prints nothing and succeeds when FILE does not exist.
#   Returns non-zero if the temp file cannot be created or the copy fails.
backup_watermark() {
  local src=$1 prefix=$2 snapshot
  [ -f "$src" ] || return 0
  # This function is called through $(...), where bash clears errexit, so
  # every step is checked explicitly instead of relying on `set -e`.
  snapshot=$(mktemp "/tmp/${prefix}.XXXXXX") || return
  # mktemp creates the file with mode 0600. -p stamps the watermark's own
  # mode, owner and timestamps onto the snapshot, so a restore that has to
  # re-create the file does not bring it back with the temp file's mode.
  if ! cp -p -- "$src" "$snapshot"; then
    rm -f -- "$snapshot"
    return 1
  fi
  printf '%s\n' "$snapshot"
}

CORPUS_WM="$REPO_DIR/data/corpus_watermark.txt"
INCIDENT_WM="$REPO_DIR/data/incident_watermark.txt"
# A failing helper makes the assignment fail, which aborts the script under
# `set -e` before anything is pulled.
CORPUS_WM_BACKUP=$(backup_watermark "$CORPUS_WM" corpus-wm)
INCIDENT_WM_BACKUP=$(backup_watermark "$INCIDENT_WM" incident-wm)
# ── Pull ────────────────────────────────────────────────────────────────────── # ── Pull ──────────────────────────────────────────────────────────────────────
git fetch --all --tags --quiet git fetch --all --tags --quiet
@ -50,6 +67,16 @@ if [ -n "$WATCH_BACKUP" ]; then
rm -f "$WATCH_BACKUP" rm -f "$WATCH_BACKUP"
echo "==> Restored patterns/watch.yaml" echo "==> Restored patterns/watch.yaml"
fi fi
# restore_watermark BACKUP DEST LABEL
#   Copy the snapshot BACKUP back over DEST, delete the snapshot, and announce
#   the restore using LABEL. An empty BACKUP means no snapshot was taken and
#   the call is a no-op.
#   If the copy fails, the snapshot is kept and its path is reported on stderr
#   (it is then the only surviving copy of the watermark), and the function
#   returns non-zero so the script stops under `set -e`.
restore_watermark() {
  local backup=$1 dest=$2 label=$3
  [ -n "$backup" ] || return 0
  if ! cp -- "$backup" "$dest"; then
    echo "ERROR: could not restore $label; backup kept at $backup" >&2
    return 1
  fi
  rm -f -- "$backup"
  echo "==> Restored $label"
}

restore_watermark "$CORPUS_WM_BACKUP" "$CORPUS_WM" data/corpus_watermark.txt
restore_watermark "$INCIDENT_WM_BACKUP" "$INCIDENT_WM" data/incident_watermark.txt
# ── Build ───────────────────────────────────────────────────────────────────── # ── Build ─────────────────────────────────────────────────────────────────────
echo "==> Building $IMAGE ..." echo "==> Building $IMAGE ..."