#!/usr/bin/env bash
# Weekly Purple Carrot recipe harvest
# Runs every Sunday night via cron.
# Discovers this week's menu and scrapes full recipe data.
#
# Environment (both optional; defaults match the production layout):
#   KIWI_REPO          repository checkout the pipeline scripts are run from
#                      (default: /Library/Development/CircuitForge/kiwi)
#   KIWI_PIPELINE_DIR  directory holding the parquet outputs and logs/
#                      (default: /Library/Assets/kiwi/pipeline)
#
# Logs to $KIWI_PIPELINE_DIR/logs/purple_carrot_harvest.log
#
# Exit status: 0 when both steps succeed; otherwise the status of the first
# failing command, which is also recorded in the log.

# -E makes the ERR trap below fire inside functions and subshells as well.
set -Eeuo pipefail

readonly REPO="${KIWI_REPO:-/Library/Development/CircuitForge/kiwi}"
readonly PIPELINE_DIR="${KIWI_PIPELINE_DIR:-/Library/Assets/kiwi/pipeline}"
readonly MENU_OUT="$PIPELINE_DIR/recipes_purplecarrot_menu.parquet"
readonly LIVE_OUT="$PIPELINE_DIR/recipes_purplecarrot_live.parquet"
readonly LOG_DIR="$PIPELINE_DIR/logs"
readonly LOG="$LOG_DIR/purple_carrot_harvest.log"

# Print the current UTC time in the format used by the log banners.
utc_now() {
  date -u '+%Y-%m-%d %H:%M UTC'
}

# Write a progress message to stdout and append it to the log.
say() {
  printf '%s\n' "$*" | tee -a "$LOG"
}

# ERR-trap handler: record which line failed and with what status, so a
# crashed run is distinguishable in the log from one that is still running.
# Arguments: $1 - exit status of the failing command, $2 - its line number.
log_failure() {
  local status=$1 line=$2
  local msg
  msg="=== FAILED (exit ${status}, line ${line}) $(utc_now) ==="
  # Best effort: the log itself may be the thing that is unwritable.
  printf '%s\n\n' "$msg" >> "$LOG" || true
  printf '%s\n' "$msg" >&2
}

mkdir -p "$LOG_DIR"

# Installed only once the log directory exists, so the handler can append.
trap 'log_failure "$?" "$LINENO"' ERR

printf '=== Purple Carrot harvest %s ===\n' "$(utc_now)" >> "$LOG"

# The pipeline scripts are referenced by repo-relative path below.
cd "$REPO"

# Step 1: discover this week's menu slugs
say "[1/2] Discovering current menu slugs..."
conda run -n cf python3 scripts/pipeline/purple_carrot/discover_current_menu.py \
  --out "$MENU_OUT" 2>&1 \
  | tee -a "$LOG"

# Step 2: scrape full recipe data for new slugs only (--resume skips already-scraped)
say "[2/2] Scraping live recipe pages..."
conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
  --slugs-from "$MENU_OUT" \
  --out "$LIVE_OUT" \
  --resume \
  --delay 3.0 2>&1 \
  | tee -a "$LOG"

# Closing banner plus a blank separator line between runs.
printf '=== Done %s ===\n\n' "$(utc_now)" >> "$LOG"