- Maps Purple Carrot parquet columns to the recipes table schema:
  Slug → external_id (pc_<slug>), Name → title,
  RecipeIngredientParts/RecipeInstructions → ingredients/directions
- Sets source='purplecarrot', category='meal-kit', servings=2
- Encodes allergens as allergen:<tag> keywords alongside HIGH-PROTEIN etc.
- Handles numpy ndarray columns from parquet (not plain Python lists)
- Upserts: inserts new rows, updates existing ones — safe to run repeatedly

Wire step 3 (ingest) into weekly_harvest.sh so the full pipeline is:
1. discover_current_menu.py → parquet of active menu slugs
2. scrape_live.py --resume → scrape only new slugs, append to live parquet
3. ingest_purplecarrot.py → upsert into /Library/Assets/kiwi/kiwi.db
41 lines
1.5 KiB
Bash
Executable file
41 lines
1.5 KiB
Bash
Executable file
#!/usr/bin/env bash
# Weekly Purple Carrot recipe harvest
# Runs every Sunday night via cron.
# Discovers this week's menu, scrapes full recipe data for new slugs, and
# ingests the scraped recipes into the shared corpus DB.
# Logs to /Library/Assets/kiwi/pipeline/logs/purple_carrot_harvest.log

# Abort on the first failing command, unset variable, or failed pipeline
# stage. pipefail matters here: every step is piped through `tee`, which
# would otherwise hide a non-zero exit from the python step.
set -euo pipefail

REPO="/Library/Development/CircuitForge/kiwi"
MENU_OUT="/Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet"
LIVE_OUT="/Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet"
DB_PATH="/Library/Assets/kiwi/kiwi.db"
LOG_DIR="/Library/Assets/kiwi/pipeline/logs"
LOG="$LOG_DIR/purple_carrot_harvest.log"

mkdir -p "$LOG_DIR"

# Leave a marker in the log when a step aborts the run, so a failed run is
# distinguishable from one that is still in progress (no "Done" footer yet).
trap 'echo "=== FAILED (exit $?) $(date -u "+%Y-%m-%d %H:%M UTC") ===" >> "$LOG"' ERR

echo "=== Purple Carrot harvest $(date -u '+%Y-%m-%d %H:%M UTC') ===" >> "$LOG"

# The script paths below are relative to the repository root.
cd "$REPO"

# Step 1: discover this week's menu slugs
echo "[1/3] Discovering current menu slugs..." | tee -a "$LOG"
conda run -n cf python3 scripts/pipeline/purple_carrot/discover_current_menu.py \
  --out "$MENU_OUT" 2>&1 | tee -a "$LOG"

# Step 2: scrape full recipe data for new slugs only (--resume skips already-scraped)
echo "[2/3] Scraping live recipe pages..." | tee -a "$LOG"
conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
  --slugs-from "$MENU_OUT" \
  --out "$LIVE_OUT" \
  --resume \
  --delay 3.0 2>&1 | tee -a "$LOG"

# Step 3: ingest new recipes into the shared corpus DB
echo "[3/3] Ingesting into corpus DB..." | tee -a "$LOG"
conda run -n cf python3 scripts/pipeline/ingest_purplecarrot.py \
  --parquet "$LIVE_OUT" \
  --db "$DB_PATH" 2>&1 | tee -a "$LOG"

echo "=== Done $(date -u '+%Y-%m-%d %H:%M UTC') ===" >> "$LOG"
echo "" >> "$LOG"