feat(pipeline): purple carrot weekly menu scraper with CF bypass

Add three new scripts for Purple Carrot recipe pipeline: - discover_current_menu.py: fetches this week's active menu slugs from /plant-based-recipes using requests (server-rendered HTML, no JS needed). Accumulates slugs across weekly runs for building a recipe corpus over time. - discover_slugs_categories.py: crawls recipe-category listing pages with ?page=N pagination to discover historical slug inventory. Note: category archive slugs (past menu items) 404 when scraped live; only use for identifying currently-featured recipes per category. - scrape_live.py: updated with --slugs-from flag (load slug inventory from any parquet, not just the default Wayback one) and fresh-context-per-slug pattern to bypass Cloudflare session-level bot detection (which fires on the 2nd+ request in a shared browser context). Discovery: the live site only renders full ingredient/instruction content for recipes currently on the active weekly menu. 23/23 current menu recipes scraped successfully (100% hit rate vs ~1% for archived slugs).
2026-05-21 16:16:32 -07:00 · 2026-05-21 16:16:32 -07:00 · a9ab996bcc
commit a9ab996bcc
parent 56f942b3fd
3 changed files with 588 additions and 0 deletions
--- a/scripts/pipeline/purple_carrot/discover_current_menu.py
+++ b/scripts/pipeline/purple_carrot/discover_current_menu.py
@ -0,0 +1,120 @@
+"""Discover Purple Carrot's current weekly menu recipe slugs.
+
+The main /plant-based-recipes listing page always renders the current week's
+menu as server-side HTML.  This script pulls those slugs and writes them to a
+parquet that can be passed directly to scrape_live.py via --slugs-from.
+
+Run weekly (e.g. via cron) to accumulate new recipes as the menu rotates.
+
+Usage:
+    conda run -n cf python3 scripts/pipeline/purple_carrot/discover_current_menu.py \
+        [--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet]
+
+Then scrape:
+    conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
+        --slugs-from /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet \
+        --out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet \
+        --resume
+"""
+
+from __future__ import annotations
+
+import re
+import sys
+from datetime import date
+from pathlib import Path
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+# ── Config ─────────────────────────────────────────────────────────────────────
+
+# Listing page fetched by discover_current_slugs() to read the menu's recipe links.
+LISTING_URL = "https://www.purplecarrot.com/plant-based-recipes"
+# Template for a recipe page URL; main() fills in {slug} to build each SourceURL.
+BASE_URL = "https://www.purplecarrot.com/recipe/{slug}"
+
+# Default value of --out: the parquet that accumulates discovered slugs.
+DEFAULT_OUT = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet")
+
+# Headers sent with the listing-page request (desktop-Chrome-style User-Agent
+# plus HTML-oriented Accept headers).
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    ),
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+}
+
+# Captures everything after "/recipe/" up to the first "?" or "#".  Used both to
+# select <a> tags by href and to pull the slug out of the matched href.
+RECIPE_HREF_RE = re.compile(r"/recipe/([^?#]+)")
+
+
+# ── Main ───────────────────────────────────────────────────────────────────────
+
+def discover_current_slugs() -> list[str]:
+    """Fetch the listing page and return unique recipe slugs from the current menu."""
+    resp = requests.get(LISTING_URL, headers=HEADERS, timeout=15)
+    if resp.status_code != 200:
+        print(f"ERROR: listing page returned HTTP {resp.status_code}", file=sys.stderr)
+        return []
+
+    soup = BeautifulSoup(resp.text, "html.parser")
+    slugs: list[str] = []
+    seen: set[str] = set()
+    for a in soup.find_all("a", href=RECIPE_HREF_RE):
+        m = RECIPE_HREF_RE.search(a["href"])
+        if m:
+            slug = m.group(1)
+            if slug not in seen:
+                seen.add(slug)
+                slugs.append(slug)
+    return slugs
+
+
+def main() -> None:
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out", type=Path, default=DEFAULT_OUT)
+    args = parser.parse_args()
+
+    print(f"Fetching current menu from {LISTING_URL} …")
+    slugs = discover_current_slugs()
+
+    if not slugs:
+        print("No slugs found — the listing page may have changed structure or blocked the request.")
+        sys.exit(1)
+
+    today = date.today().isoformat()
+    records = [
+        {
+            "Slug": slug,
+            "SourceURL": BASE_URL.format(slug=slug),
+            "Source": "purplecarrot_menu",
+            "DiscoveredDate": today,
+        }
+        for slug in slugs
+    ]
+
+    # Merge with any existing menu parquet (accumulate weeks)
+    df_new = pd.DataFrame(records)
+    args.out.parent.mkdir(parents=True, exist_ok=True)
+
+    if args.out.exists():
+        df_prev = pd.read_parquet(args.out)
+        combined = pd.concat([df_prev, df_new], ignore_index=True)
+        combined = combined.drop_duplicates(subset=["Slug"], keep="first")
+        df_new = combined
+
+    df_new.to_parquet(args.out, index=False)
+
+    print(f"Found {len(slugs)} current-menu slugs this week:")
+    for s in slugs:
+        print(f"  {s}")
+    print(f"\nSaved {len(df_new)} total slugs (accumulated) to {args.out}")
+    print(f"\nTo scrape full recipes:")
+    print(f"  conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \\")
+    print(f"    --slugs-from {args.out} \\")
+    print(f"    --out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet \\")
+    print(f"    --resume")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/pipeline/purple_carrot/discover_slugs_categories.py
+++ b/scripts/pipeline/purple_carrot/discover_slugs_categories.py
@ -0,0 +1,218 @@
+"""Discover Purple Carrot recipe slugs by crawling all recipe-category listing pages.
+
+The site serves full server-rendered HTML for category pages, paginated via
+?page=N.  Each page loads 18 recipe cards.  This script crawls every category
+across all pages and writes a deduplicated slug inventory.
+
+Usage:
+    conda run -n cf python3 scripts/pipeline/purple_carrot/discover_slugs_categories.py \
+        [--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_slugs.parquet] \
+        [--delay 2.0] \
+        [--max-pages 50]   # safety cap per category (comfort-foods has ~18)
+"""
+
+from __future__ import annotations
+
+import argparse
+import re
+import time
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+# ── Config ─────────────────────────────────────────────────────────────────────
+
+# Site root; category listing URLs are built as {BASE}/recipe-categories/<category>.
+BASE = "https://www.purplecarrot.com"
+
+# All known category slugs (from /plant-based-recipes nav)
+# Crawled in this order by main() unless --categories is given.
+# NOTE(review): both "comfort-foods" and "comfort" are listed — confirm both
+# exist on the site and that neither is a stale alias of the other.
+CATEGORIES: list[str] = [
+    "comfort-foods",
+    "family-friendly",
+    "healthy-desserts",
+    "holiday-recipes",
+    "quick-and-easy",
+    "party-foods",
+    "seasonal-menu",
+    "spring-recipes",
+    "summer-recipes",
+    "fall-recipes",
+    "winter-recipes",
+    "african",
+    "american",
+    "asian",
+    "comfort",
+    "french",
+    "indian",
+    "italian",
+    "mediterranean",
+    "mexican",
+    "middle-eastern",
+    "soups",
+    "salads",
+    "bowls",
+    "pasta",
+    "sandwiches-wraps",
+    "tacos",
+    "breakfast",
+    "snacks-sides",
+]
+
+# Default value of --out: where the merged slug inventory is written.
+DEFAULT_OUT = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot_slugs.parquet")
+# Parquet whose "Slug" column seeds the inventory in main() when the file exists.
+EXISTING_PARQUET = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot.parquet")
+
+# CSS selector for the recipe-card anchors on a listing page.
+RECIPE_LINK_SELECTOR = "a.c-recipe__title"
+# Captures everything after "/recipe/" up to the first "?" or "#" in an href.
+SLUG_RE = re.compile(r"/recipe/([^?#]+)")
+
+# Headers sent with every listing-page request (desktop-Chrome-style User-Agent
+# plus HTML-oriented Accept headers).
+HEADERS = {
+    "User-Agent": (
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    ),
+    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    "Accept-Language": "en-US,en;q=0.5",
+}
+
+
+# ── Helpers ────────────────────────────────────────────────────────────────────
+
+def _fetch_html(url: str, session: requests.Session) -> str | None:
+    """Fetch URL and return HTML string, or None on failure."""
+    try:
+        resp = session.get(url, headers=HEADERS, timeout=15)
+        if resp.status_code == 200:
+            return resp.text
+        if resp.status_code == 404:
+            return None   # expected end of pagination
+        print(f"  HTTP {resp.status_code} — {url}")
+        return None
+    except Exception as exc:
+        print(f"  ERROR fetching {url}: {exc}")
+        return None
+
+
+def _extract_slugs(html: str) -> list[str]:
+    """Pull recipe slugs from one listing-page HTML response."""
+    soup = BeautifulSoup(html, "html.parser")
+    slugs: list[str] = []
+    for a in soup.select(RECIPE_LINK_SELECTOR):
+        href = a.get("href", "")
+        m = SLUG_RE.search(href)
+        if m:
+            slugs.append(m.group(1))
+    return slugs
+
+
+def _get_category_total(html: str) -> int | None:
+    """Try to parse the recipe count shown on the category page (e.g. '319 Recipes')."""
+    m = re.search(r"(\d+)\s+Recipes?\b", html)
+    return int(m.group(1)) if m else None
+
+
+def _discover_category(
+    category: str,
+    session: requests.Session,
+    delay: float,
+    max_pages: int,
+) -> tuple[list[str], int]:
+    """Crawl all pages of a category, return (slugs, pages_fetched)."""
+    slugs: list[str] = []
+    for page_num in range(1, max_pages + 1):
+        if page_num == 1:
+            url = f"{BASE}/recipe-categories/{category}"
+        else:
+            url = f"{BASE}/recipe-categories/{category}?page={page_num}"
+
+        html = _fetch_html(url, session)
+        if html is None:
+            break   # 404 or error = past the end
+
+        page_slugs = _extract_slugs(html)
+        if not page_slugs:
+            # Show total if we got a page but no links (category slug may be wrong)
+            if page_num == 1:
+                total = _get_category_total(html)
+                if total is not None:
+                    print(f"  page 1 loaded (total={total}) but 0 recipe links — selector may need updating")
+            break
+
+        slugs.extend(page_slugs)
+
+        # Print progress
+        total_hint = _get_category_total(html) if page_num == 1 else None
+        total_str = f" / {total_hint}" if total_hint else ""
+        print(f"  page {page_num}: +{len(page_slugs)} slugs ({len(slugs)}{total_str} cumulative)")
+
+        if len(page_slugs) < 18:
+            # Short page = last page
+            break
+
+        time.sleep(delay)
+
+    return slugs, (len(slugs) + 17) // 18  # approximate pages
+
+
+# ── Main ───────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out", type=Path, default=DEFAULT_OUT)
+    parser.add_argument("--delay", type=float, default=2.0,
+                        help="Seconds between page requests")
+    parser.add_argument("--max-pages", type=int, default=50,
+                        help="Safety cap on pages per category")
+    parser.add_argument("--categories", nargs="*",
+                        help="Crawl only these category slugs (default: all)")
+    args = parser.parse_args()
+
+    categories = args.categories or CATEGORIES
+
+    # Seed with any slugs from the Wayback parquet
+    known_slugs: set[str] = set()
+    if EXISTING_PARQUET.exists():
+        df_wb = pd.read_parquet(EXISTING_PARQUET)
+        known_slugs = set(df_wb["Slug"].dropna().tolist())
+        print(f"Seeded with {len(known_slugs)} slugs from Wayback parquet")
+
+    all_records: list[dict[str, Any]] = []
+    session = requests.Session()
+
+    for category in categories:
+        print(f"\n[{category}]")
+        cat_slugs, pages = _discover_category(category, session, args.delay, args.max_pages)
+        for slug in cat_slugs:
+            all_records.append({"Slug": slug, "Category": category, "Source": "purplecarrot_category"})
+        print(f"  → {len(cat_slugs)} slugs across ~{pages} pages")
+        time.sleep(args.delay)
+
+    if not all_records:
+        print("\nNo records found — check that categories are correct and the site is accessible")
+        return
+
+    # Deduplicate keeping first category encountered
+    df_new = pd.DataFrame(all_records)
+    df_new = df_new.drop_duplicates(subset=["Slug"], keep="first")
+
+    # Also include Wayback slugs not already in the new set
+    if known_slugs:
+        wb_only = known_slugs - set(df_new["Slug"].tolist())
+        if wb_only:
+            df_wb_extra = pd.DataFrame([
+                {"Slug": s, "Category": "wayback", "Source": "purplecarrot_wayback"}
+                for s in wb_only
+            ])
+            df_new = pd.concat([df_new, df_wb_extra], ignore_index=True)
+
+    args.out.parent.mkdir(parents=True, exist_ok=True)
+    df_new.to_parquet(args.out, index=False)
+
+    new_count = len(df_new)
+    cat_count = len(df_new[df_new["Source"] == "purplecarrot_category"])
+    print(f"\nDone — {new_count} total slugs saved to {args.out}")
+    print(f"  {cat_count} from category pages, {new_count - cat_count} from Wayback only")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/pipeline/purple_carrot/scrape_live.py
+++ b/scripts/pipeline/purple_carrot/scrape_live.py
@ -0,0 +1,250 @@
+"""Playwright scraper for live purplecarrot.com recipe pages.
+
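+By default uses the slug inventory already in recipes_purplecarrot.parquet (or
+any parquet with a "Slug" column passed via --slugs-from) and fills in the
+missing ingredients/instructions by hitting the live site directly.
+
+Usage:
+    conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
+        [--slugs-from /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet] \
+        [--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet] \
+        [--delay 2.5] \
+        [--limit 20] \
+        [--resume]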
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import time
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+from playwright.sync_api import sync_playwright, Page, TimeoutError as PWTimeout
+
+# ── Config ─────────────────────────────────────────────────────────────────────
+
+# Template for the live recipe URL; main() fills in {slug} for each page visit.
+BASE_URL = "https://www.purplecarrot.com/recipe/{slug}"
+# Default value of --out: where scraped recipes are written.
+DEFAULT_OUT = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet")
+# Slug inventory read when --slugs-from is not given.
+EXISTING_PARQUET = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot.parquet")
+
+# Fixed pause after navigation, passed to page.wait_for_timeout() before parsing.
+RENDER_WAIT_MS = 2500   # JS render settle time
+# Navigation timeout passed to page.goto().
+NAV_TIMEOUT_MS = 20_000
+
+
+# ── Page parser ────────────────────────────────────────────────────────────────
+
+def _text(page: Page, selector: str) -> str:
+    el = page.query_selector(selector)
+    return el.inner_text().strip() if el else ""
+
+
+def _texts(page: Page, selector: str) -> list[str]:
+    return [el.inner_text().strip() for el in page.query_selector_all(selector)]
+
+
+def _parse_recipe(page: Page, slug: str, source_url: str) -> dict[str, Any] | None:
+    """Extract structured recipe data from the rendered page."""
+    body = page.inner_text("body")
+
+    # Abort if we've been bounced to a generic listing / 404
+    if "Page Not Found" in body or slug not in page.url:
+        return None
+
+    # ── Title ──────────────────────────────────────────────────────────────────
+    # The <h1> on product pages tends to be the recipe name
+    title = (_text(page, "h1") or _text(page, "[class*='recipe-title']")).strip()
+    if not title:
+        # Fallback: first heading-like text before "Ingredients"
+        idx = body.find("Ingredients\n")
+        title = body[:idx].strip().splitlines()[-1] if idx > 0 else ""
+
+    # ── Ingredients / Instructions via body text ───────────────────────────────
+    ing_start = body.find("\nIngredients\n")
+    inst_start = body.find("\nInstructions\n")
+    footer_start = body.find("\nShop\n")   # footer sentinel
+
+    if ing_start == -1:
+        return None   # page didn't render recipe content
+
+    raw_ingredients: list[str] = []
+    raw_instructions: list[str] = []
+
+    if ing_start != -1 and inst_start != -1:
+        ing_block = body[ing_start + len("\nIngredients\n"):inst_start].strip()
+        raw_ingredients = [l.strip() for l in ing_block.splitlines() if l.strip()]
+
+    if inst_start != -1:
+        end = footer_start if footer_start > inst_start else len(body)
+        inst_block = body[inst_start + len("\nInstructions\n"):end].strip()
+        # Steps start with a digit
+        steps: list[str] = []
+        current: list[str] = []
+        for line in inst_block.splitlines():
+            line = line.strip()
+            if not line:
+                continue
+            if re.match(r"^\d+$", line):
+                if current:
+                    steps.append(" ".join(current))
+                current = []
+            elif line.startswith("CULINARY NOTES"):
+                break
+            else:
+                current.append(line)
+        if current:
+            steps.append(" ".join(current))
+        raw_instructions = steps
+
+    # ── Nutrition ──────────────────────────────────────────────────────────────
+    def _extract_num(pattern: str) -> float | None:
+        m = re.search(pattern, body)
+        try:
+            return float(m.group(1)) if m else None
+        except ValueError:
+            return None
+
+    cal   = _extract_num(r"(\d+)\s*CAL")
+    fat   = _extract_num(r"(\d+(?:\.\d+)?)g\s*FAT")
+    carbs = _extract_num(r"(\d+(?:\.\d+)?)g\s*CARBS")
+    prot  = _extract_num(r"(\d+(?:\.\d+)?)g\s*PROTEIN")
+    fiber = _extract_num(r"(\d+(?:\.\d+)?)g\s*FIBER")
+
+    # ── Allergens / tags ───────────────────────────────────────────────────────
+    allergen_m = re.search(r"Allergens?:\s*([^\n]+)", body)
+    allergens = allergen_m.group(1).strip() if allergen_m else ""
+
+    # Feature tags like HIGH-PROTEIN, QUICK, etc. appear before Ingredients
+    pre_ing = body[:ing_start]
+    tags = re.findall(r"\b(HIGH-PROTEIN|QUICK|SPICY|LOW[\-\s]CALORIE|VEGAN|FAMILY\s+FRIENDLY)\b", pre_ing)
+
+    return {
+        "Slug": slug,
+        "Name": title,
+        "SourceURL": source_url,
+        "Source": "purplecarrot_live",
+        "RecipeIngredientParts": raw_ingredients,
+        "RecipeInstructions": raw_instructions,
+        "Calories": cal,
+        "FatContent": fat,
+        "CarbohydrateContent": carbs,
+        "ProteinContent": prot,
+        "FiberContent": fiber,
+        "Allergens": allergens,
+        "Keywords": tags,
+        "HasFullRecipe": bool(raw_ingredients and raw_instructions),
+    }
+
+
+# ── Main ───────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--out", type=Path, default=DEFAULT_OUT)
+    parser.add_argument("--delay", type=float, default=2.5,
+                        help="Seconds between requests (be polite)")
+    parser.add_argument("--limit", type=int, default=0,
+                        help="Stop after N slugs (0 = all)")
+    parser.add_argument("--resume", action="store_true",
+                        help="Skip slugs already present in --out")
+    parser.add_argument("--slugs-from", type=Path, default=None,
+                        help="Read slug inventory from this parquet instead of the default Wayback one")
+    args = parser.parse_args()
+
+    # Load slug inventory — either from a custom parquet or the default Wayback run
+    slugs_parquet = args.slugs_from if args.slugs_from else EXISTING_PARQUET
+    df_existing = pd.read_parquet(slugs_parquet)
+    slugs = df_existing["Slug"].dropna().unique().tolist()
+    # source_urls may not be present in custom parcets — fall back to constructing from slug
+    if "SourceURL" in df_existing.columns:
+        source_urls = dict(zip(df_existing["Slug"], df_existing["SourceURL"]))
+    else:
+        source_urls = {s: BASE_URL.format(slug=s) for s in slugs}
+
+    # Resume support
+    done_slugs: set[str] = set()
+    if args.resume and args.out.exists():
+        df_done = pd.read_parquet(args.out)
+        done_slugs = set(df_done["Slug"].dropna().tolist())
+        print(f"Resuming — {len(done_slugs)} slugs already scraped")
+
+    if args.limit:
+        slugs = slugs[: args.limit]
+
+    results: list[dict[str, Any]] = []
+    skipped = 0
+    failed = 0
+
+    _UA = (
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
+    )
+
+    with sync_playwright() as p:
+        browser = p.chromium.launch(headless=True)
+
+        for i, slug in enumerate(slugs):
+            if slug in done_slugs:
+                skipped += 1
+                continue
+
+            url = BASE_URL.format(slug=slug)
+            print(f"[{i+1}/{len(slugs)}] {slug} … ", end="", flush=True)
+
+            # Use a fresh browser context per slug to avoid Cloudflare session-level
+            # bot detection, which fires on the 2nd+ request in the same context.
+            context = browser.new_context(
+                user_agent=_UA,
+                viewport={"width": 1280, "height": 900},
+            )
+            page = context.new_page()
+
+            try:
+                page.goto(url, timeout=NAV_TIMEOUT_MS, wait_until="domcontentloaded")
+                page.wait_for_timeout(RENDER_WAIT_MS)
+                recipe = _parse_recipe(page, slug, source_urls.get(slug, url))
+            except PWTimeout:
+                print("TIMEOUT")
+                failed += 1
+            except Exception as exc:
+                print(f"ERROR: {exc}")
+                failed += 1
+            else:
+                if recipe is None:
+                    print("no content (404 or redirect)")
+                    failed += 1
+                elif recipe["HasFullRecipe"]:
+                    n = len(recipe["RecipeIngredientParts"])
+                    s = len(recipe["RecipeInstructions"])
+                    print(f"OK  ({n} ingredients, {s} steps)")
+                    results.append(recipe)
+                else:
+                    print(f"partial (ings={len(recipe['RecipeIngredientParts'])}, steps={len(recipe['RecipeInstructions'])})")
+                    results.append(recipe)
+            finally:
+                context.close()
+
+            time.sleep(args.delay)
+
+        browser.close()
+
+    print(f"\nDone — {len(results)} scraped, {skipped} skipped, {failed} failed")
+
+    if results:
+        df_out = pd.DataFrame(results)
+        # Merge with existing metadata (nutrition stubs, wayback fields) for slugs
+        # that didn't previously have full data
+        args.out.parent.mkdir(parents=True, exist_ok=True)
+        if args.resume and args.out.exists():
+            df_prev = pd.read_parquet(args.out)
+            df_out = pd.concat([df_prev, df_out], ignore_index=True)
+            df_out = df_out.drop_duplicates(subset=["Slug"], keep="last")
+        df_out.to_parquet(args.out, index=False)
+        full_count = df_out["HasFullRecipe"].sum() if "HasFullRecipe" in df_out.columns else "?"
+        print(f"Saved {len(df_out)} rows to {args.out}  ({full_count} with full recipes)")
+    else:
+        print("No results — output not written")
+
+
+if __name__ == "__main__":
+    main()