kiwi/scripts/pipeline/purple_carrot/discover_current_menu.py
pyr0ball a9ab996bcc
Some checks are pending
CI / Backend (Python) (push) Waiting to run
CI / Frontend (Vue) (push) Waiting to run
Mirror / mirror (push) Waiting to run
feat(pipeline): purple carrot weekly menu scraper with CF bypass
Add three new scripts for Purple Carrot recipe pipeline:

- discover_current_menu.py: fetches this week's active menu slugs from
  /plant-based-recipes using requests (server-rendered HTML, no JS needed).
  Accumulates slugs across weekly runs for building a recipe corpus over time.

- discover_slugs_categories.py: crawls recipe-category listing pages with
  ?page=N pagination to discover historical slug inventory. Note: category
  archive slugs (past menu items) 404 when scraped live; only use for
  identifying currently-featured recipes per category.

- scrape_live.py: updated with --slugs-from flag (load slug inventory from
  any parquet, not just the default Wayback one) and fresh-context-per-slug
  pattern to bypass Cloudflare session-level bot detection (which fires on
  the 2nd+ request in a shared browser context).

Discovery: the live site only renders full ingredient/instruction content for
recipes currently on the active weekly menu. 23/23 current menu recipes
scraped successfully (100% hit rate vs ~1% for archived slugs).
2026-05-21 16:16:32 -07:00

120 lines
4.2 KiB
Python

"""Discover Purple Carrot's current weekly menu recipe slugs.

The main /plant-based-recipes listing page always renders the current week's
menu as server-side HTML. This script pulls those slugs and writes them to a
parquet file that can be passed directly to scrape_live.py via --slugs-from.

Run weekly (e.g. via cron) to accumulate new recipes as the menu rotates.

Usage:
    conda run -n cf python3 scripts/pipeline/purple_carrot/discover_current_menu.py \
        [--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet]

Then scrape:
    conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
        --slugs-from /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet \
        --out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet \
        --resume
"""
from __future__ import annotations
import re
import sys
from datetime import date
from pathlib import Path
import pandas as pd
import requests
from bs4 import BeautifulSoup
# ── Config ─────────────────────────────────────────────────────────────────────
# Listing page fetched by discover_current_slugs() to find the current menu.
LISTING_URL = "https://www.purplecarrot.com/plant-based-recipes"
# Template for a recipe's page URL; main() fills in {slug} for the SourceURL column.
BASE_URL = "https://www.purplecarrot.com/recipe/{slug}"
# Parquet path used when --out is not given on the command line.
DEFAULT_OUT = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet")
# Request headers sent with the listing-page fetch (desktop Chrome User-Agent).
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
# Matches "/recipe/<slug>" inside an href; group 1 is everything after
# "/recipe/" up to (not including) the first "?" or "#".
RECIPE_HREF_RE = re.compile(r"/recipe/([^?#]+)")
# ── Main ───────────────────────────────────────────────────────────────────────
def discover_current_slugs() -> list[str]:
    """Fetch the listing page and return unique recipe slugs from the current menu.

    Returns:
        Slugs in the order they first appear on the page, without duplicates.
        An empty list is returned (after printing an error to stderr) when the
        request fails at the network level or the response status is not 200.
    """
    try:
        resp = requests.get(LISTING_URL, headers=HEADERS, timeout=15)
    except requests.RequestException as exc:
        # Timeouts / DNS / connection failures take the same "report and
        # return nothing" path as a bad HTTP status instead of a traceback.
        print(f"ERROR: listing page request failed: {exc}", file=sys.stderr)
        return []
    if resp.status_code != 200:
        print(f"ERROR: listing page returned HTTP {resp.status_code}", file=sys.stderr)
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    # A dict keeps insertion order, giving ordered de-duplication in one structure.
    ordered_slugs: dict[str, None] = {}
    for anchor in soup.find_all("a", href=RECIPE_HREF_RE):
        match = RECIPE_HREF_RE.search(anchor["href"])
        if match is None:
            continue
        # The pattern captures up to the first "?" or "#", which can include a
        # trailing "/" or stray whitespace; normalise so "/recipe/foo" and
        # "/recipe/foo/" are recorded as the same slug.
        slug = match.group(1).strip().strip("/")
        if slug:
            ordered_slugs.setdefault(slug, None)
    return list(ordered_slugs)
def main() -> None:
    """Discover this run's menu slugs and merge them into the output parquet.

    Command-line arguments:
        --out: destination parquet path (defaults to DEFAULT_OUT).

    Each discovered slug becomes a row with the columns Slug, SourceURL,
    Source and DiscoveredDate. If the output file already exists, its rows are
    kept and only slugs not already present are appended.

    Exits with status 1 when no slugs were discovered.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=DEFAULT_OUT)
    args = parser.parse_args()

    print(f"Fetching current menu from {LISTING_URL}")
    slugs = discover_current_slugs()
    if not slugs:
        # Failure diagnostics go to stderr, matching discover_current_slugs().
        print(
            "No slugs found — the listing page may have changed structure or blocked the request.",
            file=sys.stderr,
        )
        sys.exit(1)

    today = date.today().isoformat()
    df_new = pd.DataFrame(
        [
            {
                "Slug": slug,
                "SourceURL": BASE_URL.format(slug=slug),
                "Source": "purplecarrot_menu",
                "DiscoveredDate": today,
            }
            for slug in slugs
        ]
    )

    # Merge with any existing menu parquet (accumulate weeks).
    args.out.parent.mkdir(parents=True, exist_ok=True)
    if args.out.exists():
        df_prev = pd.read_parquet(args.out)
        # Previous rows come first, so keep="first" preserves the row (and
        # DiscoveredDate) from the run that originally found each slug.
        df_all = pd.concat([df_prev, df_new], ignore_index=True).drop_duplicates(
            subset=["Slug"], keep="first"
        )
    else:
        df_all = df_new

    # Write to a sibling temp file and swap it into place so an interrupted
    # write cannot leave the accumulated history truncated or corrupt.
    tmp_path = args.out.with_name(args.out.name + ".tmp")
    df_all.to_parquet(tmp_path, index=False)
    tmp_path.replace(args.out)

    print(f"Found {len(slugs)} current-menu slugs this week:")
    for slug in slugs:
        print(f" {slug}")
    print(f"\nSaved {len(df_all)} total slugs (accumulated) to {args.out}")
    print("\nTo scrape full recipes:")
    print(" conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \\")
    print(f" --slugs-from {args.out} \\")
    print(" --out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet \\")
    print(" --resume")
# Run the discovery only when executed as a script, not when imported.
if __name__ == "__main__":
    main()