Add three new scripts for Purple Carrot recipe pipeline:

- discover_current_menu.py: fetches this week's active menu slugs from
  /plant-based-recipes using requests (server-rendered HTML, no JS needed).
  Accumulates slugs across weekly runs for building a recipe corpus over time.
- discover_slugs_categories.py: crawls recipe-category listing pages with
  ?page=N pagination to discover historical slug inventory. Note: category
  archive slugs (past menu items) 404 when scraped live; only use for
  identifying currently-featured recipes per category.
- scrape_live.py: updated with --slugs-from flag (load slug inventory from any
  parquet, not just the default Wayback one) and fresh-context-per-slug pattern
  to bypass Cloudflare session-level bot detection (which fires on the 2nd+
  request in a shared browser context).

Discovery: the live site only renders full ingredient/instruction content for
recipes currently on the active weekly menu. 23/23 current menu recipes scraped
successfully (100% hit rate vs ~1% for archived slugs).
120 lines
4.2 KiB
Python
120 lines
4.2 KiB
Python
"""Discover Purple Carrot's current weekly menu recipe slugs.
|
|
|
|
The main /plant-based-recipes listing page always renders the current week's
menu as server-side HTML. This script pulls those slugs and writes them to a
Parquet file that can be passed directly to scrape_live.py via --slugs-from.

Run weekly (e.g. via cron) to accumulate new recipes as the menu rotates.

Usage:
    conda run -n cf python3 scripts/pipeline/purple_carrot/discover_current_menu.py \
        [--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet]

Then scrape:
    conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
        --slugs-from /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet \
        --out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet \
        --resume
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
import sys
|
|
from datetime import date
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# ── Config ─────────────────────────────────────────────────────────────────────
|
|
|
|
# Listing page whose HTML is fetched to find the current menu's recipe links.
LISTING_URL = "https://www.purplecarrot.com/plant-based-recipes"

# Template for a single recipe page; fill in with ``BASE_URL.format(slug=...)``.
BASE_URL = "https://www.purplecarrot.com/recipe/{slug}"

# Default destination of the accumulated slug inventory (overridable via --out).
DEFAULT_OUT = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet")

# Desktop-Chrome-style request headers sent with the listing-page fetch.
HEADERS = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (X11; Linux x86_64)",
            "AppleWebKit/537.36",
            "(KHTML, like Gecko)",
            "Chrome/124.0.0.0",
            "Safari/537.36",
        ]
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}

# Captures everything after "/recipe/" up to (not including) a query string
# or fragment; group 1 is used as the recipe slug.
RECIPE_HREF_RE = re.compile(r"/recipe/([^?#]+)")
|
|
|
|
|
|
# ── Main ───────────────────────────────────────────────────────────────────────
|
|
|
|
def discover_current_slugs() -> list[str]:
    """Fetch the listing page and return unique recipe slugs from the current menu.

    Returns:
        Recipe slugs in first-seen document order with duplicates removed.
        An empty list is returned, after an ``ERROR:`` line is written to
        stderr, when the request itself fails (timeout, DNS, connection
        error) or the page answers with a non-200 status.
    """
    try:
        resp = requests.get(LISTING_URL, headers=HEADERS, timeout=15)
    except requests.RequestException as exc:
        # Network-level failures used to escape as a traceback; report them
        # the same way as a bad HTTP status so callers see one failure mode.
        print(f"ERROR: listing page request failed: {exc}", file=sys.stderr)
        return []

    if resp.status_code != 200:
        print(f"ERROR: listing page returned HTTP {resp.status_code}", file=sys.stderr)
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    # find_all() already filters anchors by the regex; search again only to
    # extract the captured slug from each matching href.
    matches = (
        RECIPE_HREF_RE.search(a["href"])
        for a in soup.find_all("a", href=RECIPE_HREF_RE)
    )
    # dict.fromkeys() de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(m.group(1) for m in matches if m))
|
|
|
|
|
|
def main() -> None:
    """Discover the current menu's slugs and merge them into the output parquet.

    Parses ``--out`` (default: ``DEFAULT_OUT``), calls
    ``discover_current_slugs()``, and builds one record per slug with the
    columns ``Slug``, ``SourceURL``, ``Source`` and ``DiscoveredDate``
    (today's ISO date). If the output file already exists its rows are kept
    and only slugs not yet present are appended, so a slug retains the date
    it was first discovered.

    Exits with status 1 (message on stderr) when no slugs were found.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=DEFAULT_OUT)
    args = parser.parse_args()

    print(f"Fetching current menu from {LISTING_URL} …")
    slugs = discover_current_slugs()

    if not slugs:
        # Diagnostics go to stderr, matching discover_current_slugs().
        print(
            "No slugs found — the listing page may have changed structure or blocked the request.",
            file=sys.stderr,
        )
        sys.exit(1)

    today = date.today().isoformat()
    records = [
        {
            "Slug": slug,
            "SourceURL": BASE_URL.format(slug=slug),
            "Source": "purplecarrot_menu",
            "DiscoveredDate": today,
        }
        for slug in slugs
    ]

    # Merge with any existing menu parquet (accumulate weeks).
    df_all = pd.DataFrame(records)
    args.out.parent.mkdir(parents=True, exist_ok=True)

    if args.out.exists():
        df_prev = pd.read_parquet(args.out)
        # Previous rows come first so keep="first" preserves the original
        # DiscoveredDate of slugs that were already known.
        df_all = pd.concat([df_prev, df_all], ignore_index=True).drop_duplicates(
            subset=["Slug"], keep="first"
        )

    # Write to a sibling temp file and swap it in, so an interrupted write
    # cannot truncate the accumulated inventory from earlier weeks.
    tmp_path = args.out.with_name(args.out.name + ".tmp")
    df_all.to_parquet(tmp_path, index=False)
    tmp_path.replace(args.out)

    print(f"Found {len(slugs)} current-menu slugs this week:")
    for s in slugs:
        print(f" {s}")
    print(f"\nSaved {len(df_all)} total slugs (accumulated) to {args.out}")
    print("\nTo scrape full recipes:")
    print(" conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \\")
    print(f" --slugs-from {args.out} \\")
    print(" --out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet \\")
    print(" --resume")


# Run the discovery only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|