Add three new scripts for Purple Carrot recipe pipeline: - discover_current_menu.py: fetches this week's active menu slugs from /plant-based-recipes using requests (server-rendered HTML, no JS needed). Accumulates slugs across weekly runs for building a recipe corpus over time. - discover_slugs_categories.py: crawls recipe-category listing pages with ?page=N pagination to discover historical slug inventory. Note: category archive slugs (past menu items) 404 when scraped live; only use for identifying currently-featured recipes per category. - scrape_live.py: updated with --slugs-from flag (load slug inventory from any parquet, not just the default Wayback one) and fresh-context-per-slug pattern to bypass Cloudflare session-level bot detection (which fires on the 2nd+ request in a shared browser context). Discovery: the live site only renders full ingredient/instruction content for recipes currently on the active weekly menu. 23/23 current menu recipes scraped successfully (100% hit rate vs ~1% for archived slugs).
250 lines
10 KiB
Python
250 lines
10 KiB
Python
"""Playwright scraper for live purplecarrot.com recipe pages.
|
|
|
|
By default, uses the slug inventory already in recipes_purplecarrot.parquet
(or the parquet given via --slugs-from) and fills in the missing
ingredients/instructions by hitting the live site directly.

Usage:
    conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
        [--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet] \
        [--slugs-from /path/to/slug_inventory.parquet] \
        [--delay 2.5] \
        [--limit 20] \
        [--resume]
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import pandas as pd
|
|
from playwright.sync_api import sync_playwright, Page, TimeoutError as PWTimeout
|
|
|
|
# ── Config ─────────────────────────────────────────────────────────────────────
|
|
|
|
# URL template for a live recipe page; formatted with the recipe slug.
BASE_URL = "https://www.purplecarrot.com/recipe/{slug}"
# Default value of --out: where the scraped rows are written.
DEFAULT_OUT = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet")
# Default slug inventory, read when --slugs-from is not given.
EXISTING_PARQUET = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot.parquet")

# Fixed pause after navigation, passed to page.wait_for_timeout().
RENDER_WAIT_MS = 2500  # JS render settle time
# Timeout passed to page.goto() for each navigation.
NAV_TIMEOUT_MS = 20_000
|
|
|
|
|
|
# ── Page parser ────────────────────────────────────────────────────────────────
|
|
|
|
def _text(page: Page, selector: str) -> str:
    """Return the stripped inner text of the first element matching *selector*.

    An empty string is returned when no element on the page matches.
    """
    match = page.query_selector(selector)
    if not match:
        return ""
    return match.inner_text().strip()
|
|
|
|
|
|
def _texts(page: Page, selector: str) -> list[str]:
    """Return the stripped inner text of every element matching *selector*.

    The result follows the order in which the page reports the matches and is
    empty when nothing matches.
    """
    stripped: list[str] = []
    for node in page.query_selector_all(selector):
        stripped.append(node.inner_text().strip())
    return stripped
|
|
|
|
|
|
def _split_steps(inst_block: str) -> list[str]:
    """Group the lines of an instructions block into one string per step.

    A line made up only of digits is treated as a step-number marker: it closes
    the step accumulated so far. Blank lines are ignored, and parsing stops at
    the first line that starts with "CULINARY NOTES".
    """
    steps: list[str] = []
    current: list[str] = []
    for raw_line in inst_block.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if re.fullmatch(r"\d+", line):
            if current:
                steps.append(" ".join(current))
                current = []
        elif line.startswith("CULINARY NOTES"):
            break
        else:
            current.append(line)
    if current:
        steps.append(" ".join(current))
    return steps


def _first_number(pattern: str, text: str) -> float | None:
    """Return group 1 of the first match of *pattern* in *text* as a float.

    Returns None when the pattern does not match or the captured text is not a
    valid number.
    """
    m = re.search(pattern, text)
    if m is None:
        return None
    try:
        return float(m.group(1))
    except ValueError:
        return None


def _parse_recipe(page: Page, slug: str, source_url: str) -> dict[str, Any] | None:
    """Extract structured recipe data from the rendered page.

    The page's body text is sliced on the "Ingredients" / "Instructions"
    headings; nutrition figures, allergens and feature tags are pulled out of
    the same text with regular expressions.

    Args:
        page: Page that has already navigated to the recipe URL.
        slug: Recipe slug; must appear in ``page.url`` or the page is treated
            as a redirect.
        source_url: Value stored verbatim in the ``SourceURL`` field.

    Returns:
        A dict with the recipe fields, or None when the page is a 404 /
        redirect or contains no "Ingredients" heading. ``HasFullRecipe`` is
        True only when both ingredients and instructions were found.
    """
    body = page.inner_text("body")

    # Abort if we've been bounced to a generic listing / 404
    if "Page Not Found" in body or slug not in page.url:
        return None

    # ── Section boundaries ───────────────────────────────────────────────────
    ing_marker = "\nIngredients\n"
    inst_marker = "\nInstructions\n"
    ing_start = body.find(ing_marker)
    inst_start = body.find(inst_marker)
    footer_start = body.find("\nShop\n")  # footer sentinel

    if ing_start == -1:
        return None  # page didn't render recipe content

    # ── Title ────────────────────────────────────────────────────────────────
    # The <h1> on product pages tends to be the recipe name
    title = (_text(page, "h1") or _text(page, "[class*='recipe-title']")).strip()
    if not title:
        # Fallback: last non-empty line before "Ingredients". The text before
        # the heading can be whitespace only, so guard against an empty list
        # instead of indexing [-1] unconditionally.
        idx = body.find("Ingredients\n")
        preceding = body[:idx].strip().splitlines() if idx > 0 else []
        title = preceding[-1].strip() if preceding else ""

    # ── Ingredients / Instructions via body text ─────────────────────────────
    raw_ingredients: list[str] = []
    raw_instructions: list[str] = []

    # Ingredients are only taken when an Instructions heading follows them;
    # otherwise there is no reliable end marker for the ingredient list.
    if inst_start > ing_start:
        ing_block = body[ing_start + len(ing_marker):inst_start]
        raw_ingredients = [ln.strip() for ln in ing_block.splitlines() if ln.strip()]

    if inst_start != -1:
        # Stop at the footer only if it actually comes after the instructions.
        end = footer_start if footer_start > inst_start else len(body)
        inst_block = body[inst_start + len(inst_marker):end].strip()
        raw_instructions = _split_steps(inst_block)

    # ── Nutrition ────────────────────────────────────────────────────────────
    cal = _first_number(r"(\d+)\s*CAL", body)
    fat = _first_number(r"(\d+(?:\.\d+)?)g\s*FAT", body)
    carbs = _first_number(r"(\d+(?:\.\d+)?)g\s*CARBS", body)
    prot = _first_number(r"(\d+(?:\.\d+)?)g\s*PROTEIN", body)
    fiber = _first_number(r"(\d+(?:\.\d+)?)g\s*FIBER", body)

    # ── Allergens / tags ─────────────────────────────────────────────────────
    allergen_m = re.search(r"Allergens?:\s*([^\n]+)", body)
    allergens = allergen_m.group(1).strip() if allergen_m else ""

    # Feature tags like HIGH-PROTEIN, QUICK, etc. appear before Ingredients
    pre_ing = body[:ing_start]
    found_tags = re.findall(
        r"\b(HIGH-PROTEIN|QUICK|SPICY|LOW[\-\s]CALORIE|VEGAN|FAMILY\s+FRIENDLY)\b",
        pre_ing,
    )
    # The same tag can occur more than once in the text; keep first occurrences.
    tags = list(dict.fromkeys(found_tags))

    return {
        "Slug": slug,
        "Name": title,
        "SourceURL": source_url,
        "Source": "purplecarrot_live",
        "RecipeIngredientParts": raw_ingredients,
        "RecipeInstructions": raw_instructions,
        "Calories": cal,
        "FatContent": fat,
        "CarbohydrateContent": carbs,
        "ProteinContent": prot,
        "FiberContent": fiber,
        "Allergens": allergens,
        "Keywords": tags,
        "HasFullRecipe": bool(raw_ingredients and raw_instructions),
    }
|
|
|
|
|
|
# ── Main ───────────────────────────────────────────────────────────────────────
|
|
|
|
def main() -> None:
    """Scrape live recipe pages for the slugs in the inventory and save them.

    Reads the slug inventory from ``--slugs-from`` (or ``EXISTING_PARQUET``),
    visits each slug's page in a fresh browser context, parses it with
    ``_parse_recipe`` and writes every parsed row to ``--out`` as parquet.

    With ``--resume``, slugs already present in ``--out`` are skipped and the
    new rows are appended to the existing file (last row per slug wins).
    ``--limit`` caps the number of slugs actually visited in this run, so it
    is applied after the resume filter.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=DEFAULT_OUT)
    parser.add_argument("--delay", type=float, default=2.5,
                        help="Seconds between requests (be polite)")
    parser.add_argument("--limit", type=int, default=0,
                        help="Stop after N slugs (0 = all)")
    parser.add_argument("--resume", action="store_true",
                        help="Skip slugs already present in --out")
    parser.add_argument("--slugs-from", type=Path, default=None,
                        help="Read slug inventory from this parquet instead of the default Wayback one")
    args = parser.parse_args()

    # Load slug inventory — either from a custom parquet or the default Wayback run
    slugs_parquet = args.slugs_from if args.slugs_from else EXISTING_PARQUET
    df_existing = pd.read_parquet(slugs_parquet)
    slugs = df_existing["Slug"].dropna().unique().tolist()

    # SourceURL may be absent (custom parquets) or null for some rows; those
    # slugs fall back to the URL constructed from BASE_URL in the loop below.
    source_urls: dict[str, str] = {}
    if "SourceURL" in df_existing.columns:
        known = df_existing.dropna(subset=["Slug", "SourceURL"])
        source_urls = dict(zip(known["Slug"], known["SourceURL"]))

    # Resume support: remember the previous output so it is read only once.
    df_prev: pd.DataFrame | None = None
    done_slugs: set[str] = set()
    if args.resume and args.out.exists():
        df_prev = pd.read_parquet(args.out)
        done_slugs = set(df_prev["Slug"].dropna().tolist())
        print(f"Resuming — {len(done_slugs)} slugs already scraped")

    # Filter finished slugs BEFORE applying --limit; otherwise a resumed run
    # with a limit keeps selecting the same (already done) leading slugs.
    pending = [s for s in slugs if s not in done_slugs]
    skipped = len(slugs) - len(pending)
    if args.limit > 0:
        pending = pending[: args.limit]

    results: list[dict[str, Any]] = []
    failed = 0

    _UA = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)

        total = len(pending)
        for i, slug in enumerate(pending, start=1):
            url = BASE_URL.format(slug=slug)
            print(f"[{i}/{total}] {slug} … ", end="", flush=True)

            # Use a fresh browser context per slug to avoid Cloudflare session-level
            # bot detection, which fires on the 2nd+ request in the same context.
            context = browser.new_context(
                user_agent=_UA,
                viewport={"width": 1280, "height": 900},
            )

            try:
                # new_page() lives inside the try so the context is closed by
                # the finally clause even if page creation fails.
                page = context.new_page()
                page.goto(url, timeout=NAV_TIMEOUT_MS, wait_until="domcontentloaded")
                page.wait_for_timeout(RENDER_WAIT_MS)
                recipe = _parse_recipe(page, slug, source_urls.get(slug, url))
            except PWTimeout:
                print("TIMEOUT")
                failed += 1
            except Exception as exc:
                # Per-slug boundary: report and move on to the next slug.
                print(f"ERROR: {exc}")
                failed += 1
            else:
                if recipe is None:
                    print("no content (404 or redirect)")
                    failed += 1
                else:
                    n = len(recipe["RecipeIngredientParts"])
                    s = len(recipe["RecipeInstructions"])
                    if recipe["HasFullRecipe"]:
                        print(f"OK ({n} ingredients, {s} steps)")
                    else:
                        print(f"partial (ings={n}, steps={s})")
                    # Partial rows are kept too; HasFullRecipe marks them.
                    results.append(recipe)
            finally:
                context.close()

            # Be polite between requests, but don't wait after the last one.
            if i < total:
                time.sleep(args.delay)

        browser.close()

    print(f"\nDone — {len(results)} scraped, {skipped} skipped, {failed} failed")

    if not results:
        print("No results — output not written")
        return

    df_out = pd.DataFrame(results)
    args.out.parent.mkdir(parents=True, exist_ok=True)
    if df_prev is not None:
        # Append to the previous output; the newest row for a slug wins.
        df_out = pd.concat([df_prev, df_out], ignore_index=True)
        df_out = df_out.drop_duplicates(subset=["Slug"], keep="last")
    df_out.to_parquet(args.out, index=False)
    full_count = df_out["HasFullRecipe"].sum() if "HasFullRecipe" in df_out.columns else "?"
    print(f"Saved {len(df_out)} rows to {args.out} ({full_count} with full recipes)")
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|