feat(pipeline): purple carrot weekly menu scraper with CF bypass
Add three new scripts for Purple Carrot recipe pipeline: - discover_current_menu.py: fetches this week's active menu slugs from /plant-based-recipes using requests (server-rendered HTML, no JS needed). Accumulates slugs across weekly runs for building a recipe corpus over time. - discover_slugs_categories.py: crawls recipe-category listing pages with ?page=N pagination to discover historical slug inventory. Note: category archive slugs (past menu items) 404 when scraped live; only use for identifying currently-featured recipes per category. - scrape_live.py: updated with --slugs-from flag (load slug inventory from any parquet, not just the default Wayback one) and fresh-context-per-slug pattern to bypass Cloudflare session-level bot detection (which fires on the 2nd+ request in a shared browser context). Discovery: the live site only renders full ingredient/instruction content for recipes currently on the active weekly menu. 23/23 current menu recipes scraped successfully (100% hit rate vs ~1% for archived slugs).
This commit is contained in:
parent
56f942b3fd
commit
a9ab996bcc
3 changed files with 588 additions and 0 deletions
120
scripts/pipeline/purple_carrot/discover_current_menu.py
Normal file
120
scripts/pipeline/purple_carrot/discover_current_menu.py
Normal file
|
|
@ -0,0 +1,120 @@
|
|||
"""Discover Purple Carrot's current weekly menu recipe slugs.
|
||||
|
||||
The main /plant-based-recipes listing page always renders the current week's
|
||||
menu as server-side HTML. This script pulls those slugs and writes them to a
|
||||
parquet that can be passed directly to scrape_live.py via --slugs-from.
|
||||
|
||||
Run weekly (e.g. via cron) to accumulate new recipes as the menu rotates.
|
||||
|
||||
Usage:
|
||||
conda run -n cf python3 scripts/pipeline/purple_carrot/discover_current_menu.py \
|
||||
[--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet]
|
||||
|
||||
Then scrape:
|
||||
conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
|
||||
--slugs-from /Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet \
|
||||
--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet \
|
||||
--resume
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# ── Config ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Listing page this script fetches to read the current weekly menu.
LISTING_URL = "https://www.purplecarrot.com/plant-based-recipes"

# Template for a single recipe page; ``{slug}`` is filled in per recipe.
BASE_URL = "https://www.purplecarrot.com/recipe/{slug}"

# Default parquet that accumulates discovered menu slugs across runs.
DEFAULT_OUT = Path("/Library/Assets/kiwi/pipeline/recipes_purplecarrot_menu.parquet")

# Browser-like headers sent with the listing-page request.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (X11; Linux x86_64)",
            "AppleWebKit/537.36",
            "(KHTML, like Gecko)",
            "Chrome/124.0.0.0",
            "Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}

# Captures everything after "/recipe/" up to a query string or fragment.
RECIPE_HREF_RE = re.compile(r"/recipe/([^?#]+)")
|
||||
|
||||
|
||||
# ── Main ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def discover_current_slugs() -> list[str]:
    """Fetch the listing page and return unique recipe slugs from the current menu.

    Returns:
        Slugs in the order they first appear on the page, without duplicates.
        An empty list is returned (after an error is written to stderr) when
        the request fails or the server answers with anything but HTTP 200.
    """
    try:
        resp = requests.get(LISTING_URL, headers=HEADERS, timeout=15)
    except requests.RequestException as exc:
        # Network-level failures (DNS, timeout, connection reset) take the same
        # "no slugs" path as an HTTP error instead of surfacing a traceback.
        print(f"ERROR: could not fetch listing page: {exc}", file=sys.stderr)
        return []

    if resp.status_code != 200:
        print(f"ERROR: listing page returned HTTP {resp.status_code}", file=sys.stderr)
        return []

    soup = BeautifulSoup(resp.text, "html.parser")
    matches = (
        RECIPE_HREF_RE.search(anchor["href"])
        for anchor in soup.find_all("a", href=RECIPE_HREF_RE)
    )
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(m.group(1) for m in matches if m))
|
||||
|
||||
|
||||
def main() -> None:
    """Discover this week's menu slugs and merge them into the output parquet.

    Command-line options:
        --out: parquet to create or extend (defaults to ``DEFAULT_OUT``).

    Slugs already present in the output keep their original row, so
    ``DiscoveredDate`` records the first run in which a slug was seen.
    Exits with status 1 when no slugs could be discovered.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=DEFAULT_OUT)
    args = parser.parse_args()

    print(f"Fetching current menu from {LISTING_URL} …")
    slugs = discover_current_slugs()

    if not slugs:
        # Failures go to stderr so an unattended (cron) run surfaces them.
        print(
            "No slugs found — the listing page may have changed structure or blocked the request.",
            file=sys.stderr,
        )
        sys.exit(1)

    today = date.today().isoformat()
    df_new = pd.DataFrame(
        [
            {
                "Slug": slug,
                "SourceURL": BASE_URL.format(slug=slug),
                "Source": "purplecarrot_menu",
                "DiscoveredDate": today,
            }
            for slug in slugs
        ]
    )

    args.out.parent.mkdir(parents=True, exist_ok=True)

    # Merge with any existing menu parquet (accumulate weeks); keep="first"
    # means an already-known slug retains its earlier row.
    if args.out.exists():
        df_prev = pd.read_parquet(args.out)
        df_new = pd.concat([df_prev, df_new], ignore_index=True).drop_duplicates(
            subset=["Slug"], keep="first"
        )

    df_new.to_parquet(args.out, index=False)

    print(f"Found {len(slugs)} current-menu slugs this week:")
    for s in slugs:
        print(f" {s}")
    print(f"\nSaved {len(df_new)} total slugs (accumulated) to {args.out}")
    print("\nTo scrape full recipes:")
    print(" conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \\")
    print(f" --slugs-from {args.out} \\")
    print(" --out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet \\")
    print(" --resume")


if __name__ == "__main__":
    main()
|
||||
218
scripts/pipeline/purple_carrot/discover_slugs_categories.py
Normal file
218
scripts/pipeline/purple_carrot/discover_slugs_categories.py
Normal file
|
|
@ -0,0 +1,218 @@
|
|||
"""Discover Purple Carrot recipe slugs by crawling all recipe-category listing pages.
|
||||
|
||||
The site serves full server-rendered HTML for category pages, paginated via
|
||||
?page=N. Each page loads 18 recipe cards. This script crawls every category
|
||||
across all pages and writes a deduplicated slug inventory.
|
||||
|
||||
Usage:
|
||||
conda run -n cf python3 scripts/pipeline/purple_carrot/discover_slugs_categories.py \
|
||||
[--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_slugs.parquet] \
|
||||
[--delay 2.0] \
|
||||
[--max-pages 50] # safety cap per category (comfort-foods has ~18)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# ── Config ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Site root that every crawled URL is built from.
BASE = "https://www.purplecarrot.com"

# All known category slugs (from /plant-based-recipes nav).
# Order matters: de-duplication keeps the first category a slug appears in.
CATEGORIES: list[str] = [
    "comfort-foods", "family-friendly", "healthy-desserts", "holiday-recipes",
    "quick-and-easy", "party-foods", "seasonal-menu",
    "spring-recipes", "summer-recipes", "fall-recipes", "winter-recipes",
    "african", "american", "asian", "comfort", "french", "indian",
    "italian", "mediterranean", "mexican", "middle-eastern",
    "soups", "salads", "bowls", "pasta", "sandwiches-wraps", "tacos",
    "breakfast", "snacks-sides",
]

_PIPELINE_DIR = Path("/Library/Assets/kiwi/pipeline")
# Default destination for the slug inventory written by this script.
DEFAULT_OUT = _PIPELINE_DIR / "recipes_purplecarrot_slugs.parquet"
# Earlier (Wayback) parquet whose slugs seed the inventory when it exists.
EXISTING_PARQUET = _PIPELINE_DIR / "recipes_purplecarrot.parquet"

# CSS selector for the recipe links on a category listing page.
RECIPE_LINK_SELECTOR = "a.c-recipe__title"
# Captures everything after "/recipe/" up to a query string or fragment.
SLUG_RE = re.compile(r"/recipe/([^?#]+)")

# Browser-like headers sent with every listing-page request.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (X11; Linux x86_64)",
            "AppleWebKit/537.36",
            "(KHTML, like Gecko)",
            "Chrome/124.0.0.0",
            "Safari/537.36",
        )
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}
|
||||
|
||||
|
||||
# ── Helpers ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _fetch_html(url: str, session: requests.Session) -> str | None:
    """GET *url* through *session* and return the response body, or None.

    Only an HTTP 200 yields text. A 404 returns None silently (the caller
    treats it as the end of pagination); any other status, and any exception
    raised while fetching, is printed before None is returned.
    """
    try:
        response = session.get(url, headers=HEADERS, timeout=15)
        status = response.status_code
        if status == 200:
            return response.text
        if status != 404:
            print(f" HTTP {status} — {url}")
        return None
    except Exception as exc:
        print(f" ERROR fetching {url}: {exc}")
        return None
|
||||
|
||||
|
||||
def _extract_slugs(html: str) -> list[str]:
    """Return the recipe slugs linked from one listing page, in document order.

    Only anchors matching ``RECIPE_LINK_SELECTOR`` are inspected; anchors whose
    href does not contain a ``/recipe/<slug>`` path are ignored. Duplicates
    are kept.
    """
    anchors = BeautifulSoup(html, "html.parser").select(RECIPE_LINK_SELECTOR)
    hits = (SLUG_RE.search(anchor.get("href", "")) for anchor in anchors)
    return [hit.group(1) for hit in hits if hit]
|
||||
|
||||
|
||||
def _get_category_total(html: str) -> int | None:
|
||||
"""Try to parse the recipe count shown on the category page (e.g. '319 Recipes')."""
|
||||
m = re.search(r"(\d+)\s+Recipes?\b", html)
|
||||
return int(m.group(1)) if m else None
|
||||
|
||||
|
||||
def _discover_category(
    category: str,
    session: requests.Session,
    delay: float,
    max_pages: int,
    page_size: int = 18,
) -> tuple[list[str], int]:
    """Crawl all pages of a category, return (slugs, pages_fetched).

    Args:
        category: Category slug appended to ``/recipe-categories/``.
        session: Session used for every page request.
        delay: Seconds to sleep between consecutive page requests.
        max_pages: Upper bound on the number of pages requested.
        page_size: Number of recipe links on a full page; a page with fewer
            links is treated as the last one.

    Returns:
        The slugs in crawl order (duplicates kept) and the number of pages
        that actually contributed slugs.
    """
    slugs: list[str] = []
    pages_fetched = 0
    first_page_url = f"{BASE}/recipe-categories/{category}"

    for page_num in range(1, max_pages + 1):
        url = first_page_url if page_num == 1 else f"{first_page_url}?page={page_num}"

        html = _fetch_html(url, session)
        if html is None:
            break  # 404 or error = past the end

        page_slugs = _extract_slugs(html)
        # The total is only looked for on the first page, and parsed once.
        total = _get_category_total(html) if page_num == 1 else None

        if not page_slugs:
            # A first page that shows a total but yields no links suggests the
            # link selector no longer matches the page markup.
            if total is not None:
                print(f" page 1 loaded (total={total}) but 0 recipe links — selector may need updating")
            break

        slugs.extend(page_slugs)
        pages_fetched += 1

        total_str = f" / {total}" if total else ""
        print(f" page {page_num}: +{len(page_slugs)} slugs ({len(slugs)}{total_str} cumulative)")

        if len(page_slugs) < page_size:
            break  # short page = last page

        # No point pausing after the final permitted page.
        if page_num < max_pages:
            time.sleep(delay)

    return slugs, pages_fetched
|
||||
|
||||
|
||||
# ── Main ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Crawl the selected categories and write a de-duplicated slug inventory.

    Command-line options:
        --out: destination parquet (defaults to ``DEFAULT_OUT``).
        --delay: seconds between requests and between categories.
        --max-pages: per-category page cap.
        --categories: optional subset of category slugs to crawl.

    Slugs found in ``EXISTING_PARQUET`` but not on any category page are
    appended with Category "wayback". Nothing is written when the crawl finds
    no slugs at all.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--out", type=Path, default=DEFAULT_OUT)
    cli.add_argument("--delay", type=float, default=2.0, help="Seconds between page requests")
    cli.add_argument("--max-pages", type=int, default=50, help="Safety cap on pages per category")
    cli.add_argument("--categories", nargs="*", help="Crawl only these category slugs (default: all)")
    args = cli.parse_args()

    # An omitted or empty --categories falls back to the built-in list.
    categories = args.categories if args.categories else CATEGORIES

    # Seed with any slugs from the Wayback parquet.
    known_slugs: set[str] = set()
    if EXISTING_PARQUET.exists():
        known_slugs = set(pd.read_parquet(EXISTING_PARQUET)["Slug"].dropna().tolist())
        print(f"Seeded with {len(known_slugs)} slugs from Wayback parquet")

    session = requests.Session()
    all_records: list[dict[str, Any]] = []

    for category in categories:
        print(f"\n[{category}]")
        cat_slugs, pages = _discover_category(category, session, args.delay, args.max_pages)
        all_records.extend(
            {"Slug": slug, "Category": category, "Source": "purplecarrot_category"}
            for slug in cat_slugs
        )
        print(f" → {len(cat_slugs)} slugs across ~{pages} pages")
        time.sleep(args.delay)

    if not all_records:
        print("\nNo records found — check that categories are correct and the site is accessible")
        return

    # A slug listed under several categories keeps the first one encountered.
    df_new = pd.DataFrame(all_records).drop_duplicates(subset=["Slug"], keep="first")

    # Also include Wayback slugs the category crawl did not surface.
    wb_only = known_slugs - set(df_new["Slug"].tolist()) if known_slugs else set()
    if wb_only:
        wayback_rows = [
            {"Slug": s, "Category": "wayback", "Source": "purplecarrot_wayback"}
            for s in wb_only
        ]
        df_new = pd.concat([df_new, pd.DataFrame(wayback_rows)], ignore_index=True)

    args.out.parent.mkdir(parents=True, exist_ok=True)
    df_new.to_parquet(args.out, index=False)

    new_count = len(df_new)
    cat_count = int((df_new["Source"] == "purplecarrot_category").sum())
    print(f"\nDone — {new_count} total slugs saved to {args.out}")
    print(f" {cat_count} from category pages, {new_count - cat_count} from Wayback only")


if __name__ == "__main__":
    main()
|
||||
250
scripts/pipeline/purple_carrot/scrape_live.py
Normal file
250
scripts/pipeline/purple_carrot/scrape_live.py
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
"""Playwright scraper for live purplecarrot.com recipe pages.
|
||||
|
||||
By default uses the slug inventory in recipes_purplecarrot.parquet (or any
parquet passed via --slugs-from) and fills in the missing
ingredients/instructions by hitting the live site directly.
|
||||
|
||||
Usage:
|
||||
conda run -n cf python3 scripts/pipeline/purple_carrot/scrape_live.py \
|
||||
[--out /Library/Assets/kiwi/pipeline/recipes_purplecarrot_live.parquet] \
|
||||
[--delay 2.5] \
|
||||
[--limit 20]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pandas as pd
|
||||
from playwright.sync_api import sync_playwright, Page, TimeoutError as PWTimeout
|
||||
|
||||
# ── Config ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
# Template for a live recipe page; ``{slug}`` is filled in per recipe.
BASE_URL = "https://www.purplecarrot.com/recipe/{slug}"

_PIPELINE_DIR = Path("/Library/Assets/kiwi/pipeline")
# Default destination for scraped recipes.
DEFAULT_OUT = _PIPELINE_DIR / "recipes_purplecarrot_live.parquet"
# Default slug inventory (the Wayback run) used when --slugs-from is absent.
EXISTING_PARQUET = _PIPELINE_DIR / "recipes_purplecarrot.parquet"

# JS render settle time, in milliseconds, waited after navigation.
RENDER_WAIT_MS = 2500
# Navigation timeout, in milliseconds.
NAV_TIMEOUT_MS = 20 * 1000
|
||||
|
||||
|
||||
# ── Page parser ────────────────────────────────────────────────────────────────
|
||||
|
||||
def _text(page: Page, selector: str) -> str:
    """Return the stripped inner text of the first element matching *selector*.

    An empty string is returned when nothing matches.
    """
    element = page.query_selector(selector)
    if not element:
        return ""
    return element.inner_text().strip()
|
||||
|
||||
|
||||
def _texts(page: Page, selector: str) -> list[str]:
    """Return the stripped inner text of every element matching *selector*."""
    collected: list[str] = []
    for element in page.query_selector_all(selector):
        collected.append(element.inner_text().strip())
    return collected
|
||||
|
||||
|
||||
def _split_instruction_steps(block: str) -> list[str]:
    """Group the lines of an instructions block into one string per step.

    A line consisting only of digits starts a new step; the lines that follow
    are joined with single spaces. Parsing stops at a line beginning with
    "CULINARY NOTES".
    """
    steps: list[str] = []
    current: list[str] = []
    for raw_line in block.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if re.fullmatch(r"\d+", line):
            # Step-number line: flush whatever the previous step collected.
            if current:
                steps.append(" ".join(current))
                current = []
        elif line.startswith("CULINARY NOTES"):
            break
        else:
            current.append(line)
    if current:
        steps.append(" ".join(current))
    return steps


def _parse_recipe(page: Page, slug: str, source_url: str) -> dict[str, Any] | None:
    """Extract structured recipe data from the rendered page.

    Parsing works on the page's visible body text, using the "Ingredients",
    "Instructions" and "Shop" lines as section boundaries.

    Args:
        page: Page that has already navigated to the recipe URL.
        slug: Recipe slug; it must appear in ``page.url``.
        source_url: Value stored in the record's ``SourceURL`` field.

    Returns:
        A record dict, or None when the body contains "Page Not Found", the
        slug is missing from the final URL, or there is no "Ingredients"
        heading. ``HasFullRecipe`` is True only when both ingredients and
        instructions were extracted.
    """
    body = page.inner_text("body")

    # Abort if we've been bounced to a generic listing / 404.
    if "Page Not Found" in body or slug not in page.url:
        return None

    ing_marker = "\nIngredients\n"
    inst_marker = "\nInstructions\n"

    ing_start = body.find(ing_marker)
    if ing_start == -1:
        return None  # page didn't render recipe content

    # ── Title ────────────────────────────────────────────────────────────────
    title = _text(page, "h1") or _text(page, "[class*='recipe-title']")
    if not title:
        # Fallback: last non-empty line before "Ingredients". Guard against
        # there being no text at all ahead of the heading.
        idx = body.find("Ingredients\n")
        preceding = body[:idx].strip().splitlines() if idx > 0 else []
        title = preceding[-1].strip() if preceding else ""

    # ── Ingredients / Instructions via body text ─────────────────────────────
    inst_start = body.find(inst_marker)
    footer_start = body.find("\nShop\n")  # footer sentinel

    raw_ingredients: list[str] = []
    raw_instructions: list[str] = []

    # Ingredients are only delimited when the Instructions heading exists.
    if inst_start != -1:
        ing_block = body[ing_start + len(ing_marker):inst_start]
        raw_ingredients = [ln.strip() for ln in ing_block.splitlines() if ln.strip()]

        end = footer_start if footer_start > inst_start else len(body)
        raw_instructions = _split_instruction_steps(
            body[inst_start + len(inst_marker):end]
        )

    # ── Nutrition ────────────────────────────────────────────────────────────
    def _extract_num(pattern: str) -> float | None:
        m = re.search(pattern, body)
        try:
            return float(m.group(1)) if m else None
        except ValueError:
            return None

    cal = _extract_num(r"(\d+)\s*CAL")
    fat = _extract_num(r"(\d+(?:\.\d+)?)g\s*FAT")
    carbs = _extract_num(r"(\d+(?:\.\d+)?)g\s*CARBS")
    prot = _extract_num(r"(\d+(?:\.\d+)?)g\s*PROTEIN")
    fiber = _extract_num(r"(\d+(?:\.\d+)?)g\s*FIBER")

    # ── Allergens / tags ─────────────────────────────────────────────────────
    allergen_m = re.search(r"Allergens?:\s*([^\n]+)", body)
    allergens = allergen_m.group(1).strip() if allergen_m else ""

    # Feature tags like HIGH-PROTEIN, QUICK, etc. appear before Ingredients.
    tags = re.findall(
        r"\b(HIGH-PROTEIN|QUICK|SPICY|LOW[\-\s]CALORIE|VEGAN|FAMILY\s+FRIENDLY)\b",
        body[:ing_start],
    )

    return {
        "Slug": slug,
        "Name": title,
        "SourceURL": source_url,
        "Source": "purplecarrot_live",
        "RecipeIngredientParts": raw_ingredients,
        "RecipeInstructions": raw_instructions,
        "Calories": cal,
        "FatContent": fat,
        "CarbohydrateContent": carbs,
        "ProteinContent": prot,
        "FiberContent": fiber,
        "Allergens": allergens,
        "Keywords": tags,
        "HasFullRecipe": bool(raw_ingredients and raw_instructions),
    }
|
||||
|
||||
|
||||
# ── Main ───────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Scrape live recipe pages for every slug in the inventory parquet.

    Command-line options:
        --out: destination parquet (defaults to ``DEFAULT_OUT``).
        --delay: seconds to sleep after each slug.
        --limit: process at most N slugs (0 = all). With --resume the limit
            counts slugs still to be scraped, so repeated runs make progress.
        --resume: skip slugs already present in --out and append to it.
        --slugs-from: parquet providing the "Slug" column (and optionally
            "SourceURL"); defaults to ``EXISTING_PARQUET``.

    Each slug is loaded in its own browser context. Nothing is written when
    no page yields a record.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=DEFAULT_OUT)
    parser.add_argument("--delay", type=float, default=2.5,
                        help="Seconds between requests (be polite)")
    parser.add_argument("--limit", type=int, default=0,
                        help="Stop after N slugs (0 = all)")
    parser.add_argument("--resume", action="store_true",
                        help="Skip slugs already present in --out")
    parser.add_argument("--slugs-from", type=Path, default=None,
                        help="Read slug inventory from this parquet instead of the default Wayback one")
    args = parser.parse_args()

    # Load slug inventory — either from a custom parquet or the default Wayback run.
    slugs_parquet = args.slugs_from if args.slugs_from else EXISTING_PARQUET
    df_existing = pd.read_parquet(slugs_parquet)
    slugs = df_existing["Slug"].dropna().unique().tolist()

    # SourceURL may be absent from custom parquets — fall back to building it from the slug.
    if "SourceURL" in df_existing.columns:
        source_urls = dict(zip(df_existing["Slug"], df_existing["SourceURL"]))
    else:
        source_urls = {s: BASE_URL.format(slug=s) for s in slugs}

    # Resume support
    done_slugs: set[str] = set()
    if args.resume and args.out.exists():
        df_done = pd.read_parquet(args.out)
        done_slugs = set(df_done["Slug"].dropna().tolist())
        print(f"Resuming — {len(done_slugs)} slugs already scraped")

    # Drop finished slugs BEFORE applying --limit; otherwise a resumed run
    # with a limit keeps re-selecting the same already-scraped head of the list.
    pending = [s for s in slugs if s not in done_slugs]
    skipped = len(slugs) - len(pending)
    if args.limit:
        pending = pending[: args.limit]

    results: list[dict[str, Any]] = []
    failed = 0

    _UA = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    )

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        try:
            for i, slug in enumerate(pending, start=1):
                url = BASE_URL.format(slug=slug)
                print(f"[{i}/{len(pending)}] {slug} … ", end="", flush=True)

                # Use a fresh browser context per slug to avoid Cloudflare session-level
                # bot detection, which fires on the 2nd+ request in the same context.
                context = browser.new_context(
                    user_agent=_UA,
                    viewport={"width": 1280, "height": 900},
                )
                try:
                    # Page creation sits inside the try so the context is
                    # always closed, even if opening the page fails.
                    page = context.new_page()
                    page.goto(url, timeout=NAV_TIMEOUT_MS, wait_until="domcontentloaded")
                    page.wait_for_timeout(RENDER_WAIT_MS)
                    recipe = _parse_recipe(page, slug, source_urls.get(slug, url))
                except PWTimeout:
                    print("TIMEOUT")
                    failed += 1
                except Exception as exc:
                    # Best-effort crawl: report the slug and move on.
                    print(f"ERROR: {exc}")
                    failed += 1
                else:
                    if recipe is None:
                        print("no content (404 or redirect)")
                        failed += 1
                    else:
                        n = len(recipe["RecipeIngredientParts"])
                        s = len(recipe["RecipeInstructions"])
                        if recipe["HasFullRecipe"]:
                            print(f"OK ({n} ingredients, {s} steps)")
                        else:
                            print(f"partial (ings={n}, steps={s})")
                        # Partial records are kept as well.
                        results.append(recipe)
                finally:
                    context.close()

                time.sleep(args.delay)
        finally:
            browser.close()

    print(f"\nDone — {len(results)} scraped, {skipped} skipped, {failed} failed")

    if results:
        df_out = pd.DataFrame(results)
        args.out.parent.mkdir(parents=True, exist_ok=True)
        # When resuming, append to the previous output; on a duplicate slug
        # the newly scraped row wins.
        if args.resume and args.out.exists():
            df_prev = pd.read_parquet(args.out)
            df_out = pd.concat([df_prev, df_out], ignore_index=True)
            df_out = df_out.drop_duplicates(subset=["Slug"], keep="last")
        df_out.to_parquet(args.out, index=False)
        full_count = df_out["HasFullRecipe"].sum() if "HasFullRecipe" in df_out.columns else "?"
        print(f"Saved {len(df_out)} rows to {args.out} ({full_count} with full recipes)")
    else:
        print("No results — output not written")


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue