# kiwi/scripts/pipeline/purple_carrot/discover_wayback.py

"""
discover_wayback.py — enumerate Purple Carrot recipe slugs via the Wayback Machine.

Strategy:
  1. CDX API  → all archived /api/v2/menus/* URLs (multiple timestamps)
  2. Replay   → fetch each menu's menuItems, extract productPath slugs
  3. CDX API  → all archived /api/v1/products/* URLs (direct slug capture)
  4. CDX API  → /recipe-categories/* HTML pages for older slugs
  5. Deduplicate and write manifest to OUT_FILE

Output (JSONL, one record per recipe; only "menu" records carry metadata,
the other sources leave the descriptive fields empty):
  {"slug": "...", "title": "...", "subtitle": "...", "cook_time": "...",
   "tags": [...], "serving_size": 2, "image_url": "...", "description": "...",
   "wayback_ts": "20260412150557", "source": "menu|product_api|category_page"}

Usage:
  conda run -n cf python -m scripts.pipeline.purple_carrot.discover_wayback
  conda run -n cf python -m scripts.pipeline.purple_carrot.discover_wayback --out /Library/Assets/kiwi/pipeline/pc_slugs.jsonl
"""
from __future__ import annotations

import argparse
import json
import logging
import time
from pathlib import Path
from typing import Any
from urllib.parse import urlencode

import requests

logger = logging.getLogger(__name__)

# Wayback Machine endpoints. Both use HTTPS so queries and replayed content
# are not sent in the clear (the CDX endpoint previously used plain HTTP).
CDX_BASE = "https://web.archive.org/cdx/search/cdx"
WB_BASE = "https://web.archive.org/web"
# Host whose archived captures are enumerated.
PC_HOST = "www.purplecarrot.com"

# Polite delay between Wayback replay fetches (seconds)
REPLAY_DELAY = 1.0
# Pause after each CDX index query (seconds)
CDX_DELAY = 0.5

# Manifest location used when --out is not given.
DEFAULT_OUT = Path("/Library/Assets/kiwi/pipeline/pc_slugs.jsonl")


# ── CDX helpers ───────────────────────────────────────────────────────────────

def cdx_query(url_pattern: str, **kwargs) -> list[dict]:
    """Run a CDX search and return a list of result dicts.

    Args:
        url_pattern: Value sent as the CDX ``url`` parameter (callers pass a
            trailing ``*`` for prefix matches).
        **kwargs: Extra CDX query parameters (e.g. ``limit``). They are merged
            last, so they override the defaults below on key collision.

    Returns:
        One dict per result row, keyed by the names in the response's header
        row. An empty list when the response has no data rows, is not a JSON
        list, or when every attempt fails.
    """
    params = {
        "url": url_pattern,
        "output": "json",
        "fl": "original,timestamp,statuscode",
        "collapse": "urlkey",
        "filter": "statuscode:200",
        **kwargs,
    }
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            resp = requests.get(CDX_BASE, params=params, timeout=30)
            resp.raise_for_status()
            rows = resp.json()
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a body that is not valid JSON.
            logger.warning("CDX attempt %d failed: %s", attempt + 1, exc)
            # Back off only when another attempt will follow; sleeping after
            # the final failure just delays returning the empty result.
            if attempt + 1 < max_attempts:
                time.sleep(2 ** attempt)
            continue

        # The first row is the header; fewer than two rows means no results.
        if not isinstance(rows, list) or len(rows) < 2:
            return []
        headers = rows[0]
        return [dict(zip(headers, row)) for row in rows[1:]]
    return []


def wayback_get(url: str, timestamp: str) -> Any | None:
    """Fetch a Wayback replay of a URL and return parsed JSON (or None).

    Args:
        url: Original (archived) URL to replay.
        timestamp: Wayback capture timestamp placed in the replay URL.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` on HTTP 404 (no retry),
        or when all attempts fail (network error, other status, or a body
        that is not valid JSON).
    """
    replay_url = f"{WB_BASE}/{timestamp}/{url}"
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            resp = requests.get(replay_url, timeout=30)
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code == 404:
                # Not archived: retrying will not help.
                return None
            # Any other status is retried; log it so repeated failures are
            # visible instead of silently ending in None.
            logger.warning(
                "Wayback GET attempt %d for %s returned HTTP %d",
                attempt + 1, url, resp.status_code,
            )
        except (requests.RequestException, ValueError) as exc:
            # ValueError covers a 200 response whose body is not valid JSON.
            logger.warning("Wayback GET attempt %d failed for %s: %s", attempt + 1, url, exc)
        # Back off only when another attempt will follow.
        if attempt + 1 < max_attempts:
            time.sleep(2 ** attempt)
    return None


# ── Slug extraction ───────────────────────────────────────────────────────────

def slug_from_product_path(path: str) -> str | None:
    """Return the final segment of a product path, or None if there is none.

    Example: '/recipe/foo-bar-baz' → 'foo-bar-baz'. Leading and trailing
    slashes are ignored; a falsy or slash-only path yields None.
    """
    if path:
        # Everything after the last "/" once the outer slashes are trimmed.
        _, _, tail = path.strip("/").rpartition("/")
        if tail:
            return tail
    return None


def _menu_item_to_record(item: dict, wayback_ts: str) -> dict | None:
    """Build a manifest record from one menu item, or None if it has no slug.

    The slug is derived from the item's ``productPath``. The remaining fields
    are copied from the item, with missing text fields defaulting to "" and
    missing or empty ``filterTags`` becoming an empty list.
    """
    recipe_slug = slug_from_product_path(item.get("productPath", ""))
    if not recipe_slug:
        return None

    # Keys are inserted in the same order as the manifest schema.
    record: dict = {"slug": recipe_slug}
    for out_key, src_key in (
        ("title", "title"),
        ("subtitle", "subtitle"),
        ("cook_time", "cookTime"),
    ):
        record[out_key] = item.get(src_key, "")
    record["tags"] = item.get("filterTags") or []
    record["serving_size"] = item.get("servingSize")
    record["image_url"] = item.get("imageURL", "")
    record["description"] = item.get("description", "")
    record["wayback_ts"] = wayback_ts
    record["source"] = "menu"
    return record


# ── Discovery passes ──────────────────────────────────────────────────────────

def pass_menus(seen_slugs: set[str]) -> list[dict]:
    """Walk all archived /api/v2/menus/* captures to extract slugs.

    Args:
        seen_slugs: Slugs already known. Mutated in place: every slug added
            to the result is also added here.

    Returns:
        Records (source "menu") for slugs not previously in ``seen_slugs``.
    """
    records: list[dict] = []

    # Find all distinct archived menu URLs
    menu_cdx = cdx_query(f"{PC_HOST}/api/v2/menus/*", limit="500")
    logger.info("CDX: %d archived menu URLs found", len(menu_cdx))
    time.sleep(CDX_DELAY)

    processed_menu_ids: set[str] = set()

    for entry in menu_cdx:
        ts = entry["timestamp"]
        base_url = entry["original"].split("?")[0]
        menu_id = base_url.rstrip("/").split("/")[-1]

        # Skip the listing endpoint, only process individual (numeric) menus
        if not menu_id.isdigit():
            continue
        # The same menu can be archived under several query strings.
        if menu_id in processed_menu_ids:
            continue
        processed_menu_ids.add(menu_id)

        logger.info("Fetching menu %s (ts=%s) ...", menu_id, ts)
        data = wayback_get(base_url + "?logged_out=true", ts)
        time.sleep(REPLAY_DELAY)

        # Archived payloads are not guaranteed to have the expected shape;
        # skip anything that is not {"menuItems": [...]} instead of crashing.
        if not isinstance(data, dict):
            continue
        menu_items = data.get("menuItems")
        if not isinstance(menu_items, list):
            continue

        new_in_menu = 0
        for item in menu_items:
            if not isinstance(item, dict):
                continue
            rec = _menu_item_to_record(item, ts)
            if rec and rec["slug"] not in seen_slugs:
                seen_slugs.add(rec["slug"])
                records.append(rec)
                new_in_menu += 1
                logger.debug("  + %s", rec["slug"])

        # Report this menu's contribution, not the running total of the pass.
        logger.info("  %d new slugs (total so far: %d)", new_in_menu, len(seen_slugs))

    return records


def pass_product_api(seen_slugs: set[str]) -> list[dict]:
    """Pick up any directly archived /api/v1/products/* URLs the menu pass missed.

    No replay fetch is made: the slug is taken from the archived URL itself,
    so the descriptive fields of each record are left empty.

    Args:
        seen_slugs: Slugs already known. Mutated in place: every slug added
            to the result is also added here.

    Returns:
        Records (source "product_api") for slugs not previously seen.
    """
    records: list[dict] = []

    product_cdx = cdx_query(f"{PC_HOST}/api/v1/products/*", limit="5000")
    logger.info("CDX: %d archived product API URLs found", len(product_cdx))
    time.sleep(CDX_DELAY)

    for entry in product_cdx:
        # Drop any query string / fragment first (as the other passes do);
        # otherwise "foo?logged_out=true" is recorded as a distinct slug.
        path = entry["original"].split("?", 1)[0].split("#", 1)[0]
        slug = path.rstrip("/").split("/")[-1]
        # "products" is the collection endpoint itself, not a recipe slug.
        if not slug or slug == "products" or slug in seen_slugs:
            continue
        seen_slugs.add(slug)
        records.append({
            "slug": slug,
            "title": "",
            "subtitle": "",
            "cook_time": "",
            "tags": [],
            "serving_size": None,
            "image_url": "",
            "description": "",
            "wayback_ts": entry["timestamp"],
            "source": "product_api",
        })

    logger.info("product_api pass: %d new slugs", len(records))
    return records


def pass_category_pages(seen_slugs: set[str]) -> list[dict]:
    """Parse archived recipe-categories HTML pages for slugs not in the API.

    Category pages are rendered SSR/with inline JSON state on older captures,
    so we do a simple regex scan for /recipe/<slug> patterns.

    Args:
        seen_slugs: Slugs already known. Mutated in place: every slug added
            to the result is also added here.

    Returns:
        Records (source "category_page") for slugs not previously seen; the
        descriptive fields are left empty.
    """
    import re

    records: list[dict] = []
    # NOTE(review): the pattern requires a quote or whitespace immediately
    # before "/recipe/". If Wayback rewrites in-page links to an absolute
    # /web/<ts>/... form, those hrefs would not match — confirm on a capture.
    SLUG_RE = re.compile(r'["\s]/recipe/([a-z0-9][a-z0-9\-]{3,})["\s/?]')

    cat_cdx = cdx_query(f"{PC_HOST}/recipe-categories/*", limit="200")
    logger.info("CDX: %d archived category pages found", len(cat_cdx))
    time.sleep(CDX_DELAY)

    seen_category_urls: set[str] = set()

    for entry in cat_cdx:
        # Captures differing only by query string count as the same page.
        url = entry["original"].split("?")[0]
        if url in seen_category_urls:
            continue
        seen_category_urls.add(url)

        replay_url = f"{WB_BASE}/{entry['timestamp']}/{url}"
        try:
            resp = requests.get(replay_url, timeout=30)
        except requests.RequestException as exc:
            logger.warning("Category page fetch failed: %s", exc)
            continue
        finally:
            # Stay polite after every request, including failed ones.
            time.sleep(REPLAY_DELAY)

        if resp.status_code != 200:
            logger.debug("Category page %s returned HTTP %d", url, resp.status_code)
            continue

        for slug in SLUG_RE.findall(resp.text):
            if slug in seen_slugs:
                continue
            seen_slugs.add(slug)
            records.append({
                "slug": slug,
                "title": "",
                "subtitle": "",
                "cook_time": "",
                "tags": [],
                "serving_size": None,
                "image_url": "",
                "description": "",
                "wayback_ts": entry["timestamp"],
                "source": "category_page",
            })

    logger.info("category_pages pass: %d new slugs", len(records))
    return records


# ── Main ──────────────────────────────────────────────────────────────────────

def discover(out_file: Path) -> None:
    """Run all discovery passes and append newly found records to ``out_file``.

    Slugs already present in ``out_file`` are loaded first so that reruns are
    incremental: the passes skip them and only new records are appended.

    Args:
        out_file: JSONL manifest path. Parent directories are created if
            missing; the file is opened in append mode.
    """
    seen: set[str] = set()

    # Load previously discovered slugs so reruns are incremental
    existing_count = 0
    needs_newline = False
    if out_file.exists():
        with open(out_file, encoding="utf-8") as f:
            for lineno, raw in enumerate(f, 1):
                # Remember whether the file currently ends mid-line.
                needs_newline = not raw.endswith("\n")
                line = raw.strip()
                if not line:
                    continue
                try:
                    seen.add(json.loads(line)["slug"])
                except (ValueError, KeyError, TypeError) as exc:
                    # A truncated or corrupt line (e.g. from an interrupted
                    # run) should not block every future rerun.
                    logger.warning(
                        "Skipping malformed line %d in %s: %s", lineno, out_file, exc
                    )
                    continue
                existing_count += 1
        logger.info("Loaded %d existing slugs from %s", len(seen), out_file)

    # Each pass adds the slugs it finds to `seen`, so later passes skip them.
    new_records: list[dict] = []
    new_records += pass_menus(seen)
    new_records += pass_product_api(seen)
    new_records += pass_category_pages(seen)

    out_file.parent.mkdir(parents=True, exist_ok=True)
    with open(out_file, "a", encoding="utf-8") as f:
        # Avoid gluing the first new record onto an unterminated last line.
        if needs_newline and new_records:
            f.write("\n")
        for rec in new_records:
            f.write(json.dumps(rec) + "\n")

    total = existing_count + len(new_records)
    logger.info(
        "Done. %d new slugs written to %s (%d total).",
        len(new_records), out_file, total,
    )


def main() -> None:
    """Command-line entry point: parse arguments, set up logging, run discovery."""
    cli = argparse.ArgumentParser(description="Discover Purple Carrot recipe slugs via Wayback")
    cli.add_argument(
        "--out",
        type=Path,
        default=DEFAULT_OUT,
        help=f"Output JSONL manifest (default: {DEFAULT_OUT})",
    )
    cli.add_argument("--debug", action="store_true")
    opts = cli.parse_args()

    # --debug raises verbosity from INFO to DEBUG.
    log_level = logging.INFO
    if opts.debug:
        log_level = logging.DEBUG
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )

    discover(opts.out)


# Run the CLI only when executed as a script (or via ``python -m``), not on import.
if __name__ == "__main__":
    main()