- build_recipe_index.py: add _parse_r_vector() for food.com's R-style vector format and _parse_allrecipes_text() for the corbt/all-recipes single-text format; the _row_to_fields() dispatcher handles both columnar (food.com) and single-text (all-recipes) rows (see the sketches after this list)
- build_flavorgraph_index.py: switch from graph.json to the nodes/edges CSVs matching the actual FlavorGraph repo structure
- download_datasets.py: switch the recipe source to corbt/all-recipes (2.1M recipes, 807 MB), replacing the near-empty AkashPS11/recipes_data_food.com
- 007_recipe_corpus.sql: add a UNIQUE constraint on external_id to prevent duplicate inserts on pipeline reruns (idempotent-insert sketch below)
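
The parsing helpers named above live in build_recipe_index.py and are not part of the file shown below. A minimal sketch of the approach, assuming food.com rows carry R-style c("...") vectors in columns named like Name / RecipeIngredientParts / RecipeInstructions, and that each corbt/all-recipes row is a single "input" text field with Ingredients:/Directions: sections (all column names and the text layout here are assumptions, not verified schemas):

```python
from __future__ import annotations
import re

# Matches the quoted items inside an R character vector: c("flour", "sugar", ...)
_R_QUOTED = re.compile(r'"((?:[^"\\]|\\.)*)"')

def _parse_r_vector(raw: str) -> list[str]:
    """Parse a food.com R-style vector string into a Python list of strings."""
    if not raw:
        return []
    if not raw.strip().startswith("c("):
        return [raw.strip()]  # bare single value without the c(...) wrapper
    return [m.group(1) for m in _R_QUOTED.finditer(raw)]

def _parse_allrecipes_text(raw: str) -> dict:
    """Split a single-text all-recipes record into title / ingredients / directions."""
    title, ingredients, directions = "", [], []
    section = "title"
    for line in raw.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        lower = stripped.lower()
        if lower.startswith("ingredients"):      # assumed literal "Ingredients:" header
            section = "ingredients"
        elif lower.startswith("directions"):     # assumed literal "Directions:" header
            section = "directions"
        elif section == "title" and not title:
            title = stripped
        elif section == "ingredients":
            ingredients.append(stripped.lstrip("- "))
        elif section == "directions":
            directions.append(stripped.lstrip("- "))
    return {"title": title, "ingredients": ingredients, "directions": directions}

def _row_to_fields(row: dict) -> dict:
    """Dispatch on row shape: columnar food.com rows vs. single-text all-recipes rows."""
    if "RecipeIngredientParts" in row:           # columnar food.com schema (assumed column names)
        return {
            "title": row.get("Name", ""),
            "ingredients": _parse_r_vector(row.get("RecipeIngredientParts", "")),
            "directions": _parse_r_vector(row.get("RecipeInstructions", "")),
        }
    return _parse_allrecipes_text(row.get("input", ""))  # assumed single-text field name
```

The rerun-idempotency idea behind the 007_recipe_corpus.sql change, sketched here with SQLite and a hypothetical recipes table (everything except the external_id column is an assumption; the real migration is plain SQL):

```python
import sqlite3

conn = sqlite3.connect(":memory:")  # stand-in engine; the real migration is 007_recipe_corpus.sql
conn.execute("""
    CREATE TABLE IF NOT EXISTS recipes (
        id          INTEGER PRIMARY KEY,
        external_id TEXT NOT NULL UNIQUE,  -- the new constraint: one row per source recipe
        title       TEXT,
        body        TEXT
    )
""")

row = ("allrecipes:12345", "Example title", "example body")  # hypothetical values
# INSERT OR IGNORE (Postgres: ON CONFLICT (external_id) DO NOTHING) makes reruns no-ops
# for rows that were already ingested, instead of piling up duplicates.
for _ in range(2):  # second pass simulates a pipeline rerun
    conn.execute("INSERT OR IGNORE INTO recipes (external_id, title, body) VALUES (?, ?, ?)", row)
conn.commit()

count = conn.execute("SELECT COUNT(*) FROM recipes").fetchone()[0]
print(count)  # 1 — the rerun did not create a duplicate
```

With the UNIQUE constraint in place, rerunning the pipeline skips rows whose external_id was already ingested instead of accumulating duplicates.
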
scripts/pipeline/download_datasets.py (44 lines, 1.6 KiB, Python)
"""
|
|
Download recipe engine datasets from HuggingFace.
|
|
|
|
Usage:
|
|
conda run -n cf python scripts/pipeline/download_datasets.py --data-dir data/pipeline
|
|
|
|
Downloads:
|
|
- corbt/all-recipes (no license) → data/pipeline/recipes_allrecipes.parquet [2.1M recipes]
|
|
- omid5/usda-fdc-foods-cleaned (CC0) → data/pipeline/usda_fdc_cleaned.parquet
|
|
- jacktol/usda-branded-food-data (MIT) → data/pipeline/usda_branded.parquet
|
|
- lishuyang/recipepairs (GPL-3.0 ⚠) → data/pipeline/recipepairs.parquet [derive only, don't ship]
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
from pathlib import Path
|
|
from datasets import load_dataset
|
|
|
|
|
|
DATASETS = [
|
|
("corbt/all-recipes", "train", "recipes_allrecipes.parquet"),
|
|
("omid5/usda-fdc-foods-cleaned", "train", "usda_fdc_cleaned.parquet"),
|
|
("jacktol/usda-branded-food-data","train", "usda_branded.parquet"),
|
|
("lishuyang/recipepairs", "train", "recipepairs.parquet"),
|
|
]
|
|
|
|
|
|
def download_all(data_dir: Path) -> None:
|
|
data_dir.mkdir(parents=True, exist_ok=True)
|
|
for hf_path, split, filename in DATASETS:
|
|
out = data_dir / filename
|
|
if out.exists():
|
|
print(f" skip {filename} (already exists)")
|
|
continue
|
|
print(f" downloading {hf_path} ...")
|
|
ds = load_dataset(hf_path, split=split)
|
|
ds.to_parquet(str(out))
|
|
print(f" saved → {out}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--data-dir", required=True, type=Path)
|
|
args = parser.parse_args()
|
|
download_all(args.data_dir)
|