- build_recipe_index.py: add _parse_r_vector() for the food.com R vector format and _parse_allrecipes_text() for the corbt/all-recipes text format; the _row_to_fields() dispatcher handles both columnar (food.com) and single-text (all-recipes) rows.
- build_flavorgraph_index.py: switch from graph.json to the nodes/edges CSVs, matching the actual FlavorGraph repo structure.
- download_datasets.py: switch the recipe source to corbt/all-recipes (2.1M recipes, 807MB), replacing the near-empty AkashPS11/recipes_data_food.com.
- 007_recipe_corpus.sql: add a UNIQUE constraint on external_id to prevent duplicate inserts on pipeline reruns.
89 lines
3.1 KiB
Python
89 lines
3.1 KiB
Python
"""
|
|
Import FlavorGraph compound->ingredient map into flavor_molecules table.
|
|
|
|
FlavorGraph GitHub: https://github.com/lamypark/FlavorGraph
|
|
Download: git clone https://github.com/lamypark/FlavorGraph /tmp/flavorgraph
|
|
|
|
Usage:
|
|
conda run -n cf python scripts/pipeline/build_flavorgraph_index.py \
|
|
--db data/kiwi.db \
|
|
--flavorgraph-dir /tmp/flavorgraph/input
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import json
|
|
import sqlite3
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
|
|
def parse_ingredient_nodes(
    nodes_path: Path, edges_path: Path
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Parse FlavorGraph CSVs → (ingredient→compounds, compound→name).

    Args:
        nodes_path: CSV with ``node_id``, ``name`` and ``node_type`` columns.
        edges_path: CSV with ``id_1`` and ``id_2`` columns holding node ids.

    Returns:
        A pair ``(ingredient_compounds, compound_names)``:

        * ``ingredient_compounds`` maps a normalised ingredient name to the
          node ids linked to it by an edge, in edge-file order. Only edges
          with exactly one ingredient endpoint contribute, so
          ingredient↔ingredient edges never leak ingredient node ids into
          the compound lists.
        * ``compound_names`` maps the node id of every non-ingredient node
          to its normalised name.

    Raises:
        KeyError: if a required column is missing from either CSV.
    """
    # Read everything as strings so node ids compare equal across both files.
    nodes = pd.read_csv(nodes_path, dtype=str).fillna("")
    edges = pd.read_csv(edges_path, dtype=str).fillna("")

    ingredient_ids: dict[str, str] = {}  # node_id -> ingredient_name
    compound_names: dict[str, str] = {}  # node_id -> compound_name

    for nid, raw_name, node_type in zip(
        nodes["node_id"], nodes["name"], nodes["node_type"]
    ):
        # Normalise "Garlic_Clove" -> "garlic clove".
        name = raw_name.lower().replace("_", " ").strip()
        if node_type == "ingredient":
            ingredient_ids[nid] = name
        else:
            compound_names[nid] = name

    ingredient_compounds: dict[str, list[str]] = defaultdict(list)
    for src, tgt in zip(edges["id_1"], edges["id_2"]):
        src_ingredient = ingredient_ids.get(src)
        tgt_ingredient = ingredient_ids.get(tgt)
        # Record the edge only when exactly one endpoint is an ingredient;
        # the other endpoint is then the compound side of the relation.
        if src_ingredient is not None and tgt_ingredient is None:
            ingredient_compounds[src_ingredient].append(tgt)
        elif tgt_ingredient is not None and src_ingredient is None:
            ingredient_compounds[tgt_ingredient].append(src)

    return dict(ingredient_compounds), compound_names
|
|
|
|
|
|
def build(db_path: Path, flavorgraph_dir: Path) -> None:
    """Load the FlavorGraph CSVs and write the flavor index into SQLite.

    Reads ``nodes_191120.csv`` and ``edges_191120.csv`` from
    *flavorgraph_dir*, then, in one transaction on the database at *db_path*:

    * sets ``ingredient_profiles.flavor_molecule_ids`` to the JSON list of
      linked ids for every row whose ``name`` matches a parsed ingredient;
    * inserts one ``flavor_molecules`` row per linked id (``INSERT OR
      IGNORE``) with the JSON list of ingredient names that reference it.

    Prints a one-line summary after the connection has been closed.
    """
    ingredient_map, compound_names = parse_ingredient_nodes(
        flavorgraph_dir / "nodes_191120.csv",
        flavorgraph_dir / "edges_191120.csv",
    )

    # Invert ingredient -> [compound ids] into compound id -> [ingredients].
    members_by_compound: dict[str, list[str]] = {}
    for ingredient_name, linked_ids in ingredient_map.items():
        for compound_id in linked_ids:
            members_by_compound.setdefault(compound_id, []).append(ingredient_name)

    update_sql = "UPDATE ingredient_profiles SET flavor_molecule_ids = ? WHERE name = ?"
    insert_sql = (
        "INSERT OR IGNORE INTO flavor_molecules (compound_id, compound_name, ingredient_names)"
        " VALUES (?, ?, ?)"
    )

    db = sqlite3.connect(db_path)
    try:
        db.executemany(
            update_sql,
            (
                (json.dumps(linked_ids), ingredient_name)
                for ingredient_name, linked_ids in ingredient_map.items()
            ),
        )
        db.executemany(
            insert_sql,
            (
                # Fall back to the raw id when the node has no recorded name.
                (compound_id, compound_names.get(compound_id, compound_id), json.dumps(members))
                for compound_id, members in members_by_compound.items()
            ),
        )
        db.commit()
    finally:
        db.close()

    print(f"Indexed {len(ingredient_map)} ingredients, {len(members_by_compound)} compounds")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: both options are required filesystem paths.
    cli = argparse.ArgumentParser()
    for flag in ("--db", "--flavorgraph-dir"):
        cli.add_argument(flag, required=True, type=Path)
    options = cli.parse_args()
    build(options.db, options.flavorgraph_dir)
|