kiwi/scripts/pipeline/build_flavorgraph_index.py
pyr0ball 77627cec23 fix: data pipeline — R-vector parser, allrecipes dataset, unique recipe index
- build_recipe_index.py: add _parse_r_vector() for food.com R format, add
  _parse_allrecipes_text() for corbt/all-recipes text format, _row_to_fields()
  dispatcher handles both columnar (food.com) and single-text (all-recipes)
- build_flavorgraph_index.py: switch from graph.json to nodes/edges CSVs
  matching actual FlavorGraph repo structure
- download_datasets.py: switch recipe source to corbt/all-recipes (2.1M
  recipes, 807MB) replacing near-empty AkashPS11/recipes_data_food.com
- 007_recipe_corpus.sql: add UNIQUE constraint on external_id to prevent
  duplicate inserts on pipeline reruns
2026-03-31 21:36:13 -07:00

89 lines
3.1 KiB
Python

"""
Import FlavorGraph compound->ingredient map into flavor_molecules table.
FlavorGraph GitHub: https://github.com/lamypark/FlavorGraph
Download: git clone https://github.com/lamypark/FlavorGraph /tmp/flavorgraph
Usage:
conda run -n cf python scripts/pipeline/build_flavorgraph_index.py \
--db data/kiwi.db \
--flavorgraph-dir /tmp/flavorgraph/input
"""
from __future__ import annotations
import argparse
import json
import sqlite3
from collections import defaultdict
from pathlib import Path
import pandas as pd
def parse_ingredient_nodes(
    nodes_path: Path, edges_path: Path
) -> tuple[dict[str, list[str]], dict[str, str]]:
    """Parse FlavorGraph node/edge CSVs.

    Args:
        nodes_path: CSV with columns ``node_id``, ``name``, ``node_type``.
        edges_path: CSV with columns ``id_1``, ``id_2`` (node ids).

    Returns:
        Tuple of:
        - ingredient name -> list of compound node ids linked to it
        - compound node id -> normalized compound name
    """
    # dtype=str keeps node ids as strings so they match across both files;
    # fillna("") guards against blank cells breaking the string ops below.
    nodes = pd.read_csv(nodes_path, dtype=str).fillna("")
    edges = pd.read_csv(edges_path, dtype=str).fillna("")

    ingredient_ids: dict[str, str] = {}  # node_id -> ingredient_name
    compound_names: dict[str, str] = {}  # node_id -> compound_name
    for _, row in nodes.iterrows():
        nid = row["node_id"]
        # Normalize "Apple_Pie" -> "apple pie" to match recipe-side names.
        name = row["name"].lower().replace("_", " ").strip()
        if row["node_type"] == "ingredient":
            ingredient_ids[nid] = name
        else:
            compound_names[nid] = name

    ingredient_compounds: dict[str, list[str]] = defaultdict(list)
    for _, row in edges.iterrows():
        src, tgt = row["id_1"], row["id_2"]
        # FlavorGraph also contains ingredient-ingredient edges. Only record
        # the opposite endpoint when it is a known compound node; otherwise
        # ingredient node ids would leak into the compound lists.
        if src in ingredient_ids and tgt in compound_names:
            ingredient_compounds[ingredient_ids[src]].append(tgt)
        if tgt in ingredient_ids and src in compound_names:
            ingredient_compounds[ingredient_ids[tgt]].append(src)
    return dict(ingredient_compounds), compound_names
def build(db_path: Path, flavorgraph_dir: Path) -> None:
    """Load FlavorGraph CSVs from *flavorgraph_dir* and index them into *db_path*.

    Side effects:
        - Sets ingredient_profiles.flavor_molecule_ids (JSON list of compound
          node ids) for each ingredient matched by name.
        - Inserts one flavor_molecules row per compound (INSERT OR IGNORE, so
          reruns are safe).
    """
    nodes_path = flavorgraph_dir / "nodes_191120.csv"
    edges_path = flavorgraph_dir / "edges_191120.csv"
    ingredient_map, compound_names = parse_ingredient_nodes(nodes_path, edges_path)

    # Invert ingredient -> compounds into compound -> ingredients.
    compound_ingredients: dict[str, list[str]] = defaultdict(list)
    for ingredient, compounds in ingredient_map.items():
        for cid in compounds:
            compound_ingredients[cid].append(ingredient)

    conn = sqlite3.connect(db_path)
    try:
        # executemany reuses one prepared statement per query instead of a
        # Python-level execute() call per row.
        conn.executemany(
            "UPDATE ingredient_profiles SET flavor_molecule_ids = ? WHERE name = ?",
            [
                (json.dumps(compounds), ingredient)
                for ingredient, compounds in ingredient_map.items()
            ],
        )
        conn.executemany(
            "INSERT OR IGNORE INTO flavor_molecules (compound_id, compound_name, ingredient_names)"
            " VALUES (?, ?, ?)",
            [
                # Fall back to the raw node id when no name was parsed.
                (cid, compound_names.get(cid, cid), json.dumps(ingredients))
                for cid, ingredients in compound_ingredients.items()
            ],
        )
        conn.commit()
    finally:
        conn.close()
    print(f"Indexed {len(ingredient_map)} ingredients, {len(compound_ingredients)} compounds")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--db", required=True, type=Path)
parser.add_argument("--flavorgraph-dir", required=True, type=Path)
args = parser.parse_args()
build(args.db, args.flavorgraph_dir)