From 59b6a8265f51961e38c1c50a949385f25d2eab17 Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Mon, 30 Mar 2026 22:46:53 -0700
Subject: [PATCH] feat: data pipeline -- FlavorGraph molecule index builder

---
 scripts/pipeline/build_flavorgraph_index.py   | 114 ++++++++++++++++++
 .../pipeline/test_build_flavorgraph_index.py  |  35 ++++++
 2 files changed, 149 insertions(+)
 create mode 100644 scripts/pipeline/build_flavorgraph_index.py
 create mode 100644 tests/pipeline/test_build_flavorgraph_index.py

diff --git a/scripts/pipeline/build_flavorgraph_index.py b/scripts/pipeline/build_flavorgraph_index.py
new file mode 100644
index 0000000..d831f85
--- /dev/null
+++ b/scripts/pipeline/build_flavorgraph_index.py
@@ -0,0 +1,114 @@
+"""
+Import FlavorGraph compound->ingredient map into flavor_molecules table.
+
+FlavorGraph GitHub: https://github.com/lamypark/FlavorGraph
+Download: git clone https://github.com/lamypark/FlavorGraph /tmp/flavorgraph
+
+Usage:
+    conda run -n job-seeker python scripts/pipeline/build_flavorgraph_index.py \
+        --db /path/to/kiwi.db \
+        --graph-json /tmp/flavorgraph/data/graph.json
+"""
+from __future__ import annotations
+import argparse
+import json
+import sqlite3
+from collections import defaultdict
+from contextlib import closing
+from pathlib import Path
+
+
+def parse_ingredient_nodes(graph: dict) -> dict[str, list[str]]:
+    """Return {ingredient_name: [compound_id, ...]} from a FlavorGraph JSON.
+
+    Ingredient names are lower-cased. Each link adds its non-ingredient
+    endpoint to the ingredient at the other end. Links joining two
+    ingredients are skipped so ingredient node IDs are never reported as
+    compounds, and repeated links yield each compound ID only once, in
+    first-seen order.
+    """
+    ingredient_ids: dict[str, str] = {  # node_id -> ingredient_name
+        node["id"]: node["name"].lower()
+        for node in graph.get("nodes", [])
+        if node.get("type") == "ingredient"
+    }
+
+    # Inner dicts serve as insertion-ordered sets of compound IDs.
+    ingredient_compounds: dict[str, dict[str, None]] = defaultdict(dict)
+    for link in graph.get("links", []):
+        src, tgt = link.get("source"), link.get("target")
+        if src is None or tgt is None:
+            continue  # dangling link: no second endpoint to record
+        src_is_ingredient = src in ingredient_ids
+        if src_is_ingredient == (tgt in ingredient_ids):
+            continue  # neither end, or both ends, is an ingredient
+        if src_is_ingredient:
+            ingredient_compounds[ingredient_ids[src]][tgt] = None
+        else:
+            ingredient_compounds[ingredient_ids[tgt]][src] = None
+
+    return {name: list(ids) for name, ids in ingredient_compounds.items()}
+
+
+def build(db_path: Path, graph_json_path: Path) -> None:
+    """Load a FlavorGraph JSON file and write its molecule index to SQLite.
+
+    Sets ``ingredient_profiles.flavor_molecule_ids`` for every ingredient
+    name found in the graph and inserts one ``flavor_molecules`` row per
+    compound. ``INSERT OR IGNORE`` skips rows that conflict with existing
+    ones. Both tables must already exist in the database.
+    """
+    graph = json.loads(graph_json_path.read_text(encoding="utf-8"))
+    ingredient_map = parse_ingredient_nodes(graph)
+
+    compound_names: dict[str, str] = {
+        node["id"]: node["name"]
+        for node in graph.get("nodes", [])
+        if node.get("type") == "compound"
+    }
+
+    # Invert ingredient -> compounds into compound -> ingredients.
+    compound_ingredients: dict[str, list[str]] = defaultdict(list)
+    for ingredient, compounds in ingredient_map.items():
+        for cid in compounds:
+            compound_ingredients[cid].append(ingredient)
+
+    # closing() releases the connection even if a statement fails; the
+    # connection's own context commits on success and rolls back on error.
+    with closing(sqlite3.connect(db_path)) as conn, conn:
+        conn.executemany(
+            """
+            UPDATE ingredient_profiles
+            SET flavor_molecule_ids = ?
+            WHERE name = ?
+            """,
+            (
+                (json.dumps(compounds), ingredient)
+                for ingredient, compounds in ingredient_map.items()
+            ),
+        )
+        conn.executemany(
+            """
+            INSERT OR IGNORE INTO flavor_molecules (compound_id, compound_name, ingredient_names)
+            VALUES (?, ?, ?)
+            """,
+            (
+                (cid, compound_names.get(cid, cid), json.dumps(ingredients))
+                for cid, ingredients in compound_ingredients.items()
+            ),
+        )
+
+    print(f"Indexed {len(ingredient_map)} ingredients, {len(compound_ingredients)} compounds")
+
+
+def main() -> None:
+    """Parse command-line arguments and run the index build."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--db", required=True, type=Path)
+    parser.add_argument("--graph-json", required=True, type=Path)
+    args = parser.parse_args()
+    build(args.db, args.graph_json)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/pipeline/test_build_flavorgraph_index.py b/tests/pipeline/test_build_flavorgraph_index.py
new file mode 100644
index 0000000..febf381
--- /dev/null
+++ b/tests/pipeline/test_build_flavorgraph_index.py
@@ -0,0 +1,35 @@
+def test_parse_flavorgraph_node():
+    from scripts.pipeline.build_flavorgraph_index import parse_ingredient_nodes
+    sample = {
+        "nodes": [
+            {"id": "I_beef", "type": "ingredient", "name": "beef"},
+            {"id": "C_pyrazine", "type": "compound", "name": "pyrazine"},
+            {"id": "I_mushroom", "type": "ingredient", "name": "mushroom"},
+        ],
+        "links": [
+            {"source": "I_beef", "target": "C_pyrazine"},
+            {"source": "I_mushroom", "target": "C_pyrazine"},
+        ],
+    }
+    result = parse_ingredient_nodes(sample)
+    assert "beef" in result
+    assert "C_pyrazine" in result["beef"]
+    assert "mushroom" in result
+    assert "C_pyrazine" in result["mushroom"]
+
+
+def test_parse_skips_ingredient_links_and_duplicates():
+    from scripts.pipeline.build_flavorgraph_index import parse_ingredient_nodes
+    sample = {
+        "nodes": [
+            {"id": "I_beef", "type": "ingredient", "name": "Beef"},
+            {"id": "C_pyrazine", "type": "compound", "name": "pyrazine"},
+            {"id": "I_mushroom", "type": "ingredient", "name": "mushroom"},
+        ],
+        "links": [
+            {"source": "I_beef", "target": "I_mushroom"},
+            {"source": "I_beef", "target": "C_pyrazine"},
+            {"source": "C_pyrazine", "target": "I_beef"},
+        ],
+    }
+    assert parse_ingredient_nodes(sample) == {"beef": ["C_pyrazine"]}