feat: data pipeline -- FlavorGraph molecule index builder
This commit is contained in:
parent
97203313c1
commit
59b6a8265f
2 changed files with 97 additions and 0 deletions
79
scripts/pipeline/build_flavorgraph_index.py
Normal file
79
scripts/pipeline/build_flavorgraph_index.py
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
"""
|
||||||
|
Import FlavorGraph compound->ingredient map into flavor_molecules table.
|
||||||
|
|
||||||
|
FlavorGraph GitHub: https://github.com/lamypark/FlavorGraph
|
||||||
|
Download: git clone https://github.com/lamypark/FlavorGraph /tmp/flavorgraph
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/pipeline/build_flavorgraph_index.py \
|
||||||
|
--db /path/to/kiwi.db \
|
||||||
|
--graph-json /tmp/flavorgraph/data/graph.json
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def parse_ingredient_nodes(graph: dict) -> dict[str, list[str]]:
    """Return {ingredient_name: [compound_id, ...]} from a FlavorGraph JSON.

    Args:
        graph: Parsed graph JSON. Reads the "nodes" list (dicts with "id",
            "type" and "name") and the "links" list (dicts with "source"
            and "target" node ids). Either list may be missing.

    Returns:
        A plain dict mapping each lower-cased ingredient name to the ids of
        the non-ingredient nodes it is linked to, in first-seen order and
        without duplicates. Ingredients with no such link are omitted.

    Raises:
        KeyError: If a node of type "ingredient" lacks "id" or "name".
    """
    # node_id -> lower-cased ingredient name
    ingredient_ids: dict[str, str] = {
        node["id"]: node["name"].lower()
        for node in graph.get("nodes", [])
        if node.get("type") == "ingredient"
    }

    # Inner dicts act as insertion-ordered sets so that repeated or mirrored
    # links do not produce duplicate compound ids.
    linked: dict[str, dict[str, None]] = defaultdict(dict)

    for link in graph.get("links", []):
        src, tgt = link.get("source", ""), link.get("target", "")
        src_is_ingredient = src in ingredient_ids
        tgt_is_ingredient = tgt in ingredient_ids

        # Exactly one endpoint must be an ingredient. A link between two
        # ingredient nodes would otherwise store ingredient ids as compound
        # ids; a link with no ingredient endpoint has nothing to attach to.
        if src_is_ingredient == tgt_is_ingredient:
            continue

        if src_is_ingredient:
            linked[ingredient_ids[src]][tgt] = None
        else:
            linked[ingredient_ids[tgt]][src] = None

    return {name: list(compound_ids) for name, compound_ids in linked.items()}
|
||||||
|
|
||||||
|
|
||||||
|
def build(db_path: Path, graph_json_path: Path) -> None:
    """Load a FlavorGraph JSON file and write its molecule index to SQLite.

    Two writes happen in a single transaction:

    * ``ingredient_profiles.flavor_molecule_ids`` is set to the JSON list of
      linked compound ids for every row whose ``name`` equals a (lower-cased)
      ingredient name from the graph.
    * One row per compound is inserted into ``flavor_molecules``; rows that
      conflict with existing ones are left untouched (``INSERT OR IGNORE``).

    Neither table is created here, so both must already exist in the database.

    Args:
        db_path: Path of the SQLite database to update.
        graph_json_path: Path of the FlavorGraph graph JSON file.
    """
    # Read as UTF-8 explicitly rather than relying on the locale's default
    # encoding, which differs between platforms.
    graph = json.loads(graph_json_path.read_text(encoding="utf-8"))
    ingredient_map = parse_ingredient_nodes(graph)

    # compound node id -> display name
    compound_names: dict[str, str] = {
        node["id"]: node["name"]
        for node in graph.get("nodes", [])
        if node.get("type") == "compound"
    }

    # Invert ingredient -> compounds into compound -> ingredients.
    compound_ingredients: dict[str, list[str]] = defaultdict(list)
    for ingredient, compounds in ingredient_map.items():
        for cid in compounds:
            compound_ingredients[cid].append(ingredient)

    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # on error; it does not close the connection, hence the finally.
        with conn:
            conn.executemany(
                """
                UPDATE ingredient_profiles
                SET flavor_molecule_ids = ?
                WHERE name = ?
                """,
                (
                    (json.dumps(compounds), ingredient)
                    for ingredient, compounds in ingredient_map.items()
                ),
            )
            conn.executemany(
                """
                INSERT OR IGNORE INTO flavor_molecules (compound_id, compound_name, ingredient_names)
                VALUES (?, ?, ?)
                """,
                (
                    # Fall back to the id when the graph has no compound
                    # node carrying a name for it.
                    (cid, compound_names.get(cid, cid), json.dumps(ingredients))
                    for cid, ingredients in compound_ingredients.items()
                ),
            )
    finally:
        conn.close()

    print(f"Indexed {len(ingredient_map)} ingredients, {len(compound_ingredients)} compounds")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: both path options are mandatory and are
    # converted to pathlib.Path by argparse.
    cli = argparse.ArgumentParser()
    for flag in ("--db", "--graph-json"):
        cli.add_argument(flag, required=True, type=Path)
    opts = cli.parse_args()
    build(db_path=opts.db, graph_json_path=opts.graph_json)
|
||||||
18
tests/pipeline/test_build_flavorgraph_index.py
Normal file
18
tests/pipeline/test_build_flavorgraph_index.py
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
def test_parse_flavorgraph_node():
    """Two ingredients linked to one compound must each list that compound."""
    from scripts.pipeline.build_flavorgraph_index import parse_ingredient_nodes

    nodes = [
        {"id": "I_beef", "type": "ingredient", "name": "beef"},
        {"id": "C_pyrazine", "type": "compound", "name": "pyrazine"},
        {"id": "I_mushroom", "type": "ingredient", "name": "mushroom"},
    ]
    links = [
        {"source": ingredient_id, "target": "C_pyrazine"}
        for ingredient_id in ("I_beef", "I_mushroom")
    ]

    result = parse_ingredient_nodes({"nodes": nodes, "links": links})

    for ingredient in ("beef", "mushroom"):
        assert ingredient in result
        assert "C_pyrazine" in result[ingredient]
|
||||||
Loading…
Reference in a new issue