feat: data pipeline -- FlavorGraph molecule index builder
This commit is contained in:
parent
97203313c1
commit
59b6a8265f
2 changed files with 97 additions and 0 deletions
79
scripts/pipeline/build_flavorgraph_index.py
Normal file
79
scripts/pipeline/build_flavorgraph_index.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
"""
|
||||
Import FlavorGraph compound->ingredient map into flavor_molecules table.
|
||||
|
||||
FlavorGraph GitHub: https://github.com/lamypark/FlavorGraph
|
||||
Download: git clone https://github.com/lamypark/FlavorGraph /tmp/flavorgraph
|
||||
|
||||
Usage:
|
||||
conda run -n job-seeker python scripts/pipeline/build_flavorgraph_index.py \
|
||||
--db /path/to/kiwi.db \
|
||||
--graph-json /tmp/flavorgraph/data/graph.json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import sqlite3
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_ingredient_nodes(graph: dict) -> dict[str, list[str]]:
    """Return {ingredient_name: [compound_id, ...]} from a FlavorGraph JSON.

    Args:
        graph: Parsed graph with a "nodes" list (dicts carrying "id", "type"
            and "name") and a "links" list (dicts carrying "source" and
            "target" node ids). Either key may be absent.

    Returns:
        Lower-cased ingredient name mapped to the ids of the non-ingredient
        nodes it is linked to, in first-seen order and without duplicates.
        Ingredients that have no such link are omitted.
    """
    # node_id -> lower-cased ingredient name
    ingredient_ids: dict[str, str] = {
        node["id"]: node["name"].lower()
        for node in graph.get("nodes", [])
        if node.get("type") == "ingredient"
    }

    # The inner dicts act as insertion-ordered sets, so a link that is
    # repeated (or listed in both directions) contributes its id only once.
    linked: dict[str, dict[str, None]] = defaultdict(dict)

    for link in graph.get("links", []):
        src, tgt = link.get("source", ""), link.get("target", "")
        src_name = ingredient_ids.get(src)
        tgt_name = ingredient_ids.get(tgt)

        if src_name is not None and tgt_name is not None:
            # Ingredient<->ingredient edge: neither endpoint is a compound,
            # so recording it would put ingredient ids in the compound lists.
            continue
        if src_name is not None:
            linked[src_name][tgt] = None
        elif tgt_name is not None:
            linked[tgt_name][src] = None

    return {name: list(ids) for name, ids in linked.items()}
|
||||
|
||||
|
||||
def build(db_path: Path, graph_json_path: Path) -> None:
    """Load a FlavorGraph JSON file and index its compounds into SQLite.

    Two writes are made in a single transaction:

    * ``ingredient_profiles.flavor_molecule_ids`` is set to the JSON list of
      linked compound ids for every row whose ``name`` equals a (lower-cased)
      ingredient name from the graph. Ingredients with no matching row are
      silently skipped by the UPDATE.
    * One ``flavor_molecules`` row is inserted per compound id with
      ``INSERT OR IGNORE``, so a row that conflicts with an existing one is
      left as it is rather than refreshed.

    Args:
        db_path: SQLite database file; both tables must already exist.
        graph_json_path: FlavorGraph graph JSON with "nodes" and "links".

    Raises:
        OSError: If the graph file cannot be read.
        json.JSONDecodeError: If the graph file is not valid JSON.
        sqlite3.Error: If a statement fails; the transaction is rolled back.
    """
    # JSON is UTF-8 by specification; do not rely on the locale default.
    graph = json.loads(graph_json_path.read_text(encoding="utf-8"))
    ingredient_map = parse_ingredient_nodes(graph)

    # compound node id -> display name
    compound_names: dict[str, str] = {
        node["id"]: node["name"]
        for node in graph.get("nodes", [])
        if node.get("type") == "compound"
    }

    # Invert ingredient -> compounds into compound -> ingredients.
    compound_ingredients: dict[str, list[str]] = defaultdict(list)
    for ingredient, compounds in ingredient_map.items():
        for cid in compounds:
            compound_ingredients[cid].append(ingredient)

    conn = sqlite3.connect(db_path)
    try:
        # The connection context manager commits on success and rolls back
        # on error; it does not close, hence the surrounding try/finally.
        with conn:
            conn.executemany(
                """
                UPDATE ingredient_profiles
                SET flavor_molecule_ids = ?
                WHERE name = ?
                """,
                (
                    (json.dumps(compounds), ingredient)
                    for ingredient, compounds in ingredient_map.items()
                ),
            )
            conn.executemany(
                """
                INSERT OR IGNORE INTO flavor_molecules (compound_id, compound_name, ingredient_names)
                VALUES (?, ?, ?)
                """,
                (
                    # Fall back to the raw id when the graph has no compound
                    # node carrying a name for it.
                    (cid, compound_names.get(cid, cid), json.dumps(ingredients))
                    for cid, ingredients in compound_ingredients.items()
                ),
            )
    finally:
        conn.close()

    print(f"Indexed {len(ingredient_map)} ingredients, {len(compound_ingredients)} compounds")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: both path options are mandatory and are
    # converted to Path objects before being handed to build().
    cli = argparse.ArgumentParser()
    for flag in ("--db", "--graph-json"):
        cli.add_argument(flag, required=True, type=Path)
    opts = cli.parse_args()
    build(opts.db, opts.graph_json)
|
||||
18
tests/pipeline/test_build_flavorgraph_index.py
Normal file
18
tests/pipeline/test_build_flavorgraph_index.py
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
def test_parse_flavorgraph_node():
    """Two ingredients linked to one compound each list that compound's id."""
    from scripts.pipeline.build_flavorgraph_index import parse_ingredient_nodes

    nodes = [
        {"id": "I_beef", "type": "ingredient", "name": "beef"},
        {"id": "C_pyrazine", "type": "compound", "name": "pyrazine"},
        {"id": "I_mushroom", "type": "ingredient", "name": "mushroom"},
    ]
    links = [
        {"source": "I_beef", "target": "C_pyrazine"},
        {"source": "I_mushroom", "target": "C_pyrazine"},
    ]

    result = parse_ingredient_nodes({"nodes": nodes, "links": links})

    # Check presence before indexing so a missing key fails as an assertion.
    for ingredient in ("beef", "mushroom"):
        assert ingredient in result
        assert "C_pyrazine" in result[ingredient]
|
||||
Loading…
Reference in a new issue