fix: pipeline scripts — connection safety, remove unused recipes_path arg, fix inserted counter, pre-load profile index
This commit is contained in:
parent
e57ae74e27
commit
e44d36e32f
2 changed files with 125 additions and 115 deletions
|
|
@ -55,53 +55,74 @@ def compute_element_coverage(profiles: list[dict]) -> dict[str, float]:
|
||||||
|
|
||||||
def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
|
def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
|
||||||
conn = sqlite3.connect(db_path)
|
conn = sqlite3.connect(db_path)
|
||||||
conn.execute("PRAGMA journal_mode=WAL")
|
try:
|
||||||
df = pd.read_parquet(recipes_path)
|
conn.execute("PRAGMA journal_mode=WAL")
|
||||||
inserted = 0
|
|
||||||
batch = []
|
|
||||||
|
|
||||||
for _, row in df.iterrows():
|
# Pre-load ingredient element profiles to avoid N+1 queries
|
||||||
raw_ingredients = row.get("RecipeIngredientParts", [])
|
profile_index: dict[str, list[str]] = {}
|
||||||
if isinstance(raw_ingredients, str):
|
for row in conn.execute("SELECT name, elements FROM ingredient_profiles"):
|
||||||
try:
|
try:
|
||||||
raw_ingredients = json.loads(raw_ingredients)
|
profile_index[row[0]] = json.loads(row[1])
|
||||||
except Exception:
|
except Exception:
|
||||||
raw_ingredients = [raw_ingredients]
|
pass
|
||||||
raw_ingredients = [str(i) for i in (raw_ingredients or [])]
|
|
||||||
ingredient_names = extract_ingredient_names(raw_ingredients)
|
|
||||||
|
|
||||||
profiles = []
|
df = pd.read_parquet(recipes_path)
|
||||||
for name in ingredient_names:
|
inserted = 0
|
||||||
row_p = conn.execute(
|
batch = []
|
||||||
"SELECT elements FROM ingredient_profiles WHERE name = ?", (name,)
|
|
||||||
).fetchone()
|
|
||||||
if row_p:
|
|
||||||
profiles.append({"elements": json.loads(row_p[0])})
|
|
||||||
coverage = compute_element_coverage(profiles)
|
|
||||||
|
|
||||||
directions = row.get("RecipeInstructions", [])
|
for _, row in df.iterrows():
|
||||||
if isinstance(directions, str):
|
raw_ingredients = row.get("RecipeIngredientParts", [])
|
||||||
try:
|
if isinstance(raw_ingredients, str):
|
||||||
directions = json.loads(directions)
|
try:
|
||||||
except Exception:
|
raw_ingredients = json.loads(raw_ingredients)
|
||||||
directions = [directions]
|
except Exception:
|
||||||
|
raw_ingredients = [raw_ingredients]
|
||||||
|
raw_ingredients = [str(i) for i in (raw_ingredients or [])]
|
||||||
|
ingredient_names = extract_ingredient_names(raw_ingredients)
|
||||||
|
|
||||||
batch.append((
|
profiles = []
|
||||||
str(row.get("RecipeId", "")),
|
for name in ingredient_names:
|
||||||
str(row.get("Name", ""))[:500],
|
if name in profile_index:
|
||||||
json.dumps(raw_ingredients),
|
profiles.append({"elements": profile_index[name]})
|
||||||
json.dumps(ingredient_names),
|
coverage = compute_element_coverage(profiles)
|
||||||
json.dumps([str(d) for d in (directions or [])]),
|
|
||||||
str(row.get("RecipeCategory", "") or ""),
|
|
||||||
json.dumps(list(row.get("Keywords", []) or [])),
|
|
||||||
float(row.get("Calories") or 0) or None,
|
|
||||||
float(row.get("FatContent") or 0) or None,
|
|
||||||
float(row.get("ProteinContent") or 0) or None,
|
|
||||||
float(row.get("SodiumContent") or 0) or None,
|
|
||||||
json.dumps(coverage),
|
|
||||||
))
|
|
||||||
|
|
||||||
if len(batch) >= batch_size:
|
directions = row.get("RecipeInstructions", [])
|
||||||
|
if isinstance(directions, str):
|
||||||
|
try:
|
||||||
|
directions = json.loads(directions)
|
||||||
|
except Exception:
|
||||||
|
directions = [directions]
|
||||||
|
|
||||||
|
batch.append((
|
||||||
|
str(row.get("RecipeId", "")),
|
||||||
|
str(row.get("Name", ""))[:500],
|
||||||
|
json.dumps(raw_ingredients),
|
||||||
|
json.dumps(ingredient_names),
|
||||||
|
json.dumps([str(d) for d in (directions or [])]),
|
||||||
|
str(row.get("RecipeCategory", "") or ""),
|
||||||
|
json.dumps(list(row.get("Keywords", []) or [])),
|
||||||
|
float(row.get("Calories") or 0) or None,
|
||||||
|
float(row.get("FatContent") or 0) or None,
|
||||||
|
float(row.get("ProteinContent") or 0) or None,
|
||||||
|
float(row.get("SodiumContent") or 0) or None,
|
||||||
|
json.dumps(coverage),
|
||||||
|
))
|
||||||
|
|
||||||
|
if len(batch) >= batch_size:
|
||||||
|
before = conn.total_changes
|
||||||
|
conn.executemany("""
|
||||||
|
INSERT OR IGNORE INTO recipes
|
||||||
|
(external_id, title, ingredients, ingredient_names, directions,
|
||||||
|
category, keywords, calories, fat_g, protein_g, sodium_mg, element_coverage)
|
||||||
|
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
|
||||||
|
""", batch)
|
||||||
|
conn.commit()
|
||||||
|
inserted += conn.total_changes - before
|
||||||
|
print(f" {inserted} recipes inserted...")
|
||||||
|
batch = []
|
||||||
|
|
||||||
|
if batch:
|
||||||
|
before = conn.total_changes
|
||||||
conn.executemany("""
|
conn.executemany("""
|
||||||
INSERT OR IGNORE INTO recipes
|
INSERT OR IGNORE INTO recipes
|
||||||
(external_id, title, ingredients, ingredient_names, directions,
|
(external_id, title, ingredients, ingredient_names, directions,
|
||||||
|
|
@ -109,21 +130,11 @@ def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
|
||||||
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
|
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
|
||||||
""", batch)
|
""", batch)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
inserted += len(batch)
|
inserted += conn.total_changes - before
|
||||||
print(f" {inserted} recipes inserted...")
|
|
||||||
batch = []
|
|
||||||
|
|
||||||
if batch:
|
|
||||||
conn.executemany("""
|
|
||||||
INSERT OR IGNORE INTO recipes
|
|
||||||
(external_id, title, ingredients, ingredient_names, directions,
|
|
||||||
category, keywords, calories, fat_g, protein_g, sodium_mg, element_coverage)
|
|
||||||
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
|
|
||||||
""", batch)
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
inserted += len(batch)
|
finally:
|
||||||
|
conn.close()
|
||||||
conn.close()
|
|
||||||
print(f"Total: {inserted} recipes inserted")
|
print(f"Total: {inserted} recipes inserted")
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -5,8 +5,7 @@ GPL-3.0 source -- derived annotations only, raw pairs not shipped.
|
||||||
Usage:
|
Usage:
|
||||||
conda run -n job-seeker python scripts/pipeline/derive_substitutions.py \
|
conda run -n job-seeker python scripts/pipeline/derive_substitutions.py \
|
||||||
--db /path/to/kiwi.db \
|
--db /path/to/kiwi.db \
|
||||||
--recipepairs data/recipepairs.parquet \
|
--recipepairs data/recipepairs.parquet
|
||||||
--recipes data/recipes_foodcom.parquet
|
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
import argparse
|
import argparse
|
||||||
|
|
@ -31,72 +30,73 @@ def diff_ingredients(base: list[str], target: list[str]) -> tuple[list[str], lis
|
||||||
return removed, added
|
return removed, added
|
||||||
|
|
||||||
|
|
||||||
def build(db_path: Path, recipepairs_path: Path, recipes_path: Path) -> None:
|
def build(db_path: Path, recipepairs_path: Path) -> None:
|
||||||
conn = sqlite3.connect(db_path)
|
conn = sqlite3.connect(db_path)
|
||||||
|
try:
|
||||||
|
print("Loading recipe ingredient index...")
|
||||||
|
recipe_ingredients: dict[str, list[str]] = {}
|
||||||
|
for row in conn.execute("SELECT external_id, ingredient_names FROM recipes"):
|
||||||
|
recipe_ingredients[str(row[0])] = json.loads(row[1])
|
||||||
|
|
||||||
print("Loading recipe ingredient index...")
|
df = pd.read_parquet(recipepairs_path)
|
||||||
recipe_ingredients: dict[str, list[str]] = {}
|
pair_counts: dict[tuple, dict] = defaultdict(lambda: {"count": 0})
|
||||||
for row in conn.execute("SELECT external_id, ingredient_names FROM recipes"):
|
|
||||||
recipe_ingredients[str(row[0])] = json.loads(row[1])
|
|
||||||
|
|
||||||
df = pd.read_parquet(recipepairs_path)
|
print("Diffing recipe pairs...")
|
||||||
pair_counts: dict[tuple, dict] = defaultdict(lambda: {"count": 0})
|
for _, row in df.iterrows():
|
||||||
|
base_id = str(row.get("base", ""))
|
||||||
|
target_id = str(row.get("target", ""))
|
||||||
|
base_ings = recipe_ingredients.get(base_id, [])
|
||||||
|
target_ings = recipe_ingredients.get(target_id, [])
|
||||||
|
if not base_ings or not target_ings:
|
||||||
|
continue
|
||||||
|
|
||||||
print("Diffing recipe pairs...")
|
removed, added = diff_ingredients(base_ings, target_ings)
|
||||||
for _, row in df.iterrows():
|
if len(removed) != 1 or len(added) != 1:
|
||||||
base_id = str(row.get("base", ""))
|
continue
|
||||||
target_id = str(row.get("target", ""))
|
|
||||||
base_ings = recipe_ingredients.get(base_id, [])
|
|
||||||
target_ings = recipe_ingredients.get(target_id, [])
|
|
||||||
if not base_ings or not target_ings:
|
|
||||||
continue
|
|
||||||
|
|
||||||
removed, added = diff_ingredients(base_ings, target_ings)
|
original = removed[0]
|
||||||
if len(removed) != 1 or len(added) != 1:
|
substitute = added[0]
|
||||||
continue
|
constraints = [c for c in CONSTRAINT_COLS if row.get(c, 0)]
|
||||||
|
for constraint in constraints:
|
||||||
|
key = (original, substitute, constraint)
|
||||||
|
pair_counts[key]["count"] += 1
|
||||||
|
|
||||||
original = removed[0]
|
def get_profile(name: str) -> dict:
|
||||||
substitute = added[0]
|
row = conn.execute(
|
||||||
constraints = [c for c in CONSTRAINT_COLS if row.get(c, 0)]
|
"SELECT fat_pct, moisture_pct, glutamate_mg, protein_pct "
|
||||||
for constraint in constraints:
|
"FROM ingredient_profiles WHERE name = ?", (name,)
|
||||||
key = (original, substitute, constraint)
|
).fetchone()
|
||||||
pair_counts[key]["count"] += 1
|
if row:
|
||||||
|
return {"fat": row[0] or 0, "moisture": row[1] or 0,
|
||||||
|
"glutamate": row[2] or 0, "protein": row[3] or 0}
|
||||||
|
return {"fat": 0, "moisture": 0, "glutamate": 0, "protein": 0}
|
||||||
|
|
||||||
def get_profile(name: str) -> dict:
|
print("Writing substitution pairs...")
|
||||||
row = conn.execute(
|
inserted = 0
|
||||||
"SELECT fat_pct, moisture_pct, glutamate_mg, protein_pct "
|
for (original, substitute, constraint), data in pair_counts.items():
|
||||||
"FROM ingredient_profiles WHERE name = ?", (name,)
|
if data["count"] < 3:
|
||||||
).fetchone()
|
continue
|
||||||
if row:
|
p_orig = get_profile(original)
|
||||||
return {"fat": row[0] or 0, "moisture": row[1] or 0,
|
p_sub = get_profile(substitute)
|
||||||
"glutamate": row[2] or 0, "protein": row[3] or 0}
|
conn.execute("""
|
||||||
return {"fat": 0, "moisture": 0, "glutamate": 0, "protein": 0}
|
INSERT OR REPLACE INTO substitution_pairs
|
||||||
|
(original_name, substitute_name, constraint_label,
|
||||||
|
fat_delta, moisture_delta, glutamate_delta, protein_delta,
|
||||||
|
occurrence_count, source)
|
||||||
|
VALUES (?,?,?,?,?,?,?,?,?)
|
||||||
|
""", (
|
||||||
|
original, substitute, constraint,
|
||||||
|
round(p_sub["fat"] - p_orig["fat"], 2),
|
||||||
|
round(p_sub["moisture"] - p_orig["moisture"], 2),
|
||||||
|
round(p_sub["glutamate"] - p_orig["glutamate"], 2),
|
||||||
|
round(p_sub["protein"] - p_orig["protein"], 2),
|
||||||
|
data["count"], "derived",
|
||||||
|
))
|
||||||
|
inserted += 1
|
||||||
|
|
||||||
print("Writing substitution pairs...")
|
conn.commit()
|
||||||
inserted = 0
|
finally:
|
||||||
for (original, substitute, constraint), data in pair_counts.items():
|
conn.close()
|
||||||
if data["count"] < 3:
|
|
||||||
continue
|
|
||||||
p_orig = get_profile(original)
|
|
||||||
p_sub = get_profile(substitute)
|
|
||||||
conn.execute("""
|
|
||||||
INSERT OR REPLACE INTO substitution_pairs
|
|
||||||
(original_name, substitute_name, constraint_label,
|
|
||||||
fat_delta, moisture_delta, glutamate_delta, protein_delta,
|
|
||||||
occurrence_count, source)
|
|
||||||
VALUES (?,?,?,?,?,?,?,?,?)
|
|
||||||
""", (
|
|
||||||
original, substitute, constraint,
|
|
||||||
round(p_sub["fat"] - p_orig["fat"], 2),
|
|
||||||
round(p_sub["moisture"] - p_orig["moisture"], 2),
|
|
||||||
round(p_sub["glutamate"] - p_orig["glutamate"], 2),
|
|
||||||
round(p_sub["protein"] - p_orig["protein"], 2),
|
|
||||||
data["count"], "derived",
|
|
||||||
))
|
|
||||||
inserted += 1
|
|
||||||
|
|
||||||
conn.commit()
|
|
||||||
conn.close()
|
|
||||||
print(f"Inserted {inserted} substitution pairs (min 3 occurrences)")
|
print(f"Inserted {inserted} substitution pairs (min 3 occurrences)")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -104,6 +104,5 @@ if __name__ == "__main__":
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--db", required=True, type=Path)
|
parser.add_argument("--db", required=True, type=Path)
|
||||||
parser.add_argument("--recipepairs", required=True, type=Path)
|
parser.add_argument("--recipepairs", required=True, type=Path)
|
||||||
parser.add_argument("--recipes", required=True, type=Path)
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
build(args.db, args.recipepairs, args.recipes)
|
build(args.db, args.recipepairs)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue