- cloud_session.py: CLOUD_AUTH_BYPASS_IPS with CIDR support; X-Real-IP for Docker bridge NAT-aware client IP resolution; local-dev DB path under CLOUD_DATA_ROOT for bypass sessions - compose.cloud.yml: thread CLOUD_AUTH_BYPASS_IPS from shell env; document Docker bridge CIDR requirement in .env.example - nginx.cloud.conf + nginx.conf: client_max_body_size 20m for barcode uploads - barcode_scanner.py: EXIF orientation correction (PIL ImageOps.exif_transpose) before cv2 decode; rotation coverage extended to [90, 180, 270, 45, 135] to catch sideways barcodes the 270° case was missing - llm_recipe.py: CF-core VRAM lease acquire/release wrapping LLMRouter calls - tasks/runner.py + config.py: COORDINATOR_URL + recipe_llm VRAM budget (4GB) - recipes.py: per-request Store creation inside asyncio.to_thread worker to avoid SQLite check_same_thread violations - download_datasets.py: HF_PARQUET_FILES strategy for repos without dataset builders (lishuyang/recipepairs direct parquet download) - derive_substitutions.py: use recipepairs_recipes.parquet for ingredient lookup; numpy array detection; JSON category parsing - test_build_flavorgraph_index.py: rewritten for CSV-based index format - pyproject.toml: add Pillow>=10.0 for EXIF rotation support
134 lines
5.3 KiB
Python
134 lines
5.3 KiB
Python
"""
|
|
Derive substitution pairs by diffing lishuyang/recipepairs.
|
|
GPL-3.0 source -- derived annotations only, raw pairs not shipped.
|
|
|
|
Usage:
|
|
PYTHONPATH=/path/to/kiwi conda run -n cf python scripts/pipeline/derive_substitutions.py \
|
|
--db /path/to/kiwi.db \
|
|
--recipepairs data/pipeline/recipepairs.parquet \
|
|
--recipepairs-recipes data/pipeline/recipepairs_recipes.parquet
|
|
"""
|
|
from __future__ import annotations

import argparse
import ast
import json
import re
import sqlite3
from collections import defaultdict
from pathlib import Path

import pandas as pd
|
|
|
|
|
|
def diff_ingredients(base: list[str], target: list[str]) -> tuple[list[str], list[str]]:
    """Return the ingredients removed from and added to *base* to obtain *target*.

    Args:
        base: Ingredient names of the base recipe.
        target: Ingredient names of the target recipe.

    Returns:
        A ``(removed, added)`` tuple. ``removed`` holds the ingredients that
        occur in *base* but not in *target*; ``added`` holds those that occur
        in *target* but not in *base*. Each list is de-duplicated and keeps
        the first-seen order of its input, so the result is deterministic.
    """
    base_set = set(base)
    target_set = set(target)
    # dict.fromkeys() de-duplicates while preserving insertion order; iterating
    # a plain set difference would yield an order that changes between runs
    # because string hashing is randomised per process.
    removed = [item for item in dict.fromkeys(base) if item not in target_set]
    added = [item for item in dict.fromkeys(target) if item not in base_set]
    return removed, added
|
|
|
|
|
|
def _parse_categories(val: object) -> list[str]:
    """Normalise a ``categories`` cell into a list of strings.

    Accepted shapes:
      * a list, tuple, set or frozenset -- every element is stringified;
      * an array-like object exposing ``tolist()`` (e.g. a numpy array) --
        converted with ``tolist()`` and then treated as above;
      * a string holding a JSON list (``["a", "b"]``) or a Python list repr
        (``['a', 'b']``) -- parsed and every element stringified;
      * any other non-empty string -- returned as a single-element list.

    Args:
        val: Raw cell value.

    Returns:
        The category labels. Empty when *val* is ``None``, an empty or
        whitespace-only string, or an unsupported type.
    """
    if not isinstance(val, str):
        # Array-like containers are unwrapped to plain Python objects first.
        to_list = getattr(val, "tolist", None)
        if callable(to_list):
            val = to_list()

    if isinstance(val, (list, tuple, set, frozenset)):
        return [str(v) for v in val]
    if not isinstance(val, str):
        return []

    text = val.strip()
    if not text:
        return []

    if text.startswith("["):
        # Try strict JSON first so apostrophes inside double-quoted items stay
        # intact, then a Python literal (single-quoted repr), and only as a
        # last resort the lossy quote replacement.
        parsers = (
            json.loads,
            ast.literal_eval,
            lambda s: json.loads(s.replace("'", '"')),
        )
        for parse in parsers:
            try:
                parsed = parse(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, (list, tuple)):
                return [str(v) for v in parsed]

    # Not a parseable list: treat the whole string as one label.
    return [text]
|
|
|
|
|
|
def build(
    db_path: Path,
    recipepairs_path: Path,
    recipes_path: Path,
    *,
    min_occurrences: int = 3,
) -> None:
    """Derive substitution pairs from recipepairs and write them to SQLite.

    For every (base, target) recipe pair whose ingredient lists differ by
    exactly one removed and one added ingredient, the swap is counted once per
    category label attached to the pair. Swaps seen at least
    *min_occurrences* times are written to ``substitution_pairs`` together
    with the difference (substitute minus original) of the fat, moisture,
    glutamate and protein values read from ``ingredient_profiles``.

    Args:
        db_path: SQLite database to read profiles from and write pairs to.
        recipepairs_path: Parquet file with ``base`` and ``target`` recipe ids
            and, optionally, a ``categories`` column.
        recipes_path: Parquet file with ``id`` and ``ingredients`` columns.
        min_occurrences: Minimum number of times a (original, substitute,
            constraint) triple must be observed to be written.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Load ingredient lists from the bundled recipepairs recipe corpus.
        # This is GPL-3.0 data — we only use it for diffing; raw data is not persisted.
        print("Loading recipe ingredient index from recipepairs corpus...")
        recipes_df = pd.read_parquet(recipes_path, columns=["id", "ingredients"])
        recipe_ingredients: dict[str, list[str]] = {}
        # Walk the two columns in lockstep instead of DataFrame.iterrows(),
        # which materialises a Series object for every row.
        for recipe_id, ings in zip(recipes_df["id"], recipes_df["ingredients"]):
            # Accept any non-string iterable; skip missing cells.
            if ings is not None and hasattr(ings, "__iter__") and not isinstance(ings, str):
                recipe_ingredients[str(int(recipe_id))] = [str(i) for i in ings]
        print(f" {len(recipe_ingredients)} recipes loaded")

        pairs_df = pd.read_parquet(recipepairs_path)
        if "categories" in pairs_df.columns:
            category_cells = pairs_df["categories"]
        else:
            # Without the column every pair has no constraint and is skipped.
            category_cells = ([] for _ in range(len(pairs_df)))
        pair_counts: dict[tuple[str, str, str], int] = defaultdict(int)

        print("Diffing recipe pairs...")
        for base, target, categories in zip(
            pairs_df["base"], pairs_df["target"], category_cells
        ):
            base_ings = recipe_ingredients.get(str(int(base)))
            target_ings = recipe_ingredients.get(str(int(target)))
            if not base_ings or not target_ings:
                continue

            # Only a one-for-one swap is an unambiguous substitution.
            removed, added = diff_ingredients(base_ings, target_ings)
            if len(removed) != 1 or len(added) != 1:
                continue

            original, substitute = removed[0], added[0]
            # An empty category list simply contributes nothing.
            for constraint in _parse_categories(categories):
                pair_counts[(original, substitute, constraint)] += 1

        # The same ingredient appears in many pairs; query each name once.
        profile_cache: dict[str, dict[str, float]] = {}

        def get_profile(name: str) -> dict[str, float]:
            """Return the stored profile for *name*, all zeros when unknown."""
            profile = profile_cache.get(name)
            if profile is None:
                row = conn.execute(
                    "SELECT fat_pct, moisture_pct, glutamate_mg, protein_pct "
                    "FROM ingredient_profiles WHERE name = ?", (name,)
                ).fetchone()
                fat, moisture, glutamate, protein = row or (0, 0, 0, 0)
                # NULL columns are treated as 0.
                profile = {
                    "fat": fat or 0,
                    "moisture": moisture or 0,
                    "glutamate": glutamate or 0,
                    "protein": protein or 0,
                }
                profile_cache[name] = profile
            return profile

        print("Writing substitution pairs...")
        rows = []
        for (original, substitute, constraint), count in pair_counts.items():
            if count < min_occurrences:
                continue
            p_orig = get_profile(original)
            p_sub = get_profile(substitute)
            rows.append((
                original, substitute, constraint,
                round(p_sub["fat"] - p_orig["fat"], 2),
                round(p_sub["moisture"] - p_orig["moisture"], 2),
                round(p_sub["glutamate"] - p_orig["glutamate"], 2),
                round(p_sub["protein"] - p_orig["protein"], 2),
                count, "derived",
            ))

        conn.executemany("""
            INSERT OR REPLACE INTO substitution_pairs
            (original_name, substitute_name, constraint_label,
            fat_delta, moisture_delta, glutamate_delta, protein_delta,
            occurrence_count, source)
            VALUES (?,?,?,?,?,?,?,?,?)
        """, rows)
        inserted = len(rows)

        conn.commit()
    finally:
        conn.close()
    print(f"Inserted {inserted} substitution pairs (min {min_occurrences} occurrences)")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: all three paths are mandatory.
    cli = argparse.ArgumentParser()
    cli.add_argument("--db", type=Path, required=True)
    cli.add_argument(
        "--recipepairs",
        type=Path,
        required=True,
        help="pairs.parquet from lishuyang/recipepairs",
    )
    cli.add_argument(
        "--recipepairs-recipes",
        dest="recipepairs_recipes",
        type=Path,
        required=True,
        help="recipes.parquet from lishuyang/recipepairs (ingredient lookup)",
    )
    options = cli.parse_args()
    build(options.db, options.recipepairs, options.recipepairs_recipes)
|