#!/usr/bin/env python3 """ Backfill texture_profile in ingredient_profiles from existing macro data. Texture categories and their macro signatures (all values g/100g): fatty - fat > 60 (oils, lard, pure butter) creamy - fat 15-60 (cream, cheese, fatty meats, nut butter) firm - protein > 15, fat < 15 (lean meats, fish, legumes, firm tofu) starchy - carbs > 40, fat < 10 (flour, oats, rice, bread, potatoes) fibrous - fiber > 4, carbs < 40 (brassicas, leafy greens, whole grains) tender - protein 2-15, fat < 10, (soft veg, eggs, soft tofu, cooked beans) carbs < 40 liquid - calories < 25, fat < 1, (broth, juice, dilute sauces) protein < 3 neutral - fallthrough default Rules are applied in priority order: fatty → creamy → firm → starchy → fibrous → tender → liquid → neutral. Run: python scripts/backfill_texture_profiles.py [path/to/kiwi.db] Or inside the container: docker exec kiwi-cloud-api-1 python /app/kiwi/scripts/backfill_texture_profiles.py """ from __future__ import annotations import sqlite3 import sys from pathlib import Path # Default DB paths to try _DEFAULT_PATHS = [ "/devl/kiwi-cloud-data/local-dev/kiwi.db", "/devl/kiwi-data/kiwi.db", ] BATCH_SIZE = 5_000 def _classify(fat: float, protein: float, carbs: float, fiber: float, calories: float) -> str: # Cap runaway values — data quality issue in some branded entries fat = min(fat or 0.0, 100.0) protein = min(protein or 0.0, 100.0) carbs = min(carbs or 0.0, 100.0) fiber = min(fiber or 0.0, 50.0) calories = min(calories or 0.0, 900.0) if fat > 60: return "fatty" if fat > 15: return "creamy" # Starchy before firm: oats/legumes have high protein AND high carbs — carbs win if carbs > 40 and fat < 10: return "starchy" # Firm: lean proteins with low carbs (meats, fish, hard tofu) # Lower protein threshold (>7) catches tofu (9%) and similar plant proteins if protein > 7 and fat < 12 and carbs < 20: return "firm" if fiber > 4 and carbs < 40: return "fibrous" if 2 < protein <= 15 and fat < 10 and carbs < 40: return "tender" if calories < 25 and fat < 1 and protein < 3: return "liquid" return "neutral" def backfill(db_path: str) -> None: conn = sqlite3.connect(db_path) conn.row_factory = sqlite3.Row total = conn.execute("SELECT COUNT(*) FROM ingredient_profiles").fetchone()[0] print(f"Total rows: {total:,}") updated = 0 offset = 0 counts: dict[str, int] = {} while True: rows = conn.execute( """SELECT id, fat_pct, protein_pct, carbs_g_per_100g, fiber_g_per_100g, calories_per_100g FROM ingredient_profiles LIMIT ? OFFSET ?""", (BATCH_SIZE, offset), ).fetchall() if not rows: break batch: list[tuple[str, int]] = [] for row in rows: texture = _classify( row["fat_pct"], row["protein_pct"], row["carbs_g_per_100g"], row["fiber_g_per_100g"], row["calories_per_100g"], ) counts[texture] = counts.get(texture, 0) + 1 batch.append((texture, row["id"])) conn.executemany( "UPDATE ingredient_profiles SET texture_profile = ? WHERE id = ?", batch, ) conn.commit() updated += len(batch) offset += BATCH_SIZE print(f" {updated:,} / {total:,} updated...", end="\r") print(f"\nDone. {updated:,} rows updated.\n") print("Texture distribution:") for texture, count in sorted(counts.items(), key=lambda x: -x[1]): pct = count / updated * 100 print(f" {texture:10s} {count:8,} ({pct:.1f}%)") conn.close() if __name__ == "__main__": if len(sys.argv) > 1: path = sys.argv[1] else: path = next((p for p in _DEFAULT_PATHS if Path(p).exists()), None) if not path: print(f"No DB found. Pass path as argument or create one of: {_DEFAULT_PATHS}") sys.exit(1) print(f"Backfilling texture profiles in: {path}") backfill(path)