kiwi/app/services/recipe/recipe_scanner.py
pyr0ball 896b4e048c feat: recipe scanner — photo to structured recipe (kiwi#9)
New feature: photograph a recipe card, cookbook page, or handwritten
note and have it extracted into a structured, editable recipe.

Backend:
- POST /recipes/scan: accept 1-4 photos, run VLM extraction, return
  structured JSON for review (not auto-saved)
- POST /recipes/scan/save: persist a reviewed/edited recipe
- GET/DELETE /recipes/user: list and delete user-created recipes
- Vision backend priority: cf-orch -> local Qwen2.5-VL -> Anthropic BYOK
- 503 with clear config hint when no vision backend available
- Multi-photo support: facing pages (ingredients/directions) sent together
- Pantry cross-reference: marks which ingredients are already on hand
- migration 041: user_recipes table (title, servings, cook_time, steps,
  ingredients JSON, source, pantry_match_pct)
- Tier gate: recipe_scan -> paid, BYOK-unlockable

Frontend:
- "Scan" button in the Recipes tab bar (camera icon)
- RecipeScanModal: upload step (drag-drop + file picker, up to 4 photos,
  live previews), processing step (spinner), review/edit step (all
  fields inline-editable before save), pantry match badge, warning banner
  for low-confidence or incomplete scans
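
A minimal client-side sketch of the scan-then-save flow (endpoint paths as
listed above; the "photos" field name and base URL are assumptions, not the
actual API contract):

    import requests

    BASE = "http://localhost:8000"  # hypothetical local instance

    # Step 1: upload photos; the response is structured JSON for review,
    # not yet saved.
    with open("card.jpg", "rb") as f:
        scan = requests.post(f"{BASE}/recipes/scan", files={"photos": f}).json()

    # Step 2: after the user reviews/edits the fields, persist the recipe.
    requests.post(f"{BASE}/recipes/scan/save", json=scan)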

Tests: 35 new tests (23 unit + 12 API), 404 total passing
2026-04-27 08:23:01 -07:00


"""Recipe scanner service (kiwi#9).
Extracts structured recipe data from one or more photos of recipe cards,
cookbook pages, or handwritten notes.
Pipeline:
photo(s) -> EXIF correction -> VLM extraction -> JSON parse -> pantry cross-ref
Vision backend priority (mirrors receipt OCR pattern):
1. cf-orch vision service (if CF_ORCH_URL set)
2. Local Qwen2.5-VL (if GPU available)
3. Anthropic API (BYOK -- if ANTHROPIC_API_KEY set)
BSL 1.1 -- requires Paid tier or BYOK.
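
Example (illustrative paths and pantry list; see RecipeScanner.scan):

    scanner = RecipeScanner()
    result = scanner.scan(
        [Path("ingredients.jpg"), Path("directions.jpg")],
        pantry_names=["broccoli", "eggs"],
    )
    print(result.title, f"{result.pantry_match_pct}% of ingredients on hand")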
"""
from __future__ import annotations

import base64
import io
import json
import logging
import os
import re
from dataclasses import dataclass
from pathlib import Path

logger = logging.getLogger(__name__)

# Maximum number of photos per scan call (to limit VLM context / VRAM)
MAX_IMAGES = 4

# VLM prompt -- adapted from tests/fixtures/recipe_scan/extract_test.py
_EXTRACTION_PROMPT = """
You are extracting a recipe from a photograph of a recipe card, cookbook page, or handwritten note.
If two or more images are provided, treat them as a single recipe across multiple pages
(e.g. ingredients on page 1, directions on page 2).

Return a single JSON object with these fields:
- title: recipe name (string)
- subtitle: any secondary title or serving suggestion, e.g. "with Broccoli & Ranch Dressing" (string or null)
- servings: serving size if shown, as a string, e.g. "2", "4-6" (string or null)
- cook_time: total cook time if shown, e.g. "15 min", "1 hour" (string or null)
- source_note: any attribution text like "From Betty Crocker" or "Purple Carrot" (string or null)
- ingredients: array of ingredient objects, each with:
  - name: normalized generic ingredient name, lowercase, no quantities, no brand names
    (e.g. "Follow Your Heart Vegan Ranch" becomes "ranch dressing")
  - qty: quantity as a string, preserving fractions, e.g. "1/2", "¼" (string or null)
  - unit: unit of measure, null for countable items (e.g. "3 eggs" has unit: null)
  - raw: the original ingredient line verbatim, exactly as it appears
- steps: ordered array of instruction strings, one distinct step per element
- notes: any tips, substitutions, storage instructions, or variations (string or null)
- confidence: "high" if text is clear and complete, "medium" if some parts are uncertain,
  "low" if mostly handwritten or significantly degraded
- warnings: array of strings describing anything the user should double-check
  (e.g. "Directions appear to continue on another page not shown")

Return only valid JSON. No markdown fences. No explanation outside the JSON.
If the image does not appear to be a recipe at all, return: {"error": "not_a_recipe"}
""".strip()

# ── Data types ─────────────────────────────────────────────────────────────────


@dataclass
class ScannedIngredient:
    name: str
    qty: str | None = None
    unit: str | None = None
    raw: str | None = None
    in_pantry: bool = False


@dataclass
class ScannedRecipeResult:
    title: str | None
    subtitle: str | None
    servings: str | None
    cook_time: str | None
    source_note: str | None
    ingredients: list[ScannedIngredient]
    steps: list[str]
    notes: str | None
    tags: list[str]
    pantry_match_pct: int
    confidence: str
    warnings: list[str]

# ── Image helpers ──────────────────────────────────────────────────────────────


def _load_image_b64(path: Path) -> str:
    """Load image, apply EXIF rotation, return base64-encoded JPEG bytes."""
    from PIL import Image, ImageOps

    with open(path, "rb") as f:
        raw = f.read()
    img = Image.open(io.BytesIO(raw))
    img = ImageOps.exif_transpose(img).convert("RGB")
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=90)
    return base64.b64encode(buf.getvalue()).decode()

# ── Vision backend ─────────────────────────────────────────────────────────────


def _call_via_anthropic(image_paths: list[Path], prompt: str) -> str:
    """Send image(s) + prompt to Anthropic API. Raises RuntimeError if unavailable."""
    try:
        import anthropic
    except ImportError as exc:
        raise RuntimeError("anthropic package not installed") from exc
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError("ANTHROPIC_API_KEY not set")
    client = anthropic.Anthropic(api_key=api_key)
    content: list[dict] = []
    for i, path in enumerate(image_paths):
        if i > 0:
            content.append({"type": "text", "text": f"(Page {i + 1} of the same recipe:)"})
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": _load_image_b64(path),
            },
        })
    content.append({"type": "text", "text": prompt})
    msg = client.messages.create(
        # Haiku is cost-efficient for well-structured extraction prompts
        model="claude-haiku-4-5-20251001",
        max_tokens=2048,
        messages=[{"role": "user", "content": content}],
    )
    return msg.content[0].text.strip()
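
# For a two-page scan, the content list built above ends up as (illustrative):
#   [{image page 1},
#    {"type": "text", "text": "(Page 2 of the same recipe:)"},
#    {image page 2},
#    {"type": "text", "text": prompt}]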

def _call_via_local_vlm(image_paths: list[Path], prompt: str) -> str:
    """Send image(s) + prompt to local Qwen2.5-VL. Raises RuntimeError if unavailable."""
    try:
        import torch
    except ImportError as exc:
        raise RuntimeError("torch not installed") from exc
    if not torch.cuda.is_available():
        raise RuntimeError("No CUDA device -- local VLM unavailable")
    # Lazy import so the module loads fast when GPU is absent
    from PIL import Image, ImageOps
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

    model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
    logger.info("Loading local VLM for recipe scan: %s", model_name)
    # Qwen2.5-VL ships its own model class; the Qwen2-VL class rejects these weights.
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    processor = AutoProcessor.from_pretrained(model_name)
    model.eval()  # inference mode
    images = []
    for path in image_paths:
        with open(path, "rb") as f:
            raw = f.read()
        img = Image.open(io.BytesIO(raw))
        img = ImageOps.exif_transpose(img).convert("RGB")
        images.append(img)
    # The processor expects image placeholder tokens in the text; build them
    # via the chat template rather than passing the bare prompt.
    messages = [{
        "role": "user",
        "content": [{"type": "image"} for _ in images] + [{"type": "text", "text": prompt}],
    }]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(images=images, text=text, return_tensors="pt")
    # Move to GPU; cast only floating tensors -- token ids must stay integral.
    inputs = {
        k: (v.to("cuda", torch.float16) if v.is_floating_point() else v.to("cuda"))
        if isinstance(v, torch.Tensor) else v
        for k, v in inputs.items()
    }
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=2048,
            do_sample=False,  # greedy decoding; no sampling temperature needed
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
    output = processor.decode(new_tokens[0], skip_special_tokens=True).strip()
    # Free VRAM
    del model
    torch.cuda.empty_cache()
    return output

def _call_vision_backend(image_paths: list[Path], prompt: str) -> str:
    """Dispatch to the best available vision backend.

    Priority: cf-orch vision -> local Qwen2.5-VL -> Anthropic API.
    Raises RuntimeError with a clear message when no backend is available.
    """
    errors: list[str] = []
    # 1. Try cf-orch vision allocation
    cf_orch_url = os.environ.get("CF_ORCH_URL")
    if cf_orch_url:
        try:
            from circuitforge_orch.client import CFOrchClient

            from app.services.ocr.docuvision_client import DocuvisionClient

            client = CFOrchClient(cf_orch_url)
            with client.allocate(
                service="cf-vision",
                model_candidates=["qwen2.5-vl-7b", "cf-docuvision"],
                ttl_s=90.0,
                caller="kiwi-recipe-scan",
            ) as alloc:
                if alloc is not None:
                    doc_client = DocuvisionClient(alloc.url)
                    # docuvision takes a single image -- use first image only for now
                    result = doc_client.extract_text(image_paths[0])
                    if result.text:
                        return result.text
        except Exception as exc:
            logger.debug("cf-orch vision failed for recipe scan: %s", exc)
            errors.append(f"cf-orch: {exc}")
    # 2. Try local Qwen2.5-VL
    try:
        return _call_via_local_vlm(image_paths, prompt)
    except Exception as exc:
        logger.debug("Local VLM unavailable for recipe scan: %s", exc)
        errors.append(f"local VLM: {exc}")
    # 3. Try Anthropic API (BYOK)
    try:
        return _call_via_anthropic(image_paths, prompt)
    except Exception as exc:
        logger.debug("Anthropic API failed for recipe scan: %s", exc)
        errors.append(f"Anthropic: {exc}")
    raise RuntimeError(
        "No vision backend configured for recipe scanning. "
        "Options: cf-orch (CF_ORCH_URL), local GPU, or ANTHROPIC_API_KEY (BYOK). "
        f"Errors: {'; '.join(errors)}"
    )
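
# Illustrative configuration -- any one of these enables recipe scanning:
#   CF_ORCH_URL=http://cf-orch:8080     # 1. orchestrated vision (URL is an example)
#   (CUDA GPU present)                  # 2. local Qwen2.5-VL
#   ANTHROPIC_API_KEY=sk-ant-...        # 3. Anthropic BYOK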

# ── Parsing helpers ────────────────────────────────────────────────────────────


def _normalize_ingredient_name(name: str) -> str:
    """Lowercase + strip whitespace. Preserves multi-word names as-is."""
    return name.lower().strip()


def _parse_scanner_json(raw_text: str) -> dict:
    """Extract and return the JSON dict from VLM output.

    Handles:
    - Pure JSON
    - JSON wrapped in ```json ... ``` markdown fences
    - JSON preceded by a line of prose ("Here is the recipe: {...}")

    Raises ValueError on not_a_recipe or unparseable output.
    """
    text = raw_text.strip()
    # Strip markdown fences if present
    if text.startswith("```"):
        parts = text.split("```")
        for part in parts:
            part = part.strip()
            if part.startswith("json"):
                part = part[4:].strip()
            if part.startswith("{"):
                text = part
                break
    # Try direct parse first
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Extract first JSON object embedded in prose
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse JSON from VLM output: {text[:200]!r}")
        try:
            data = json.loads(match.group(0))
        except json.JSONDecodeError as exc:
            raise ValueError(f"Could not parse JSON from VLM output: {exc}") from exc
    if isinstance(data, dict) and data.get("error") == "not_a_recipe":
        raise ValueError("not_a_recipe: image does not appear to contain a recipe")
    return data
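
# Inputs this parser is expected to handle (illustrative):
#   '{"title": "Pancakes"}'                          -> parsed directly
#   '```json\n{"title": "Pancakes"}\n```'            -> fences stripped
#   'Here is the recipe: {"title": "Pancakes"}'      -> object pulled from prose
#   '{"error": "not_a_recipe"}'                      -> raises ValueError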

# ── Pantry cross-reference ─────────────────────────────────────────────────────


def _cross_reference_pantry(
    ingredients: list[ScannedIngredient],
    pantry_names: list[str],
) -> tuple[list[ScannedIngredient], int]:
    """Mark ingredients found in the pantry and return updated list + match percent.

    Matching is bidirectional by token:
    - "broccoli florets" matches pantry item "broccoli" (pantry token in ingredient)
    - "pumpkin seeds" matches pantry "pumpkin seeds" (exact)

    Returns (updated_ingredients, pantry_match_pct).
    """
    if not ingredients:
        return ingredients, 0
    normalized_pantry = [_normalize_ingredient_name(p) for p in pantry_names]
    updated: list[ScannedIngredient] = []
    matched = 0
    for ingr in ingredients:
        norm_ingr = _normalize_ingredient_name(ingr.name)
        in_pantry = any(
            (p_tok in norm_ingr or norm_ingr in p_tok)
            for p in normalized_pantry
            for p_tok in p.split()
            if len(p_tok) >= 4  # skip short stop-words like "of", "and", "the"
        )
        updated.append(ScannedIngredient(
            name=ingr.name,
            qty=ingr.qty,
            unit=ingr.unit,
            raw=ingr.raw,
            in_pantry=in_pantry,
        ))
        if in_pantry:
            matched += 1
    pct = round(matched / len(ingredients) * 100)
    return updated, pct
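
# Illustrative: pantry ["Broccoli", "Vegan Ranch Dressing"] against ingredients
# ["broccoli florets", "ranch dressing", "tofu"] marks the first two
# in_pantry=True, so pantry_match_pct == round(2 / 3 * 100) == 67.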

# ── Main scanner class ─────────────────────────────────────────────────────────


class RecipeScanner:
    """Stateless recipe scanner. One instance can be reused across requests."""

    def scan(
        self,
        image_paths: list[Path],
        pantry_names: list[str] | None = None,
    ) -> ScannedRecipeResult:
        """Extract a structured recipe from one or more photos.

        Args:
            image_paths: 1-4 image files (phone photos, scans).
            pantry_names: Flat list of product names from user's inventory.
                Pass [] or None to skip pantry cross-reference.

        Returns:
            ScannedRecipeResult with all fields populated.

        Raises:
            ValueError: Image is not a recipe, or JSON could not be parsed.
            RuntimeError: No vision backend is configured.
        """
        if not image_paths:
            raise ValueError("At least one image is required")
        if len(image_paths) > MAX_IMAGES:
            raise ValueError(f"Maximum {MAX_IMAGES} images per scan (got {len(image_paths)})")
        # Call vision backend
        raw_text = _call_vision_backend(image_paths, _EXTRACTION_PROMPT)
        # Parse JSON from VLM output
        data = _parse_scanner_json(raw_text)
        # Build ingredient list
        raw_ingredients = data.get("ingredients") or []
        ingredients: list[ScannedIngredient] = [
            ScannedIngredient(
                name=str(item.get("name") or "").strip() or "unknown",
                qty=str(item["qty"]) if item.get("qty") is not None else None,
                unit=str(item["unit"]) if item.get("unit") is not None else None,
                raw=str(item["raw"]) if item.get("raw") is not None else None,
            )
            for item in raw_ingredients
            if isinstance(item, dict)
        ]
        # Pantry cross-reference
        ingredients, pct = _cross_reference_pantry(
            ingredients,
            pantry_names or [],
        )
        return ScannedRecipeResult(
            title=data.get("title") or None,
            subtitle=data.get("subtitle") or None,
            servings=str(data["servings"]) if data.get("servings") is not None else None,
            cook_time=str(data["cook_time"]) if data.get("cook_time") is not None else None,
            source_note=data.get("source_note") or None,
            ingredients=ingredients,
            steps=[str(s) for s in (data.get("steps") or []) if s],
            notes=data.get("notes") or None,
            tags=list(data.get("tags") or []),
            pantry_match_pct=pct,
            confidence=data.get("confidence") or "medium",
            warnings=list(data.get("warnings") or []),
        )