New feature: photograph a recipe card, cookbook page, or handwritten note and have it extracted into a structured, editable recipe.

Backend:
- POST /recipes/scan: accept 1-4 photos, run VLM extraction, return structured JSON for review (not auto-saved)
- POST /recipes/scan/save: persist a reviewed/edited recipe
- GET/DELETE /recipes/user: user-created recipe CRUD
- Vision backend priority: cf-orch -> local Qwen2.5-VL -> Anthropic BYOK
- 503 with clear config hint when no vision backend available
- Multi-photo support: facing pages (ingredients/directions) sent together
- Pantry cross-reference: marks which ingredients are already on hand
- Migration 041: user_recipes table (title, servings, cook_time, steps, ingredients JSON, source, pantry_match_pct)
- Tier gate: recipe_scan -> paid, BYOK-unlockable

Frontend:
- "Scan" button in the Recipes tab bar (camera icon)
- RecipeScanModal: upload step (drag-drop + file picker, up to 4 photos, live previews), processing step (spinner), review/edit step (all fields inline-editable before save), pantry match badge, warning banner for low-confidence or incomplete scans

Tests: 35 new tests (23 unit + 12 API), 404 total passing
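A minimal client-side sketch of the scan -> review -> save flow against the new endpoints. The base URL, auth header, multipart field name, and exact response fields are placeholders/assumptions; only the endpoint paths and the review-before-save behavior come from this change.

```python
import requests

BASE = "https://kiwi.example.com/api"          # placeholder host
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth

# 1. Upload 1-4 photos; the response is the extracted recipe for review, not yet saved.
#    (A 503 here means no vision backend is configured.)
with open("recipe_card.jpg", "rb") as f:
    scan = requests.post(
        f"{BASE}/recipes/scan",
        headers=HEADERS,
        files=[("photos", ("recipe_card.jpg", f, "image/jpeg"))],  # field name is a guess
    )
scan.raise_for_status()
draft = scan.json()  # title, servings, cook_time, ingredients, steps, warnings, ...

# 2. Let the user edit the draft, then persist it.
draft["title"] = draft.get("title") or "Untitled recipe"
saved = requests.post(f"{BASE}/recipes/scan/save", headers=HEADERS, json=draft)
saved.raise_for_status()
```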
"""Recipe scanner service (kiwi#9).
|
|
|
|
Extracts structured recipe data from one or more photos of recipe cards,
|
|
cookbook pages, or handwritten notes.
|
|
|
|
Pipeline:
|
|
photo(s) -> EXIF correction -> VLM extraction -> JSON parse -> pantry cross-ref
|
|
|
|
Vision backend priority (mirrors receipt OCR pattern):
|
|
1. cf-orch vision service (if CF_ORCH_URL set)
|
|
2. Local Qwen2.5-VL (if GPU available)
|
|
3. Anthropic API (BYOK -- if ANTHROPIC_API_KEY set)
|
|
|
|
BSL 1.1 -- requires Paid tier or BYOK.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import io
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Maximum number of photos per scan call (to limit VLM context / VRAM)
|
|
MAX_IMAGES = 4
|
|
|
|
# VLM prompt -- adapted from tests/fixtures/recipe_scan/extract_test.py
|
|
_EXTRACTION_PROMPT = """
|
|
You are extracting a recipe from a photograph of a recipe card, cookbook page, or handwritten note.
|
|
|
|
If two or more images are provided, treat them as a single recipe across multiple pages
|
|
(e.g. ingredients on page 1, directions on page 2).
|
|
|
|
Return a single JSON object with these fields:
|
|
- title: recipe name (string)
|
|
- subtitle: any secondary title or serving suggestion e.g. "with Broccoli & Ranch Dressing" (string or null)
|
|
- servings: serving size if shown, as a string e.g. "2", "4-6" (string or null)
|
|
- cook_time: total cook time if shown, e.g. "15 min", "1 hour" (string or null)
|
|
- source_note: any attribution text like "From Betty Crocker" or "Purple Carrot" (string or null)
|
|
- ingredients: array of ingredient objects, each with:
|
|
- name: normalized generic ingredient name, lowercase, no quantities, no brand names
|
|
(e.g. "Follow Your Heart Vegan Ranch" becomes "ranch dressing")
|
|
- qty: quantity as a string, preserving fractions e.g. "1/2", a quarter symbol (string or null)
|
|
- unit: unit of measure, null for countable items (e.g. "3 eggs" has unit: null)
|
|
- raw: the original ingredient line verbatim, exactly as it appears
|
|
- steps: ordered array of instruction strings, one distinct step per element
|
|
- notes: any tips, substitutions, storage instructions, or variations (string or null)
|
|
- confidence: "high" if text is clear and complete, "medium" if some parts are uncertain,
|
|
"low" if mostly handwritten or significantly degraded
|
|
- warnings: array of strings describing anything the user should double-check
|
|
(e.g. "Directions appear to continue on another page not shown")
|
|
|
|
Return only valid JSON. No markdown fences. No explanation outside the JSON.
|
|
If the image does not appear to be a recipe at all, return: {"error": "not_a_recipe"}
|
|
""".strip()

# ── Data types ─────────────────────────────────────────────────────────────────

@dataclass
class ScannedIngredient:
    name: str
    qty: str | None = None
    unit: str | None = None
    raw: str | None = None
    in_pantry: bool = False


@dataclass
class ScannedRecipeResult:
    title: str | None
    subtitle: str | None
    servings: str | None
    cook_time: str | None
    source_note: str | None
    ingredients: list[ScannedIngredient]
    steps: list[str]
    notes: str | None
    tags: list[str]
    pantry_match_pct: int
    confidence: str
    warnings: list[str]


# ── Image helpers ──────────────────────────────────────────────────────────────

def _load_image_b64(path: Path) -> str:
    """Load image, apply EXIF rotation, return base64-encoded JPEG bytes."""
    from PIL import Image, ImageOps

    with open(path, "rb") as f:
        raw = f.read()
    img = Image.open(io.BytesIO(raw))
    img = ImageOps.exif_transpose(img).convert("RGB")
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=90)
    return base64.b64encode(buf.getvalue()).decode()


# ── Vision backend ─────────────────────────────────────────────────────────────

def _call_via_anthropic(image_paths: list[Path], prompt: str) -> str:
    """Send image(s) + prompt to Anthropic API. Raises RuntimeError if unavailable."""
    try:
        import anthropic
    except ImportError as exc:
        raise RuntimeError("anthropic package not installed") from exc

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        raise RuntimeError("ANTHROPIC_API_KEY not set")

    client = anthropic.Anthropic(api_key=api_key)

    content: list[dict] = []
    for i, path in enumerate(image_paths):
        if i > 0:
            content.append({"type": "text", "text": f"(Page {i + 1} of the same recipe:)"})
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": "image/jpeg",
                "data": _load_image_b64(path),
            },
        })
    content.append({"type": "text", "text": prompt})

    msg = client.messages.create(
        # Haiku is cost-efficient for well-structured extraction prompts
        model="claude-haiku-4-5-20251001",
        max_tokens=2048,
        messages=[{"role": "user", "content": content}],
    )
    return msg.content[0].text.strip()
def _call_via_local_vlm(image_paths: list[Path], prompt: str) -> str:
    """Send image(s) + prompt to local Qwen2.5-VL. Raises RuntimeError if unavailable."""
    try:
        import torch
    except ImportError as exc:
        raise RuntimeError("torch not installed") from exc

    if not torch.cuda.is_available():
        raise RuntimeError("No CUDA device -- local VLM unavailable")

    # Lazy import so the module loads fast when GPU is absent.
    # Qwen2.5-VL checkpoints load via the Qwen2_5_VL* classes (not Qwen2VL*).
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
    from PIL import Image, ImageOps

    model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
    logger.info("Loading local VLM for recipe scan: %s", model_name)

    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True,
    )
    processor = AutoProcessor.from_pretrained(model_name)
    model.train(False)  # inference mode

    images = []
    for path in image_paths:
        with open(path, "rb") as f:
            raw = f.read()
        img = Image.open(io.BytesIO(raw))
        img = ImageOps.exif_transpose(img).convert("RGB")
        images.append(img)

    inputs = processor(images=images, text=prompt, return_tensors="pt")
    # Move tensors to the GPU; cast only floating-point tensors to fp16 so that
    # integer inputs (e.g. input_ids) keep their dtype.
    inputs = {
        k: (v.to("cuda", torch.float16) if v.is_floating_point() else v.to("cuda"))
        if isinstance(v, torch.Tensor) else v
        for k, v in inputs.items()
    }

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=2048,
            do_sample=False,
            temperature=0.0,
        )

    output = processor.decode(output_ids[0], skip_special_tokens=True)
    output = output.replace(prompt, "").strip()

    # Free VRAM
    del model
    torch.cuda.empty_cache()

    return output
def _call_vision_backend(image_paths: list[Path], prompt: str) -> str:
    """Dispatch to the best available vision backend.

    Priority: cf-orch vision -> local Qwen2.5-VL -> Anthropic API.
    Raises RuntimeError with a clear message when no backend is available.
    """
    errors: list[str] = []

    # 1. Try cf-orch vision allocation
    cf_orch_url = os.environ.get("CF_ORCH_URL")
    if cf_orch_url:
        try:
            from circuitforge_orch.client import CFOrchClient
            from app.services.ocr.docuvision_client import DocuvisionClient

            client = CFOrchClient(cf_orch_url)
            with client.allocate(
                service="cf-vision",
                model_candidates=["qwen2.5-vl-7b", "cf-docuvision"],
                ttl_s=90.0,
                caller="kiwi-recipe-scan",
            ) as alloc:
                if alloc is not None:
                    doc_client = DocuvisionClient(alloc.url)
                    # docuvision takes a single image -- use first image only for now
                    result = doc_client.extract_text(image_paths[0])
                    if result.text:
                        return result.text
        except Exception as exc:
            logger.debug("cf-orch vision failed for recipe scan: %s", exc)
            errors.append(f"cf-orch: {exc}")

    # 2. Try local Qwen2.5-VL
    try:
        return _call_via_local_vlm(image_paths, prompt)
    except Exception as exc:
        logger.debug("Local VLM unavailable for recipe scan: %s", exc)
        errors.append(f"local VLM: {exc}")

    # 3. Try Anthropic API (BYOK)
    try:
        return _call_via_anthropic(image_paths, prompt)
    except Exception as exc:
        logger.debug("Anthropic API failed for recipe scan: %s", exc)
        errors.append(f"Anthropic: {exc}")

    raise RuntimeError(
        "No vision backend configured for recipe scanning. "
        "Options: cf-orch (CF_ORCH_URL), local GPU, or ANTHROPIC_API_KEY (BYOK). "
        f"Errors: {'; '.join(errors)}"
    )
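
# Typical configurations that enable each branch above (values illustrative):
#   CF_ORCH_URL=http://cf-orch.internal:8080   -> cf-orch vision allocation
#   (CUDA GPU present, torch installed)        -> local Qwen2.5-VL
#   ANTHROPIC_API_KEY=sk-ant-...               -> Anthropic API (BYOK)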

# ── Parsing helpers ────────────────────────────────────────────────────────────

def _normalize_ingredient_name(name: str) -> str:
    """Lowercase + strip whitespace. Preserves multi-word names as-is."""
    return name.lower().strip()


def _parse_scanner_json(raw_text: str) -> dict:
    """Extract and return the JSON dict from VLM output.

    Handles:
    - Pure JSON
    - JSON wrapped in ```json ... ``` markdown fences
    - JSON preceded by a line of prose ("Here is the recipe: {...}")

    Raises ValueError on not_a_recipe or unparseable output.
    """
    text = raw_text.strip()

    # Strip markdown fences if present
    if text.startswith("```"):
        parts = text.split("```")
        for part in parts:
            part = part.strip()
            if part.startswith("json"):
                part = part[4:].strip()
            if part.startswith("{"):
                text = part
                break

    # Try direct parse first
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Extract first JSON object embedded in prose
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse JSON from VLM output: {text[:200]!r}")
        try:
            data = json.loads(match.group(0))
        except json.JSONDecodeError as exc:
            raise ValueError(f"Could not parse JSON from VLM output: {exc}") from exc

    if isinstance(data, dict) and data.get("error") == "not_a_recipe":
        raise ValueError("not_a_recipe: image does not appear to contain a recipe")

    return data
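
# Illustrative raw outputs this parser accepts (all yield the same dict):
#   '{"title": "Soup"}'
#   '```json\n{"title": "Soup"}\n```'
#   'Here is the recipe: {"title": "Soup"}'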

# ── Pantry cross-reference ─────────────────────────────────────────────────────

def _cross_reference_pantry(
    ingredients: list[ScannedIngredient],
    pantry_names: list[str],
) -> tuple[list[ScannedIngredient], int]:
    """Mark ingredients found in the pantry and return updated list + match percent.

    Matching is bidirectional by token:
    - "broccoli florets" matches pantry item "broccoli" (pantry token in ingredient)
    - "pumpkin seeds" matches pantry "pumpkin seeds" (exact)

    Returns (updated_ingredients, pantry_match_pct).
    """
    if not ingredients:
        return ingredients, 0

    normalized_pantry = [_normalize_ingredient_name(p) for p in pantry_names]
    updated: list[ScannedIngredient] = []
    matched = 0

    for ingr in ingredients:
        norm_ingr = _normalize_ingredient_name(ingr.name)
        in_pantry = any(
            (p_tok in norm_ingr or norm_ingr in p_tok)
            for p in normalized_pantry
            for p_tok in p.split()
            if len(p_tok) >= 4  # skip short stop-words like "of", "and", "the"
        )
        updated.append(ScannedIngredient(
            name=ingr.name,
            qty=ingr.qty,
            unit=ingr.unit,
            raw=ingr.raw,
            in_pantry=in_pantry,
        ))
        if in_pantry:
            matched += 1

    pct = round(matched / len(ingredients) * 100)
    return updated, pct
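
# Illustrative behavior (not part of the test suite):
#   ings = [ScannedIngredient(name="broccoli florets"), ScannedIngredient(name="soy sauce")]
#   updated, pct = _cross_reference_pantry(ings, ["broccoli", "olive oil"])
#   # updated[0].in_pantry is True, updated[1].in_pantry is False, pct == 50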

# ── Main scanner class ─────────────────────────────────────────────────────────

class RecipeScanner:
    """Stateless recipe scanner. One instance can be reused across requests."""

    def scan(
        self,
        image_paths: list[Path],
        pantry_names: list[str] | None = None,
    ) -> ScannedRecipeResult:
        """Extract a structured recipe from one or more photos.

        Args:
            image_paths: 1-4 image files (phone photos, scans).
            pantry_names: Flat list of product names from user's inventory.
                Pass [] or None to skip pantry cross-reference.

        Returns:
            ScannedRecipeResult with all fields populated.

        Raises:
            ValueError: Image is not a recipe, or JSON could not be parsed.
            RuntimeError: No vision backend is configured.
        """
        if not image_paths:
            raise ValueError("At least one image is required")
        if len(image_paths) > MAX_IMAGES:
            raise ValueError(f"Maximum {MAX_IMAGES} images per scan (got {len(image_paths)})")

        # Call vision backend
        raw_text = _call_vision_backend(image_paths, _EXTRACTION_PROMPT)

        # Parse JSON from VLM output
        data = _parse_scanner_json(raw_text)

        # Build ingredient list
        raw_ingredients = data.get("ingredients") or []
        ingredients: list[ScannedIngredient] = [
            ScannedIngredient(
                name=str(item.get("name") or "").strip() or "unknown",
                qty=str(item["qty"]) if item.get("qty") is not None else None,
                unit=str(item["unit"]) if item.get("unit") is not None else None,
                raw=str(item["raw"]) if item.get("raw") is not None else None,
            )
            for item in raw_ingredients
            if isinstance(item, dict)
        ]

        # Pantry cross-reference
        ingredients, pct = _cross_reference_pantry(
            ingredients,
            pantry_names or [],
        )

        return ScannedRecipeResult(
            title=data.get("title") or None,
            subtitle=data.get("subtitle") or None,
            servings=str(data["servings"]) if data.get("servings") is not None else None,
            cook_time=str(data["cook_time"]) if data.get("cook_time") is not None else None,
            source_note=data.get("source_note") or None,
            ingredients=ingredients,
            steps=[str(s) for s in (data.get("steps") or []) if s],
            notes=data.get("notes") or None,
            tags=list(data.get("tags") or []),
            pantry_match_pct=pct,
            confidence=data.get("confidence") or "medium",
            warnings=list(data.get("warnings") or []),
        )
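
# Minimal manual-test entry point (illustrative sketch, not part of the service API);
# assumes at least one vision backend from the priority list above is configured.
if __name__ == "__main__":
    import sys

    _result = RecipeScanner().scan([Path(p) for p in sys.argv[1:]])
    print(json.dumps(
        {
            "title": _result.title,
            "servings": _result.servings,
            "ingredients": [i.name for i in _result.ingredients],
            "steps": _result.steps,
            "confidence": _result.confidence,
            "warnings": _result.warnings,
        },
        indent=2,
    ))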