diff --git a/app/services/recipe/recipe_scanner.py b/app/services/recipe/recipe_scanner.py index a1a3f7a..71db038 100644 --- a/app/services/recipe/recipe_scanner.py +++ b/app/services/recipe/recipe_scanner.py @@ -215,6 +215,35 @@ def _build_ocr_extraction_prompt(ocr_text: str) -> str: ) +def _call_via_cf_text_vlm(alloc_url: str, image_paths: list[Path], prompt: str) -> str: + """Call the cf-text OpenAI-compat API with images via the llama.cpp multimodal backend.""" + import httpx + + content: list[dict] = [] + for i, path in enumerate(image_paths): + if i > 0: + content.append({"type": "text", "text": f"(Page {i + 1} of the same recipe:)"}) + b64 = _load_image_b64(path) + content.append({ + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, + }) + content.append({"type": "text", "text": prompt}) + + resp = httpx.post( + f"{alloc_url.rstrip('/')}/v1/chat/completions", + json={ + "model": "local", + "messages": [{"role": "user", "content": content}], + "max_tokens": 2048, + "temperature": 0.0, + }, + timeout=180.0, + ) + resp.raise_for_status() + return resp.json()["choices"][0]["message"]["content"].strip() + + def _call_vision_backend( image_paths: list[Path], prompt: str, @@ -222,7 +251,7 @@ def _call_vision_backend( ) -> str: """Dispatch to the best available vision backend. - Priority: cf-orch docuvision (OCR + text LLM) -> local Qwen2.5-VL -> Anthropic API. + Priority: cf-orch (Qwen2-VL GGUF via cf-text) -> local Qwen2.5-VL -> Anthropic API. Raises RuntimeError with a clear message when no backend is available. Args: @@ -237,35 +266,18 @@ def _call_vision_backend( errors: list[str] = [] - # 1. Try cf-orch task allocation → cf-docuvision OCR, then text LLM structuring. - # Two-step: docuvision extracts text from the image(s), then LLMRouter - # converts the OCR text to structured recipe JSON using the extraction prompt. + # 1. Try cf-orch task allocation → Qwen2-VL GGUF on cf-text (direct multimodal extraction). + # One-step: the VLM receives the image(s) directly and returns structured recipe JSON. cf_orch_url = os.environ.get("CF_ORCH_URL") if cf_orch_url: try: from app.services.task_inference import TaskNotRegistered, task_allocate - from app.services.ocr.docuvision_client import DocuvisionClient - from circuitforge_core.llm.router import LLMRouter try: _progress("allocating", "Starting vision service...") - with task_allocate("kiwi", "recipe_scan", service_hint="cf-docuvision", ttl_s=120.0) as alloc: - # Step 1: OCR each image via cf-docuvision - _progress("scanning", "Extracting recipe text from photo...") - doc_client = DocuvisionClient(alloc.url) - ocr_parts: list[str] = [] - for i, path in enumerate(image_paths): - result = doc_client.extract_text(path, hint="text") - prefix = f"(Page {i + 1} of the same recipe)\n" if len(image_paths) > 1 else "" - ocr_parts.append(f"{prefix}{result.text}") - combined_ocr = "\n\n".join(ocr_parts) - - if not combined_ocr.strip(): - raise ValueError("Docuvision returned no text — image may not be a recipe") - - # Step 2: Text LLM structures OCR output into recipe JSON - _progress("structuring", "Parsing recipe structure...") - text = LLMRouter().complete(_build_ocr_extraction_prompt(combined_ocr)) + with task_allocate("kiwi", "recipe_scan", service_hint="cf-text", ttl_s=120.0) as alloc: + _progress("scanning", "Extracting recipe from photo...") + text = _call_via_cf_text_vlm(alloc.url, image_paths, prompt) if text: return text