diff --git a/app/services/ocr/docuvision_client.py b/app/services/ocr/docuvision_client.py
index dfa1fed..cd14089 100644
--- a/app/services/ocr/docuvision_client.py
+++ b/app/services/ocr/docuvision_client.py
@@ -18,43 +18,51 @@ class DocuvisionResult:
 class DocuvisionClient:
     """Thin client for the cf-docuvision service."""

-    def __init__(self, base_url: str) -> None:
+    def __init__(self, base_url: str, timeout: float = 120.0) -> None:
         self._base_url = base_url.rstrip("/")
+        self._timeout = timeout

-    def extract_text(self, image_path: str | Path) -> DocuvisionResult:
-        """Send an image to docuvision and return extracted text."""
+    def extract_text(self, image_path: str | Path, hint: str = "text") -> DocuvisionResult:
+        """Send an image to docuvision and return extracted text.
+
+        Args:
+            image_path: Path to the image file.
+            hint: Docuvision extraction hint — "text" for dense prose (recipes),
+                "table" for tabular data, "form" for form fields, "auto" for
+                automatic detection.
+        """
         image_bytes = Path(image_path).read_bytes()
         b64 = base64.b64encode(image_bytes).decode()

-        with httpx.Client(timeout=30.0) as client:
+        with httpx.Client(timeout=self._timeout) as client:
             resp = client.post(
                 f"{self._base_url}/extract",
-                json={"image": b64},
+                json={"image_b64": b64, "hint": hint},
             )
             resp.raise_for_status()
             data = resp.json()

         return DocuvisionResult(
-            text=data.get("text", ""),
-            confidence=data.get("confidence"),
+            text=data.get("raw_text", ""),
+            confidence=data.get("metadata", {}).get("confidence"),
             raw=data,
         )

-    async def extract_text_async(self, image_path: str | Path) -> DocuvisionResult:
+    async def extract_text_async(self, image_path: str | Path, hint: str = "text") -> DocuvisionResult:
         """Async version."""
         image_bytes = Path(image_path).read_bytes()
         b64 = base64.b64encode(image_bytes).decode()

-        async with httpx.AsyncClient(timeout=30.0) as client:
+        async with httpx.AsyncClient(timeout=self._timeout) as client:
             resp = await client.post(
                 f"{self._base_url}/extract",
-                json={"image": b64},
+                json={"image_b64": b64, "hint": hint},
             )
             resp.raise_for_status()
             data = resp.json()

         return DocuvisionResult(
-            text=data.get("text", ""),
-            confidence=data.get("confidence"),
+            text=data.get("raw_text", ""),
+            confidence=data.get("metadata", {}).get("confidence"),
             raw=data,
         )
diff --git a/app/services/recipe/recipe_scanner.py b/app/services/recipe/recipe_scanner.py
index 58b3d2a..fde512a 100644
--- a/app/services/recipe/recipe_scanner.py
+++ b/app/services/recipe/recipe_scanner.py
@@ -196,34 +196,63 @@ def _call_via_local_vlm(image_paths: list[Path], prompt: str) -> str:
     return output


+def _build_ocr_extraction_prompt(ocr_text: str) -> str:
+    """Build a text-LLM prompt for structuring OCR output into recipe JSON.
+
+    Swaps the image-centric preamble of _EXTRACTION_PROMPT for an OCR-centric
+    one, then appends the combined OCR text as input. The JSON schema section
+    is shared verbatim to keep the two paths in sync.
+    """
+    schema_idx = _EXTRACTION_PROMPT.find("Return a single JSON object")
+    schema_part = _EXTRACTION_PROMPT[schema_idx:] if schema_idx != -1 else _EXTRACTION_PROMPT
+    return (
+        "You are extracting a recipe from OCR text taken from a recipe card, "
+        "cookbook page, or handwritten note.\n\n"
+        "The text below was obtained via optical character recognition and may "
+        "contain minor scanning artifacts or formatting irregularities.\n\n"
+        f"{schema_part}\n\nOCR Text:\n{ocr_text}"
+    )
+
+
 def _call_vision_backend(image_paths: list[Path], prompt: str) -> str:
     """Dispatch to the best available vision backend.

-    Priority: cf-orch vision -> local Qwen2.5-VL -> Anthropic API.
+    Priority: cf-orch docuvision (OCR + text LLM) -> local Qwen2.5-VL -> Anthropic API.
     Raises RuntimeError with a clear message when no backend is available.
     """
     errors: list[str] = []

-    # 1. Try cf-orch vision allocation
+    # 1. Try cf-orch task allocation → cf-docuvision OCR, then text LLM structuring.
+    # Two-step: docuvision extracts text from the image(s), then LLMRouter
+    # converts the OCR text to structured recipe JSON using the extraction prompt.
     cf_orch_url = os.environ.get("CF_ORCH_URL")
     if cf_orch_url:
         try:
-            from circuitforge_orch.client import CFOrchClient
+            from app.services.task_inference import TaskNotRegistered, task_allocate
             from app.services.ocr.docuvision_client import DocuvisionClient
+            from circuitforge_core.llm.router import LLMRouter

-            client = CFOrchClient(cf_orch_url)
-            with client.allocate(
-                service="cf-vision",
-                model_candidates=["qwen2.5-vl-7b", "cf-docuvision"],
-                ttl_s=90.0,
-                caller="kiwi-recipe-scan",
-            ) as alloc:
-                if alloc is not None:
+            try:
+                with task_allocate("kiwi", "recipe_scan", service_hint="cf-docuvision", ttl_s=120.0) as alloc:
+                    # Step 1: OCR each image via cf-docuvision
                     doc_client = DocuvisionClient(alloc.url)
-                    # docuvision takes a single image -- use first image only for now
-                    result = doc_client.extract_text(image_paths[0])
-                    if result.text:
-                        return result.text
+                    ocr_parts: list[str] = []
+                    for i, path in enumerate(image_paths):
+                        result = doc_client.extract_text(path, hint="text")
+                        prefix = f"(Page {i + 1} of the same recipe)\n" if len(image_paths) > 1 else ""
+                        ocr_parts.append(f"{prefix}{result.text}")
+                    combined_ocr = "\n\n".join(ocr_parts)
+
+                    if not combined_ocr.strip():
+                        raise ValueError("Docuvision returned no text — image may not be a recipe")
+
+                    # Step 2: Text LLM structures OCR output into recipe JSON
+                    text = LLMRouter().complete(_build_ocr_extraction_prompt(combined_ocr))
+                    if text:
+                        return text
+
+            except TaskNotRegistered:
+                logger.debug("kiwi.recipe_scan not yet registered in cf-orch assignments")
         except Exception as exc:
             logger.debug("cf-orch vision failed for recipe scan: %s", exc)
             errors.append(f"cf-orch: {exc}")
diff --git a/tests/test_services/test_docuvision_client.py b/tests/test_services/test_docuvision_client.py
index 32df008..433aea4 100644
--- a/tests/test_services/test_docuvision_client.py
+++ b/tests/test_services/test_docuvision_client.py
@@ -17,12 +17,17 @@ from app.services.ocr.docuvision_client import DocuvisionClient, DocuvisionResul


 def test_extract_text_sends_base64_image(tmp_path: Path) -> None:
-    """extract_text() POSTs a base64-encoded image and returns parsed text."""
+    """extract_text() POSTs image_b64 and returns parsed raw_text."""
     image_file = tmp_path / "test.jpg"
     image_file.write_bytes(b"fake-image-bytes")

     mock_response = MagicMock()
-    mock_response.json.return_value = {"text": "Cheerios", "confidence": 0.95}
+    mock_response.json.return_value = {
+        "raw_text": "Cheerios",
+        "elements": [],
+        "tables": [],
+        "metadata": {"hint": "text", "confidence": 0.95},
+    }
     mock_response.raise_for_status.return_value = None

     with patch("httpx.Client") as mock_client_cls:
@@ -41,7 +46,8 @@ def test_extract_text_sends_base64_image(tmp_path: Path) -> None:
     assert call_kwargs[0][0] == "http://docuvision:8080/extract"
     posted_json = call_kwargs[1]["json"]
     expected_b64 = base64.b64encode(b"fake-image-bytes").decode()
-    assert posted_json["image"] == expected_b64
+    assert posted_json["image_b64"] == expected_b64
+    assert posted_json["hint"] == "text"


 def test_extract_text_raises_on_http_error(tmp_path: Path) -> None: