kiwi/app/services/ocr/docuvision_client.py
pyr0ball 4ac24e7920
Some checks are pending
CI / Backend (Python) (push) Waiting to run
CI / Frontend (Vue) (push) Waiting to run
Mirror / mirror (push) Waiting to run
fix(recipe-scan): wire cf-docuvision OCR + LLMRouter for cloud recipe scanning (kiwi#136)
Two-step pipeline: task_allocate("kiwi", "recipe_scan", service_hint="cf-docuvision")
acquires a docuvision allocation, calls /extract per image to get OCR text, then
LLMRouter structures the combined OCR output into recipe JSON via the text
extraction prompt.

Also fixes DocuvisionClient bugs:
- POST field was "image" (ignored by Pydantic) — should be "image_b64"
- Response read "text" key — docuvision returns "raw_text"
- Add hint parameter (use "text" for recipe cards, dense prose)
- Configurable timeout (default 120s; docuvision lazy-loads model on first request)
2026-05-16 14:21:15 -07:00

68 lines
2.2 KiB
Python

"""Thin HTTP client for the cf-docuvision document vision service."""
from __future__ import annotations
import base64
from dataclasses import dataclass
from pathlib import Path
import httpx
@dataclass
class DocuvisionResult:
    """Outcome of one docuvision ``/extract`` request."""

    # Text recovered from the image (filled from the response's "raw_text").
    text: str
    # Confidence value taken from the response metadata; None when absent.
    confidence: float | None = None
    # The full decoded JSON response, kept for callers that need other fields.
    raw: dict | None = None
class DocuvisionClient:
    """Thin HTTP client for the cf-docuvision service.

    Offers a blocking and an ``async`` way to POST one image to the
    service's ``/extract`` endpoint. Both paths share the same payload
    builder and response parser so they cannot drift apart.
    """

    def __init__(self, base_url: str, timeout: float = 120.0) -> None:
        """Store connection settings; no network traffic happens here.

        Args:
            base_url: Root URL of the docuvision service. Trailing slashes
                are stripped so endpoint paths can be appended directly.
            timeout: Per-request timeout in seconds handed to httpx. The
                default is generous because docuvision lazy-loads its model
                on the first request.
        """
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout

    @staticmethod
    def _build_payload(image_path: str | Path, hint: str) -> dict[str, str]:
        """Read the image file and build the JSON body for ``/extract``."""
        encoded = base64.b64encode(Path(image_path).read_bytes()).decode("ascii")
        return {"image_b64": encoded, "hint": hint}

    @staticmethod
    def _parse_fields(data: dict) -> tuple[str, float | None]:
        """Pull ``(text, confidence)`` out of a decoded ``/extract`` response.

        Tolerates ``"raw_text": null`` and a null/non-object ``"metadata"``:
        ``dict.get(key, default)`` only falls back when the key is *absent*,
        so explicit nulls must be handled separately.
        """
        text = data.get("raw_text") or ""
        metadata = data.get("metadata")
        confidence = metadata.get("confidence") if isinstance(metadata, dict) else None
        return text, confidence

    def _to_result(self, data: dict) -> DocuvisionResult:
        """Wrap a decoded ``/extract`` response in a ``DocuvisionResult``."""
        text, confidence = self._parse_fields(data)
        return DocuvisionResult(text=text, confidence=confidence, raw=data)

    def extract_text(self, image_path: str | Path, hint: str = "text") -> DocuvisionResult:
        """Send an image to docuvision and return the extracted text.

        Args:
            image_path: Path to the image file.
            hint: Docuvision extraction hint — "text" for dense prose (recipes),
                "table" for tabular data, "form" for form fields, "auto" for
                automatic detection.

        Returns:
            A ``DocuvisionResult`` holding the text, the reported confidence
            (if any) and the raw response body.

        Raises:
            OSError: If the image file cannot be read.
            httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
        """
        # Build the payload first so an unreadable file fails before any
        # connection is opened.
        payload = self._build_payload(image_path, hint)
        with httpx.Client(timeout=self._timeout) as client:
            resp = client.post(f"{self._base_url}/extract", json=payload)
            resp.raise_for_status()
            return self._to_result(resp.json())

    async def extract_text_async(self, image_path: str | Path, hint: str = "text") -> DocuvisionResult:
        """Async counterpart of ``extract_text`` with the same contract.

        Args:
            image_path: Path to the image file.
            hint: Docuvision extraction hint (see ``extract_text``).

        Returns:
            A ``DocuvisionResult`` holding the text, the reported confidence
            (if any) and the raw response body.

        Raises:
            OSError: If the image file cannot be read.
            httpx.HTTPStatusError: If the service answers with a 4xx/5xx status.
        """
        import asyncio

        # Reading and base64-encoding the file is blocking work; run it in the
        # default executor so the event loop stays responsive.
        loop = asyncio.get_running_loop()
        payload = await loop.run_in_executor(
            None, self._build_payload, image_path, hint
        )
        async with httpx.AsyncClient(timeout=self._timeout) as client:
            resp = await client.post(f"{self._base_url}/extract", json=payload)
            resp.raise_for_status()
            return self._to_result(resp.json())