fix(recipe-scan): wire cf-docuvision OCR + LLMRouter for cloud recipe scanning (kiwi#136)
Two-step pipeline: task_allocate("kiwi", "recipe_scan", service_hint="cf-docuvision")
acquires a docuvision allocation, and DocuvisionClient calls /extract once per image
to get OCR text. LLMRouter then structures the combined OCR output into recipe JSON
using an OCR-specific prompt that reuses the JSON schema section of the image-based
_EXTRACTION_PROMPT.
Also fixes and extends DocuvisionClient:
- POST field was "image" (ignored by Pydantic) — should be "image_b64"
- Response read "text" key — docuvision returns "raw_text"
- Confidence is now read from the nested "metadata" object instead of the top level
- Add hint parameter (use "text" for recipe cards and other dense prose)
- Make the timeout configurable (default 120s; docuvision lazy-loads its model on first request)
This commit is contained in:
parent
cdbc24240a
commit
4ac24e7920
3 changed files with 73 additions and 30 deletions
|
|
@ -18,43 +18,51 @@ class DocuvisionResult:
|
||||||
class DocuvisionClient:
|
class DocuvisionClient:
|
||||||
"""Thin client for the cf-docuvision service."""
|
"""Thin client for the cf-docuvision service."""
|
||||||
|
|
||||||
def __init__(self, base_url: str) -> None:
|
def __init__(self, base_url: str, timeout: float = 120.0) -> None:
|
||||||
self._base_url = base_url.rstrip("/")
|
self._base_url = base_url.rstrip("/")
|
||||||
|
self._timeout = timeout
|
||||||
|
|
||||||
def extract_text(self, image_path: str | Path) -> DocuvisionResult:
|
def extract_text(self, image_path: str | Path, hint: str = "text") -> DocuvisionResult:
|
||||||
"""Send an image to docuvision and return extracted text."""
|
"""Send an image to docuvision and return extracted text.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
image_path: Path to the image file.
|
||||||
|
hint: Docuvision extraction hint — "text" for dense prose (recipes),
|
||||||
|
"table" for tabular data, "form" for form fields, "auto" for
|
||||||
|
automatic detection.
|
||||||
|
"""
|
||||||
image_bytes = Path(image_path).read_bytes()
|
image_bytes = Path(image_path).read_bytes()
|
||||||
b64 = base64.b64encode(image_bytes).decode()
|
b64 = base64.b64encode(image_bytes).decode()
|
||||||
|
|
||||||
with httpx.Client(timeout=30.0) as client:
|
with httpx.Client(timeout=self._timeout) as client:
|
||||||
resp = client.post(
|
resp = client.post(
|
||||||
f"{self._base_url}/extract",
|
f"{self._base_url}/extract",
|
||||||
json={"image": b64},
|
json={"image_b64": b64, "hint": hint},
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
|
|
||||||
return DocuvisionResult(
|
return DocuvisionResult(
|
||||||
text=data.get("text", ""),
|
text=data.get("raw_text", ""),
|
||||||
confidence=data.get("confidence"),
|
confidence=data.get("metadata", {}).get("confidence"),
|
||||||
raw=data,
|
raw=data,
|
||||||
)
|
)
|
||||||
|
|
||||||
async def extract_text_async(self, image_path: str | Path) -> DocuvisionResult:
|
async def extract_text_async(self, image_path: str | Path, hint: str = "text") -> DocuvisionResult:
|
||||||
"""Async version."""
|
"""Async version."""
|
||||||
image_bytes = Path(image_path).read_bytes()
|
image_bytes = Path(image_path).read_bytes()
|
||||||
b64 = base64.b64encode(image_bytes).decode()
|
b64 = base64.b64encode(image_bytes).decode()
|
||||||
|
|
||||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
async with httpx.AsyncClient(timeout=self._timeout) as client:
|
||||||
resp = await client.post(
|
resp = await client.post(
|
||||||
f"{self._base_url}/extract",
|
f"{self._base_url}/extract",
|
||||||
json={"image": b64},
|
json={"image_b64": b64, "hint": hint},
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
|
|
||||||
return DocuvisionResult(
|
return DocuvisionResult(
|
||||||
text=data.get("text", ""),
|
text=data.get("raw_text", ""),
|
||||||
confidence=data.get("confidence"),
|
confidence=data.get("metadata", {}).get("confidence"),
|
||||||
raw=data,
|
raw=data,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -196,34 +196,63 @@ def _call_via_local_vlm(image_paths: list[Path], prompt: str) -> str:
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def _build_ocr_extraction_prompt(ocr_text: str) -> str:
|
||||||
|
"""Build a text-LLM prompt for structuring OCR output into recipe JSON.
|
||||||
|
|
||||||
|
Swaps the image-centric preamble of _EXTRACTION_PROMPT for an OCR-centric
|
||||||
|
one, then appends the combined OCR text as input. The JSON schema section
|
||||||
|
is shared verbatim to keep the two paths in sync.
|
||||||
|
"""
|
||||||
|
schema_idx = _EXTRACTION_PROMPT.find("Return a single JSON object")
|
||||||
|
schema_part = _EXTRACTION_PROMPT[schema_idx:] if schema_idx != -1 else _EXTRACTION_PROMPT
|
||||||
|
return (
|
||||||
|
"You are extracting a recipe from OCR text taken from a recipe card, "
|
||||||
|
"cookbook page, or handwritten note.\n\n"
|
||||||
|
"The text below was obtained via optical character recognition and may "
|
||||||
|
"contain minor scanning artifacts or formatting irregularities.\n\n"
|
||||||
|
f"{schema_part}\n\nOCR Text:\n{ocr_text}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _call_vision_backend(image_paths: list[Path], prompt: str) -> str:
|
def _call_vision_backend(image_paths: list[Path], prompt: str) -> str:
|
||||||
"""Dispatch to the best available vision backend.
|
"""Dispatch to the best available vision backend.
|
||||||
|
|
||||||
Priority: cf-orch vision -> local Qwen2.5-VL -> Anthropic API.
|
Priority: cf-orch docuvision (OCR + text LLM) -> local Qwen2.5-VL -> Anthropic API.
|
||||||
Raises RuntimeError with a clear message when no backend is available.
|
Raises RuntimeError with a clear message when no backend is available.
|
||||||
"""
|
"""
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
|
||||||
# 1. Try cf-orch vision allocation
|
# 1. Try cf-orch task allocation → cf-docuvision OCR, then text LLM structuring.
|
||||||
|
# Two-step: docuvision extracts text from the image(s), then LLMRouter
|
||||||
|
# converts the OCR text to structured recipe JSON using the extraction prompt.
|
||||||
cf_orch_url = os.environ.get("CF_ORCH_URL")
|
cf_orch_url = os.environ.get("CF_ORCH_URL")
|
||||||
if cf_orch_url:
|
if cf_orch_url:
|
||||||
try:
|
try:
|
||||||
from circuitforge_orch.client import CFOrchClient
|
from app.services.task_inference import TaskNotRegistered, task_allocate
|
||||||
from app.services.ocr.docuvision_client import DocuvisionClient
|
from app.services.ocr.docuvision_client import DocuvisionClient
|
||||||
|
from circuitforge_core.llm.router import LLMRouter
|
||||||
|
|
||||||
client = CFOrchClient(cf_orch_url)
|
try:
|
||||||
with client.allocate(
|
with task_allocate("kiwi", "recipe_scan", service_hint="cf-docuvision", ttl_s=120.0) as alloc:
|
||||||
service="cf-vision",
|
# Step 1: OCR each image via cf-docuvision
|
||||||
model_candidates=["qwen2.5-vl-7b", "cf-docuvision"],
|
|
||||||
ttl_s=90.0,
|
|
||||||
caller="kiwi-recipe-scan",
|
|
||||||
) as alloc:
|
|
||||||
if alloc is not None:
|
|
||||||
doc_client = DocuvisionClient(alloc.url)
|
doc_client = DocuvisionClient(alloc.url)
|
||||||
# docuvision takes a single image -- use first image only for now
|
ocr_parts: list[str] = []
|
||||||
result = doc_client.extract_text(image_paths[0])
|
for i, path in enumerate(image_paths):
|
||||||
if result.text:
|
result = doc_client.extract_text(path, hint="text")
|
||||||
return result.text
|
prefix = f"(Page {i + 1} of the same recipe)\n" if len(image_paths) > 1 else ""
|
||||||
|
ocr_parts.append(f"{prefix}{result.text}")
|
||||||
|
combined_ocr = "\n\n".join(ocr_parts)
|
||||||
|
|
||||||
|
if not combined_ocr.strip():
|
||||||
|
raise ValueError("Docuvision returned no text — image may not be a recipe")
|
||||||
|
|
||||||
|
# Step 2: Text LLM structures OCR output into recipe JSON
|
||||||
|
text = LLMRouter().complete(_build_ocr_extraction_prompt(combined_ocr))
|
||||||
|
if text:
|
||||||
|
return text
|
||||||
|
|
||||||
|
except TaskNotRegistered:
|
||||||
|
logger.debug("kiwi.recipe_scan not yet registered in cf-orch assignments")
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.debug("cf-orch vision failed for recipe scan: %s", exc)
|
logger.debug("cf-orch vision failed for recipe scan: %s", exc)
|
||||||
errors.append(f"cf-orch: {exc}")
|
errors.append(f"cf-orch: {exc}")
|
||||||
|
|
|
||||||
|
|
@ -17,12 +17,17 @@ from app.services.ocr.docuvision_client import DocuvisionClient, DocuvisionResul
|
||||||
|
|
||||||
|
|
||||||
def test_extract_text_sends_base64_image(tmp_path: Path) -> None:
|
def test_extract_text_sends_base64_image(tmp_path: Path) -> None:
|
||||||
"""extract_text() POSTs a base64-encoded image and returns parsed text."""
|
"""extract_text() POSTs image_b64 and returns parsed raw_text."""
|
||||||
image_file = tmp_path / "test.jpg"
|
image_file = tmp_path / "test.jpg"
|
||||||
image_file.write_bytes(b"fake-image-bytes")
|
image_file.write_bytes(b"fake-image-bytes")
|
||||||
|
|
||||||
mock_response = MagicMock()
|
mock_response = MagicMock()
|
||||||
mock_response.json.return_value = {"text": "Cheerios", "confidence": 0.95}
|
mock_response.json.return_value = {
|
||||||
|
"raw_text": "Cheerios",
|
||||||
|
"elements": [],
|
||||||
|
"tables": [],
|
||||||
|
"metadata": {"hint": "text", "confidence": 0.95},
|
||||||
|
}
|
||||||
mock_response.raise_for_status.return_value = None
|
mock_response.raise_for_status.return_value = None
|
||||||
|
|
||||||
with patch("httpx.Client") as mock_client_cls:
|
with patch("httpx.Client") as mock_client_cls:
|
||||||
|
|
@ -41,7 +46,8 @@ def test_extract_text_sends_base64_image(tmp_path: Path) -> None:
|
||||||
assert call_kwargs[0][0] == "http://docuvision:8080/extract"
|
assert call_kwargs[0][0] == "http://docuvision:8080/extract"
|
||||||
posted_json = call_kwargs[1]["json"]
|
posted_json = call_kwargs[1]["json"]
|
||||||
expected_b64 = base64.b64encode(b"fake-image-bytes").decode()
|
expected_b64 = base64.b64encode(b"fake-image-bytes").decode()
|
||||||
assert posted_json["image"] == expected_b64
|
assert posted_json["image_b64"] == expected_b64
|
||||||
|
assert posted_json["hint"] == "text"
|
||||||
|
|
||||||
|
|
||||||
def test_extract_text_raises_on_http_error(tmp_path: Path) -> None:
|
def test_extract_text_raises_on_http_error(tmp_path: Path) -> None:
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue