feat(recipe_scan): use Qwen2-VL GGUF via cf-text OpenAI-compat API
Replace two-step docuvision OCR + LLM structuring pipeline with a single multimodal VLM call. The bartowski Qwen2-VL-7B-Instruct Q5_K_M GGUF is served by cf-text (llama.cpp) and accepts image_url content blocks identical to the OpenAI vision API format. Removes docuvision dependency for recipe scanning; the addict-missing / DeepseekVLV2Processor-missing cf-docuvision error no longer blocks scans. Receipt OCR (kiwi.ocr task) still routes to cf-docuvision separately.
This commit is contained in:
parent
2df17ec719
commit
c72b4415db
1 changed file with 35 additions and 23 deletions
|
|
@ -215,6 +215,35 @@ def _build_ocr_extraction_prompt(ocr_text: str) -> str:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _call_via_cf_text_vlm(alloc_url: str, image_paths: list[Path], prompt: str) -> str:
    """Send recipe image(s) plus a prompt to the cf-text chat-completions endpoint.

    A single user message is assembled from OpenAI-style content blocks: one
    ``image_url`` block per image (base64 data URL), a short text marker in
    front of every page after the first, and the prompt as the final text block.

    Args:
        alloc_url: Base URL of the allocated service; trailing slashes are stripped
            before ``/v1/chat/completions`` is appended.
        image_paths: Images to send, in page order.
        prompt: Instruction text placed after all image blocks.

    Returns:
        The ``content`` of the first choice's message, with surrounding
        whitespace stripped.

    Raises:
        httpx.HTTPStatusError: Propagated from ``raise_for_status()`` when the
            service answers with an error status.
    """
    import httpx

    blocks: list[dict] = []
    for page_no, image_path in enumerate(image_paths, start=1):
        # Pages after the first get a marker so the model treats them as one recipe.
        if page_no > 1:
            blocks.append({"type": "text", "text": f"(Page {page_no} of the same recipe:)"})
        encoded = _load_image_b64(image_path)
        # NOTE(review): the data URL always declares image/jpeg — presumably
        # _load_image_b64 yields JPEG data; confirm against that helper.
        data_url = f"data:image/jpeg;base64,{encoded}"
        blocks.append({"type": "image_url", "image_url": {"url": data_url}})
    blocks.append({"type": "text", "text": prompt})

    endpoint = f"{alloc_url.rstrip('/')}/v1/chat/completions"
    payload = {
        "model": "local",
        "messages": [{"role": "user", "content": blocks}],
        "max_tokens": 2048,
        "temperature": 0.0,
    }

    response = httpx.post(endpoint, json=payload, timeout=180.0)
    response.raise_for_status()

    first_choice = response.json()["choices"][0]
    return first_choice["message"]["content"].strip()
|
||||||
|
|
||||||
|
|
||||||
def _call_vision_backend(
|
def _call_vision_backend(
|
||||||
image_paths: list[Path],
|
image_paths: list[Path],
|
||||||
prompt: str,
|
prompt: str,
|
||||||
|
|
@ -222,7 +251,7 @@ def _call_vision_backend(
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Dispatch to the best available vision backend.
|
"""Dispatch to the best available vision backend.
|
||||||
|
|
||||||
Priority: cf-orch docuvision (OCR + text LLM) -> local Qwen2.5-VL -> Anthropic API.
|
Priority: cf-orch (Qwen2-VL GGUF via cf-text) -> local Qwen2.5-VL -> Anthropic API.
|
||||||
Raises RuntimeError with a clear message when no backend is available.
|
Raises RuntimeError with a clear message when no backend is available.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
|
@ -237,35 +266,18 @@ def _call_vision_backend(
|
||||||
|
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
|
||||||
# 1. Try cf-orch task allocation → cf-docuvision OCR, then text LLM structuring.
|
# 1. Try cf-orch task allocation → Qwen2-VL GGUF on cf-text (direct multimodal extraction).
|
||||||
# Two-step: docuvision extracts text from the image(s), then LLMRouter
|
# One-step: the VLM receives the image(s) directly and returns structured recipe JSON.
|
||||||
# converts the OCR text to structured recipe JSON using the extraction prompt.
|
|
||||||
cf_orch_url = os.environ.get("CF_ORCH_URL")
|
cf_orch_url = os.environ.get("CF_ORCH_URL")
|
||||||
if cf_orch_url:
|
if cf_orch_url:
|
||||||
try:
|
try:
|
||||||
from app.services.task_inference import TaskNotRegistered, task_allocate
|
from app.services.task_inference import TaskNotRegistered, task_allocate
|
||||||
from app.services.ocr.docuvision_client import DocuvisionClient
|
|
||||||
from circuitforge_core.llm.router import LLMRouter
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
_progress("allocating", "Starting vision service...")
|
_progress("allocating", "Starting vision service...")
|
||||||
with task_allocate("kiwi", "recipe_scan", service_hint="cf-docuvision", ttl_s=120.0) as alloc:
|
with task_allocate("kiwi", "recipe_scan", service_hint="cf-text", ttl_s=120.0) as alloc:
|
||||||
# Step 1: OCR each image via cf-docuvision
|
_progress("scanning", "Extracting recipe from photo...")
|
||||||
_progress("scanning", "Extracting recipe text from photo...")
|
text = _call_via_cf_text_vlm(alloc.url, image_paths, prompt)
|
||||||
doc_client = DocuvisionClient(alloc.url)
|
|
||||||
ocr_parts: list[str] = []
|
|
||||||
for i, path in enumerate(image_paths):
|
|
||||||
result = doc_client.extract_text(path, hint="text")
|
|
||||||
prefix = f"(Page {i + 1} of the same recipe)\n" if len(image_paths) > 1 else ""
|
|
||||||
ocr_parts.append(f"{prefix}{result.text}")
|
|
||||||
combined_ocr = "\n\n".join(ocr_parts)
|
|
||||||
|
|
||||||
if not combined_ocr.strip():
|
|
||||||
raise ValueError("Docuvision returned no text — image may not be a recipe")
|
|
||||||
|
|
||||||
# Step 2: Text LLM structures OCR output into recipe JSON
|
|
||||||
_progress("structuring", "Parsing recipe structure...")
|
|
||||||
text = LLMRouter().complete(_build_ocr_extraction_prompt(combined_ocr))
|
|
||||||
if text:
|
if text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue