feat: add DocuvisionClient + cf-docuvision fast-path for OCR

Introduces a thin HTTP client for the cf-docuvision service and wires it
as a fast path in VisionLanguageOCR.extract_receipt_data(). When CF_ORCH_URL
is set, the pipeline attempts docuvision allocation via CFOrchClient before
loading the heavy local VLM; falls back gracefully if unavailable.
This commit is contained in:
pyr0ball 2026-04-02 12:33:05 -07:00
parent b9eadcdf0e
commit 22e57118df
6 changed files with 318 additions and 63 deletions

View file

@ -0,0 +1,60 @@
"""Thin HTTP client for the cf-docuvision document vision service."""
from __future__ import annotations

import asyncio
import base64
from dataclasses import dataclass
from pathlib import Path

import httpx
@dataclass
class DocuvisionResult:
    """Parsed result of one docuvision text-extraction call."""

    text: str  # extracted document text ("" when the service returned none)
    confidence: float | None = None  # service-reported confidence, if provided
    raw: dict | None = None  # full JSON response payload, kept for debugging
class DocuvisionClient:
"""Thin client for the cf-docuvision service."""
def __init__(self, base_url: str) -> None:
self._base_url = base_url.rstrip("/")
def extract_text(self, image_path: str | Path) -> DocuvisionResult:
"""Send an image to docuvision and return extracted text."""
image_bytes = Path(image_path).read_bytes()
b64 = base64.b64encode(image_bytes).decode()
with httpx.Client(timeout=30.0) as client:
resp = client.post(
f"{self._base_url}/extract",
json={"image": b64},
)
resp.raise_for_status()
data = resp.json()
return DocuvisionResult(
text=data.get("text", ""),
confidence=data.get("confidence"),
raw=data,
)
async def extract_text_async(self, image_path: str | Path) -> DocuvisionResult:
"""Async version."""
image_bytes = Path(image_path).read_bytes()
b64 = base64.b64encode(image_bytes).decode()
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(
f"{self._base_url}/extract",
json={"image": b64},
)
resp.raise_for_status()
data = resp.json()
return DocuvisionResult(
text=data.get("text", ""),
confidence=data.get("confidence"),
raw=data,
)

View file

@ -8,6 +8,7 @@ OCR with understanding of receipt structure to extract structured JSON data.
import json import json
import logging import logging
import os
import re import re
from pathlib import Path from pathlib import Path
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List
@ -26,6 +27,31 @@ from app.core.config import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _try_docuvision(image_path: str | Path) -> str | None:
"""Try to extract text via cf-docuvision. Returns None if unavailable."""
cf_orch_url = os.environ.get("CF_ORCH_URL")
if not cf_orch_url:
return None
try:
from circuitforge_core.resources import CFOrchClient
from app.services.ocr.docuvision_client import DocuvisionClient
client = CFOrchClient(cf_orch_url)
with client.allocate(
service="cf-docuvision",
model_candidates=[], # cf-docuvision has no model selection
ttl_s=60.0,
caller="kiwi-ocr",
) as alloc:
if alloc is None:
return None
doc_client = DocuvisionClient(alloc.url)
result = doc_client.extract_text(image_path)
return result.text if result.text else None
except Exception:
return None # graceful degradation
class VisionLanguageOCR: class VisionLanguageOCR:
"""Vision-Language Model for receipt OCR and structured extraction.""" """Vision-Language Model for receipt OCR and structured extraction."""
@ -40,7 +66,7 @@ class VisionLanguageOCR:
self.processor = None self.processor = None
self.device = "cuda" if torch.cuda.is_available() and settings.USE_GPU else "cpu" self.device = "cuda" if torch.cuda.is_available() and settings.USE_GPU else "cpu"
self.use_quantization = use_quantization self.use_quantization = use_quantization
self.model_name = "Qwen/Qwen2-VL-2B-Instruct" self.model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
logger.info(f"Initializing VisionLanguageOCR with device: {self.device}") logger.info(f"Initializing VisionLanguageOCR with device: {self.device}")
@ -112,6 +138,19 @@ class VisionLanguageOCR:
"warnings": [...] "warnings": [...]
} }
""" """
# Try docuvision fast path first (skips heavy local VLM if available)
docuvision_text = _try_docuvision(image_path)
if docuvision_text is not None:
return {
"raw_text": docuvision_text,
"merchant": {},
"transaction": {},
"items": [],
"totals": {},
"confidence": {"overall": None},
"warnings": [],
}
self._load_model() self._load_model()
try: try:

View file

@ -2,8 +2,12 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
import os
from contextlib import nullcontext
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from openai import OpenAI
if TYPE_CHECKING: if TYPE_CHECKING:
from app.db.store import Store from app.db.store import Store
@ -113,76 +117,57 @@ class LLMRecipeGenerator:
return "\n".join(lines) return "\n".join(lines)
def _acquire_vram_lease(self) -> str | None: _MODEL_CANDIDATES: list[str] = ["Ouro-2.6B-Thinking", "Ouro-1.4B"]
"""Request a VRAM lease from the CF-core coordinator. Best-effort — returns None if unavailable."""
try:
import httpx
from app.core.config import settings
from app.tasks.runner import VRAM_BUDGETS
budget_mb = int(VRAM_BUDGETS.get("recipe_llm", 4.0) * 1024) def _get_llm_context(self):
coordinator = settings.COORDINATOR_URL """Return a sync context manager that yields an Allocation or None.
nodes_resp = httpx.get(f"{coordinator}/api/nodes", timeout=2.0) When CF_ORCH_URL is set, uses CFOrchClient to acquire a vLLM allocation
if nodes_resp.status_code != 200: (which handles service lifecycle and VRAM). Falls back to nullcontext(None)
return None when the env var is absent or CFOrchClient raises on construction.
nodes = nodes_resp.json().get("nodes", []) """
if not nodes: cf_orch_url = os.environ.get("CF_ORCH_URL")
return None if cf_orch_url:
try:
best_node = best_gpu = best_free = None from circuitforge_core.resources import CFOrchClient
for node in nodes: client = CFOrchClient(cf_orch_url)
for gpu in node.get("gpus", []): return client.allocate(
free = gpu.get("vram_free_mb", 0) service="vllm",
if best_free is None or free > best_free: model_candidates=self._MODEL_CANDIDATES,
best_node = node["node_id"] ttl_s=300.0,
best_gpu = gpu["gpu_id"] caller="kiwi-recipe",
best_free = free )
if best_node is None: except Exception as exc:
return None logger.debug("CFOrchClient init failed, falling back to direct URL: %s", exc)
return nullcontext(None)
lease_resp = httpx.post(
f"{coordinator}/api/leases",
json={
"node_id": best_node,
"gpu_id": best_gpu,
"mb": budget_mb,
"service": "kiwi",
"priority": 5,
},
timeout=3.0,
)
if lease_resp.status_code == 200:
lease_id = lease_resp.json()["lease"]["lease_id"]
logger.debug("Acquired VRAM lease %s for recipe_llm (%d MB)", lease_id, budget_mb)
return lease_id
except Exception as exc:
logger.debug("VRAM lease acquire failed (non-fatal): %s", exc)
return None
def _release_vram_lease(self, lease_id: str) -> None:
"""Release a VRAM lease. Best-effort."""
try:
import httpx
from app.core.config import settings
httpx.delete(f"{settings.COORDINATOR_URL}/api/leases/{lease_id}", timeout=3.0)
logger.debug("Released VRAM lease %s", lease_id)
except Exception as exc:
logger.debug("VRAM lease release failed (non-fatal): %s", exc)
def _call_llm(self, prompt: str) -> str: def _call_llm(self, prompt: str) -> str:
"""Call the LLM router with a VRAM lease held for the duration.""" """Call the LLM, using CFOrchClient allocation when CF_ORCH_URL is set.
lease_id = self._acquire_vram_lease()
With CF_ORCH_URL set: acquires a vLLM allocation via CFOrchClient and
calls the OpenAI-compatible API directly against the allocated service URL.
Without CF_ORCH_URL: falls back to LLMRouter using its configured backends.
"""
try: try:
from circuitforge_core.llm.router import LLMRouter with self._get_llm_context() as alloc:
router = LLMRouter() if alloc is not None:
return router.complete(prompt) base_url = alloc.url.rstrip("/") + "/v1"
client = OpenAI(base_url=base_url, api_key="any")
model = alloc.model or "__auto__"
if model == "__auto__":
model = client.models.list().data[0].id
resp = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
)
return resp.choices[0].message.content or ""
else:
from circuitforge_core.llm.router import LLMRouter
router = LLMRouter()
return router.complete(prompt)
except Exception as exc: except Exception as exc:
logger.error("LLM call failed: %s", exc) logger.error("LLM call failed: %s", exc)
return "" return ""
finally:
if lease_id:
self._release_vram_lease(lease_id)
def _parse_response(self, response: str) -> dict[str, str | list[str]]: def _parse_response(self, response: str) -> dict[str, str | list[str]]:
"""Parse LLM response text into structured recipe fields.""" """Parse LLM response text into structured recipe fields."""

View file

@ -1,6 +1,11 @@
"""Tests for LLMRecipeGenerator — prompt builders and allergy filtering.""" """Tests for LLMRecipeGenerator — prompt builders and allergy filtering."""
from __future__ import annotations from __future__ import annotations
import os
from contextlib import contextmanager
from dataclasses import dataclass
from unittest.mock import MagicMock, patch
import pytest import pytest
from app.models.schemas.recipe import RecipeRequest from app.models.schemas.recipe import RecipeRequest
@ -139,3 +144,82 @@ def test_generate_returns_result_when_llm_responds(monkeypatch):
assert len(suggestion.directions) > 0 assert len(suggestion.directions) > 0
assert "parmesan" in suggestion.notes.lower() assert "parmesan" in suggestion.notes.lower()
assert result.element_gaps == ["Brightness"] assert result.element_gaps == ["Brightness"]
# ---------------------------------------------------------------------------
# CFOrchClient integration tests
# ---------------------------------------------------------------------------
@dataclass
class _FakeAllocation:
    """Stand-in for a CFOrchClient allocation, with test-friendly defaults."""

    allocation_id: str = "alloc-test-1"
    service: str = "vllm"
    node_id: str = "node-1"
    gpu_id: int = 0
    model: str | None = "Ouro-2.6B-Thinking"  # concrete model => no "__auto__" models.list() lookup
    url: str = "http://test:8000"  # base service URL; _call_llm appends "/v1"
    started: bool = True
    warm: bool = True
def test_recipe_gen_uses_cf_orch_when_env_set(monkeypatch):
    """When CF_ORCH_URL is set, _call_llm uses alloc.url+/v1 as the OpenAI base_url."""
    from app.services.recipe.llm_recipe import LLMRecipeGenerator

    gen = LLMRecipeGenerator(_make_store())
    allocation = _FakeAllocation()

    @contextmanager
    def _stub_context():
        yield allocation

    seen: dict = {}

    class _RecordingOpenAI:
        """Fake OpenAI client: remembers its base_url, returns a canned reply."""

        def __init__(self, *, base_url, api_key):
            seen["base_url"] = base_url
            reply = MagicMock()
            reply.content = "Title: Test\nIngredients: a\nDirections: do it.\nNotes: none."
            wrapper = MagicMock()
            wrapper.message = reply
            completion = MagicMock()
            completion.choices = [wrapper]
            self.chat = MagicMock()
            self.chat.completions = MagicMock()
            self.chat.completions.create = MagicMock(return_value=completion)

    # Replace _get_llm_context on the instance so no real HTTP happens.
    monkeypatch.setattr(gen, "_get_llm_context", _stub_context)
    with patch("app.services.recipe.llm_recipe.OpenAI", _RecordingOpenAI):
        gen._call_llm("make me a recipe")

    assert seen.get("base_url") == "http://test:8000/v1"
def test_recipe_gen_falls_back_without_cf_orch(monkeypatch):
    """When CF_ORCH_URL is not set, _call_llm falls back to LLMRouter."""
    from app.services.recipe.llm_recipe import LLMRecipeGenerator

    gen = LLMRecipeGenerator(_make_store())
    monkeypatch.delenv("CF_ORCH_URL", raising=False)

    calls: dict = {}

    def _record_complete(prompt, **_ignored):
        calls["prompt"] = prompt
        return "Title: Direct\nIngredients: x\nDirections: go.\nNotes: ok."

    stub_router = MagicMock()
    stub_router.complete.side_effect = _record_complete

    # LLMRouter is imported lazily inside _call_llm, so patch it at its source.
    with patch("circuitforge_core.llm.router.LLMRouter", return_value=stub_router):
        gen._call_llm("direct path prompt")

    assert calls.get("prompt") == "direct path prompt"

View file

View file

@ -0,0 +1,87 @@
"""Tests for DocuvisionClient and the _try_docuvision fast path."""
from __future__ import annotations
import base64
from pathlib import Path
from unittest.mock import MagicMock, patch
import httpx
import pytest
from app.services.ocr.docuvision_client import DocuvisionClient, DocuvisionResult
# ---------------------------------------------------------------------------
# DocuvisionClient unit tests
# ---------------------------------------------------------------------------
def test_extract_text_sends_base64_image(tmp_path: Path) -> None:
    """extract_text() POSTs a base64-encoded image and returns parsed text."""
    image_file = tmp_path / "test.jpg"
    image_file.write_bytes(b"fake-image-bytes")

    fake_response = MagicMock()
    fake_response.json.return_value = {"text": "Cheerios", "confidence": 0.95}
    fake_response.raise_for_status.return_value = None

    with patch("httpx.Client") as client_cls:
        http = MagicMock()
        client_cls.return_value.__enter__.return_value = http
        http.post.return_value = fake_response

        result = DocuvisionClient("http://docuvision:8080").extract_text(image_file)

    assert result.text == "Cheerios"
    assert result.confidence == 0.95

    # Exactly one POST, to the /extract endpoint, carrying the base64 payload.
    http.post.assert_called_once()
    args, kwargs = http.post.call_args
    assert args[0] == "http://docuvision:8080/extract"
    assert kwargs["json"]["image"] == base64.b64encode(b"fake-image-bytes").decode()
def test_extract_text_raises_on_http_error(tmp_path: Path) -> None:
    """extract_text() propagates HTTP errors from the server."""
    image_file = tmp_path / "test.jpg"
    image_file.write_bytes(b"fake-image-bytes")

    failing_response = MagicMock()
    failing_response.raise_for_status.side_effect = httpx.HTTPStatusError(
        "500 Internal Server Error",
        request=MagicMock(),
        response=MagicMock(),
    )

    with patch("httpx.Client") as client_cls:
        http = MagicMock()
        client_cls.return_value.__enter__.return_value = http
        http.post.return_value = failing_response

        with pytest.raises(httpx.HTTPStatusError):
            DocuvisionClient("http://docuvision:8080").extract_text(image_file)
# ---------------------------------------------------------------------------
# _try_docuvision fast-path tests
# ---------------------------------------------------------------------------
def test_try_docuvision_returns_none_without_cf_orch_url(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """_try_docuvision() returns None immediately when CF_ORCH_URL is not set."""
    monkeypatch.delenv("CF_ORCH_URL", raising=False)

    # _try_docuvision reads CF_ORCH_URL at call time, so env order is safe.
    from app.services.ocr.vl_model import _try_docuvision

    with patch("httpx.Client") as client_cls:
        assert _try_docuvision(tmp_path / "test.jpg") is None
        # Early return means no HTTP client is ever constructed.
        client_cls.assert_not_called()