feat: add DocuvisionClient + cf-docuvision fast-path for OCR

Introduces a thin HTTP client for the cf-docuvision service and wires it
as a fast path in VisionLanguageOCR.extract_receipt_data(). When CF_ORCH_URL
is set, the pipeline attempts docuvision allocation via CFOrchClient before
loading the heavy local VLM; falls back gracefully if unavailable.
This commit is contained in:
pyr0ball 2026-04-02 12:33:05 -07:00
parent b9eadcdf0e
commit 22e57118df
6 changed files with 318 additions and 63 deletions

View file

@ -0,0 +1,60 @@
"""Thin HTTP client for the cf-docuvision document vision service."""
from __future__ import annotations

import asyncio
import base64
from dataclasses import dataclass
from pathlib import Path

import httpx
@dataclass
class DocuvisionResult:
    """Parsed result of one docuvision text-extraction call."""

    text: str  # extracted document text ("" when the service returned none)
    confidence: float | None = None  # service-reported confidence, if provided
    raw: dict | None = None  # full JSON response payload, kept for debugging
class DocuvisionClient:
"""Thin client for the cf-docuvision service."""
def __init__(self, base_url: str) -> None:
self._base_url = base_url.rstrip("/")
def extract_text(self, image_path: str | Path) -> DocuvisionResult:
"""Send an image to docuvision and return extracted text."""
image_bytes = Path(image_path).read_bytes()
b64 = base64.b64encode(image_bytes).decode()
with httpx.Client(timeout=30.0) as client:
resp = client.post(
f"{self._base_url}/extract",
json={"image": b64},
)
resp.raise_for_status()
data = resp.json()
return DocuvisionResult(
text=data.get("text", ""),
confidence=data.get("confidence"),
raw=data,
)
async def extract_text_async(self, image_path: str | Path) -> DocuvisionResult:
"""Async version."""
image_bytes = Path(image_path).read_bytes()
b64 = base64.b64encode(image_bytes).decode()
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(
f"{self._base_url}/extract",
json={"image": b64},
)
resp.raise_for_status()
data = resp.json()
return DocuvisionResult(
text=data.get("text", ""),
confidence=data.get("confidence"),
raw=data,
)

View file

@ -8,6 +8,7 @@ OCR with understanding of receipt structure to extract structured JSON data.
import json import json
import logging import logging
import os
import re import re
from pathlib import Path from pathlib import Path
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List
@ -26,6 +27,31 @@ from app.core.config import settings
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def _try_docuvision(image_path: str | Path) -> str | None:
"""Try to extract text via cf-docuvision. Returns None if unavailable."""
cf_orch_url = os.environ.get("CF_ORCH_URL")
if not cf_orch_url:
return None
try:
from circuitforge_core.resources import CFOrchClient
from app.services.ocr.docuvision_client import DocuvisionClient
client = CFOrchClient(cf_orch_url)
with client.allocate(
service="cf-docuvision",
model_candidates=[], # cf-docuvision has no model selection
ttl_s=60.0,
caller="kiwi-ocr",
) as alloc:
if alloc is None:
return None
doc_client = DocuvisionClient(alloc.url)
result = doc_client.extract_text(image_path)
return result.text if result.text else None
except Exception:
return None # graceful degradation
class VisionLanguageOCR: class VisionLanguageOCR:
"""Vision-Language Model for receipt OCR and structured extraction.""" """Vision-Language Model for receipt OCR and structured extraction."""
@ -40,7 +66,7 @@ class VisionLanguageOCR:
self.processor = None self.processor = None
self.device = "cuda" if torch.cuda.is_available() and settings.USE_GPU else "cpu" self.device = "cuda" if torch.cuda.is_available() and settings.USE_GPU else "cpu"
self.use_quantization = use_quantization self.use_quantization = use_quantization
self.model_name = "Qwen/Qwen2-VL-2B-Instruct" self.model_name = "Qwen/Qwen2.5-VL-7B-Instruct"
logger.info(f"Initializing VisionLanguageOCR with device: {self.device}") logger.info(f"Initializing VisionLanguageOCR with device: {self.device}")
@ -112,6 +138,19 @@ class VisionLanguageOCR:
"warnings": [...] "warnings": [...]
} }
""" """
# Try docuvision fast path first (skips heavy local VLM if available)
docuvision_text = _try_docuvision(image_path)
if docuvision_text is not None:
return {
"raw_text": docuvision_text,
"merchant": {},
"transaction": {},
"items": [],
"totals": {},
"confidence": {"overall": None},
"warnings": [],
}
self._load_model() self._load_model()
try: try:

View file

@ -2,8 +2,12 @@
from __future__ import annotations from __future__ import annotations
import logging import logging
import os
from contextlib import nullcontext
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from openai import OpenAI
if TYPE_CHECKING: if TYPE_CHECKING:
from app.db.store import Store from app.db.store import Store
@ -113,76 +117,57 @@ class LLMRecipeGenerator:
return "\n".join(lines) return "\n".join(lines)
def _acquire_vram_lease(self) -> str | None: _MODEL_CANDIDATES: list[str] = ["Ouro-2.6B-Thinking", "Ouro-1.4B"]
"""Request a VRAM lease from the CF-core coordinator. Best-effort — returns None if unavailable."""
try:
import httpx
from app.core.config import settings
from app.tasks.runner import VRAM_BUDGETS
budget_mb = int(VRAM_BUDGETS.get("recipe_llm", 4.0) * 1024) def _get_llm_context(self):
coordinator = settings.COORDINATOR_URL """Return a sync context manager that yields an Allocation or None.
nodes_resp = httpx.get(f"{coordinator}/api/nodes", timeout=2.0) When CF_ORCH_URL is set, uses CFOrchClient to acquire a vLLM allocation
if nodes_resp.status_code != 200: (which handles service lifecycle and VRAM). Falls back to nullcontext(None)
return None when the env var is absent or CFOrchClient raises on construction.
nodes = nodes_resp.json().get("nodes", []) """
if not nodes: cf_orch_url = os.environ.get("CF_ORCH_URL")
return None if cf_orch_url:
try:
best_node = best_gpu = best_free = None from circuitforge_core.resources import CFOrchClient
for node in nodes: client = CFOrchClient(cf_orch_url)
for gpu in node.get("gpus", []): return client.allocate(
free = gpu.get("vram_free_mb", 0) service="vllm",
if best_free is None or free > best_free: model_candidates=self._MODEL_CANDIDATES,
best_node = node["node_id"] ttl_s=300.0,
best_gpu = gpu["gpu_id"] caller="kiwi-recipe",
best_free = free )
if best_node is None: except Exception as exc:
return None logger.debug("CFOrchClient init failed, falling back to direct URL: %s", exc)
return nullcontext(None)
lease_resp = httpx.post(
f"{coordinator}/api/leases",
json={
"node_id": best_node,
"gpu_id": best_gpu,
"mb": budget_mb,
"service": "kiwi",
"priority": 5,
},
timeout=3.0,
)
if lease_resp.status_code == 200:
lease_id = lease_resp.json()["lease"]["lease_id"]
logger.debug("Acquired VRAM lease %s for recipe_llm (%d MB)", lease_id, budget_mb)
return lease_id
except Exception as exc:
logger.debug("VRAM lease acquire failed (non-fatal): %s", exc)
return None
def _release_vram_lease(self, lease_id: str) -> None:
"""Release a VRAM lease. Best-effort."""
try:
import httpx
from app.core.config import settings
httpx.delete(f"{settings.COORDINATOR_URL}/api/leases/{lease_id}", timeout=3.0)
logger.debug("Released VRAM lease %s", lease_id)
except Exception as exc:
logger.debug("VRAM lease release failed (non-fatal): %s", exc)
def _call_llm(self, prompt: str) -> str: def _call_llm(self, prompt: str) -> str:
"""Call the LLM router with a VRAM lease held for the duration.""" """Call the LLM, using CFOrchClient allocation when CF_ORCH_URL is set.
lease_id = self._acquire_vram_lease()
With CF_ORCH_URL set: acquires a vLLM allocation via CFOrchClient and
calls the OpenAI-compatible API directly against the allocated service URL.
Without CF_ORCH_URL: falls back to LLMRouter using its configured backends.
"""
try: try:
from circuitforge_core.llm.router import LLMRouter with self._get_llm_context() as alloc:
router = LLMRouter() if alloc is not None:
return router.complete(prompt) base_url = alloc.url.rstrip("/") + "/v1"
client = OpenAI(base_url=base_url, api_key="any")
model = alloc.model or "__auto__"
if model == "__auto__":
model = client.models.list().data[0].id
resp = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": prompt}],
)
return resp.choices[0].message.content or ""
else:
from circuitforge_core.llm.router import LLMRouter
router = LLMRouter()
return router.complete(prompt)
except Exception as exc: except Exception as exc:
logger.error("LLM call failed: %s", exc) logger.error("LLM call failed: %s", exc)
return "" return ""
finally:
if lease_id:
self._release_vram_lease(lease_id)
def _parse_response(self, response: str) -> dict[str, str | list[str]]: def _parse_response(self, response: str) -> dict[str, str | list[str]]:
"""Parse LLM response text into structured recipe fields.""" """Parse LLM response text into structured recipe fields."""

View file

@ -1,6 +1,11 @@
"""Tests for LLMRecipeGenerator — prompt builders and allergy filtering.""" """Tests for LLMRecipeGenerator — prompt builders and allergy filtering."""
from __future__ import annotations from __future__ import annotations
import os
from contextlib import contextmanager
from dataclasses import dataclass
from unittest.mock import MagicMock, patch
import pytest import pytest
from app.models.schemas.recipe import RecipeRequest from app.models.schemas.recipe import RecipeRequest
@ -139,3 +144,82 @@ def test_generate_returns_result_when_llm_responds(monkeypatch):
assert len(suggestion.directions) > 0 assert len(suggestion.directions) > 0
assert "parmesan" in suggestion.notes.lower() assert "parmesan" in suggestion.notes.lower()
assert result.element_gaps == ["Brightness"] assert result.element_gaps == ["Brightness"]
# ---------------------------------------------------------------------------
# CFOrchClient integration tests
# ---------------------------------------------------------------------------
@dataclass
class _FakeAllocation:
    """Stand-in for a CFOrchClient allocation, with test-friendly defaults."""

    allocation_id: str = "alloc-test-1"
    service: str = "vllm"
    node_id: str = "node-1"
    gpu_id: int = 0
    model: str | None = "Ouro-2.6B-Thinking"  # concrete model => no "__auto__" models.list() lookup
    url: str = "http://test:8000"  # base service URL; _call_llm appends "/v1"
    started: bool = True
    warm: bool = True
def test_recipe_gen_uses_cf_orch_when_env_set(monkeypatch):
    """When CF_ORCH_URL is set, _call_llm uses alloc.url+/v1 as the OpenAI base_url."""
    from app.services.recipe.llm_recipe import LLMRecipeGenerator

    gen = LLMRecipeGenerator(_make_store())
    allocation = _FakeAllocation()

    @contextmanager
    def _stub_context():
        yield allocation

    seen: dict = {}

    class _RecordingOpenAI:
        """Fake OpenAI client: remembers its base_url, returns a canned reply."""

        def __init__(self, *, base_url, api_key):
            seen["base_url"] = base_url
            reply = MagicMock()
            reply.content = "Title: Test\nIngredients: a\nDirections: do it.\nNotes: none."
            wrapper = MagicMock()
            wrapper.message = reply
            completion = MagicMock()
            completion.choices = [wrapper]
            self.chat = MagicMock()
            self.chat.completions = MagicMock()
            self.chat.completions.create = MagicMock(return_value=completion)

    # Replace _get_llm_context on the instance so no real HTTP happens.
    monkeypatch.setattr(gen, "_get_llm_context", _stub_context)
    with patch("app.services.recipe.llm_recipe.OpenAI", _RecordingOpenAI):
        gen._call_llm("make me a recipe")

    assert seen.get("base_url") == "http://test:8000/v1"
def test_recipe_gen_falls_back_without_cf_orch(monkeypatch):
    """When CF_ORCH_URL is not set, _call_llm falls back to LLMRouter."""
    from app.services.recipe.llm_recipe import LLMRecipeGenerator

    gen = LLMRecipeGenerator(_make_store())
    monkeypatch.delenv("CF_ORCH_URL", raising=False)

    calls: dict = {}

    def _record_complete(prompt, **_ignored):
        calls["prompt"] = prompt
        return "Title: Direct\nIngredients: x\nDirections: go.\nNotes: ok."

    stub_router = MagicMock()
    stub_router.complete.side_effect = _record_complete

    # LLMRouter is imported lazily inside _call_llm, so patch it at its source.
    with patch("circuitforge_core.llm.router.LLMRouter", return_value=stub_router):
        gen._call_llm("direct path prompt")

    assert calls.get("prompt") == "direct path prompt"

View file

View file

@ -0,0 +1,87 @@
"""Tests for DocuvisionClient and the _try_docuvision fast path."""
from __future__ import annotations
import base64
from pathlib import Path
from unittest.mock import MagicMock, patch
import httpx
import pytest
from app.services.ocr.docuvision_client import DocuvisionClient, DocuvisionResult
# ---------------------------------------------------------------------------
# DocuvisionClient unit tests
# ---------------------------------------------------------------------------
def test_extract_text_sends_base64_image(tmp_path: Path) -> None:
    """extract_text() POSTs a base64-encoded image and returns parsed text."""
    image_file = tmp_path / "test.jpg"
    image_file.write_bytes(b"fake-image-bytes")

    fake_response = MagicMock()
    fake_response.json.return_value = {"text": "Cheerios", "confidence": 0.95}
    fake_response.raise_for_status.return_value = None

    with patch("httpx.Client") as client_cls:
        http = MagicMock()
        client_cls.return_value.__enter__.return_value = http
        http.post.return_value = fake_response

        result = DocuvisionClient("http://docuvision:8080").extract_text(image_file)

    assert result.text == "Cheerios"
    assert result.confidence == 0.95

    # Exactly one POST, to the /extract endpoint, carrying the base64 payload.
    http.post.assert_called_once()
    args, kwargs = http.post.call_args
    assert args[0] == "http://docuvision:8080/extract"
    assert kwargs["json"]["image"] == base64.b64encode(b"fake-image-bytes").decode()
def test_extract_text_raises_on_http_error(tmp_path: Path) -> None:
    """extract_text() propagates HTTP errors from the server."""
    image_file = tmp_path / "test.jpg"
    image_file.write_bytes(b"fake-image-bytes")

    failing_response = MagicMock()
    failing_response.raise_for_status.side_effect = httpx.HTTPStatusError(
        "500 Internal Server Error",
        request=MagicMock(),
        response=MagicMock(),
    )

    with patch("httpx.Client") as client_cls:
        http = MagicMock()
        client_cls.return_value.__enter__.return_value = http
        http.post.return_value = failing_response

        with pytest.raises(httpx.HTTPStatusError):
            DocuvisionClient("http://docuvision:8080").extract_text(image_file)
# ---------------------------------------------------------------------------
# _try_docuvision fast-path tests
# ---------------------------------------------------------------------------
def test_try_docuvision_returns_none_without_cf_orch_url(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """_try_docuvision() returns None immediately when CF_ORCH_URL is not set."""
    monkeypatch.delenv("CF_ORCH_URL", raising=False)

    # _try_docuvision reads CF_ORCH_URL at call time, so env order is safe.
    from app.services.ocr.vl_model import _try_docuvision

    with patch("httpx.Client") as client_cls:
        assert _try_docuvision(tmp_path / "test.jpg") is None
        # Early return means no HTTP client is ever constructed.
        client_cls.assert_not_called()