From b9c308ab28b00499fef5f8c7b7c0a4c7443333b6 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Thu, 2 Apr 2026 13:49:38 -0700 Subject: [PATCH] fix: docuvision fast-path falls through when parse yields no items MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _parse_json_from_text always returns a dict (never None), so the previous `if parsed is not None` guard was permanently true — garbled docuvision output would return an empty skeleton instead of falling through to the local VLM. Replace the check with a meaningful-content test (items or merchant present). Add two tests: one that asserts the fallthrough behavior on an empty parse, one that confirms the fast path is taken when parsing succeeds. --- app/services/ocr/vl_model.py | 7 +- tests/test_services/test_docuvision_client.py | 117 ++++++++++++++++++ 2 files changed, 122 insertions(+), 2 deletions(-) diff --git a/app/services/ocr/vl_model.py b/app/services/ocr/vl_model.py index 737a415..f7580ca 100644 --- a/app/services/ocr/vl_model.py +++ b/app/services/ocr/vl_model.py @@ -143,10 +143,13 @@ class VisionLanguageOCR: docuvision_text = _try_docuvision(image_path) if docuvision_text is not None: parsed = self._parse_json_from_text(docuvision_text) - if parsed is not None: + # Only accept the docuvision result if it yielded meaningful content; + # an empty-skeleton dict (no items, no merchant) means the text was + # garbled and we should fall through to the local VLM instead. 
+ if parsed.get("items") or parsed.get("merchant"): parsed["raw_text"] = docuvision_text return self._validate_result(parsed) - # If parsing fails, fall through to local VLM + # Parsed result has no meaningful content — fall through to local VLM self._load_model() diff --git a/tests/test_services/test_docuvision_client.py b/tests/test_services/test_docuvision_client.py index bba0b70..32df008 100644 --- a/tests/test_services/test_docuvision_client.py +++ b/tests/test_services/test_docuvision_client.py @@ -85,3 +85,120 @@ def test_try_docuvision_returns_none_without_cf_orch_url( assert result is None mock_client_cls.assert_not_called() + + +# --------------------------------------------------------------------------- +# extract_receipt_data docuvision fast-path fallthrough tests +# --------------------------------------------------------------------------- + + +def test_extract_receipt_data_falls_through_when_docuvision_yields_empty_parse( + tmp_path: Path, monkeypatch: pytest.MonkeyPatch +) -> None: + """When docuvision returns garbled text that parses to an empty structure, + extract_receipt_data must fall through to the local VLM rather than + returning an empty skeleton dict as a successful result.""" + from app.services.ocr.vl_model import VisionLanguageOCR + + vlm = VisionLanguageOCR() + + # Simulate docuvision returning some text that cannot be meaningfully parsed + garbled_text = "not valid json at all @@##!!" 
+ + local_vlm_result = { + "merchant": {"name": "Whole Foods"}, + "transaction": {}, + "items": [{"name": "Milk", "quantity": 1, "unit_price": 3.99, "total_price": 3.99}], + "totals": {"total": 3.99}, + "confidence": {"overall": 0.9}, + "raw_text": "Whole Foods\nMilk $3.99", + } + + with ( + patch("app.services.ocr.vl_model._try_docuvision", return_value=garbled_text), + patch.object(vlm, "_load_model"), + patch.object(vlm, "_parse_json_from_text", wraps=vlm._parse_json_from_text) as spy_parse, + patch.object(vlm, "_validate_result", side_effect=lambda r: r) as mock_validate, + ): + # Drive the local-VLM branch without real inference. The model and + # processor are stubbed further down; here we replace the second call + # to _parse_json_from_text (the one made by the local-VLM branch) + # with the known good result. + call_count = {"n": 0} + original_parse = vlm._parse_json_from_text.__wrapped__ if hasattr( + vlm._parse_json_from_text, "__wrapped__" + ) else None + + def _fake_parse(text: str) -> dict: + call_count["n"] += 1 + if call_count["n"] == 1: + # First call: docuvision path — return the real (empty) result + return vlm.__class__._parse_json_from_text(vlm, text) + # Second call: local VLM path — return populated result + return local_vlm_result + + spy_parse.side_effect = _fake_parse + + # Also stub the model inference bits so we don't need a real GPU + from unittest.mock import MagicMock + import torch + + vlm._model_loaded = True + vlm.model = MagicMock() + vlm.processor = MagicMock() + vlm.processor.return_value = {} + vlm.processor.decode.return_value = "Whole Foods\nMilk $3.99" + vlm.processor.tokenizer = MagicMock() + vlm.model.generate.return_value = [torch.tensor([1, 2, 3])] + + # Provide a minimal image file + img_path = tmp_path / "receipt.jpg" + from PIL import Image as PILImage + + img = PILImage.new("RGB", (10, 10), color=(255, 255, 255)) + img.save(img_path) + + result = vlm.extract_receipt_data(str(img_path)) 
+ + # The result must NOT be the empty skeleton — it should come from the local VLM path + assert result.get("merchant") or result.get("items"), ( + "extract_receipt_data returned an empty skeleton instead of falling " + "through to the local VLM when docuvision parse yielded no content" + ) + # parse was called at least twice (once for docuvision, once for local VLM) + assert call_count["n"] >= 2, ( + "Expected _parse_json_from_text to be called for both the docuvision " + f"path and the local VLM path, but it was called {call_count['n']} time(s)" + ) + + +def test_extract_receipt_data_uses_docuvision_when_parse_succeeds( + tmp_path: Path, +) -> None: + """When docuvision returns text that yields meaningful parsed content, + extract_receipt_data must return that result and skip the local VLM.""" + from app.services.ocr.vl_model import VisionLanguageOCR + + vlm = VisionLanguageOCR() + + populated_parse = { + "merchant": {"name": "Target"}, + "transaction": {}, + "items": [{"name": "Shampoo", "quantity": 1, "unit_price": 5.99, "total_price": 5.99}], + "totals": {"total": 5.99}, + "confidence": {"overall": 0.88}, + } + docuvision_text = '{"merchant": {"name": "Target"}, "items": [...]}' + + with ( + patch("app.services.ocr.vl_model._try_docuvision", return_value=docuvision_text), + patch.object(vlm, "_parse_json_from_text", return_value=populated_parse), + patch.object(vlm, "_validate_result", side_effect=lambda r: r), + patch.object(vlm, "_load_model") as mock_load, + ): + result = vlm.extract_receipt_data(str(tmp_path / "receipt.jpg")) + + # Local VLM should NOT have been loaded — docuvision fast path handled it + mock_load.assert_not_called() + assert result["merchant"]["name"] == "Target" + assert result["raw_text"] == docuvision_text