From b9c308ab28b00499fef5f8c7b7c0a4c7443333b6 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Thu, 2 Apr 2026 13:49:38 -0700
Subject: [PATCH] fix: docuvision fast-path falls through when parse yields no
 items
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_parse_json_from_text always returns a dict (never None), so the
previous `if parsed is not None` guard was permanently true — garbled
docuvision output would return an empty skeleton instead of falling
through to the local VLM. Replace the check with a meaningful-content
test (items or merchant present). Add two tests: one that asserts the
fallthrough behavior on an empty parse, one that confirms the fast path
is taken when parsing succeeds.
---
 app/services/ocr/vl_model.py                  |   7 +-
 tests/test_services/test_docuvision_client.py | 117 ++++++++++++++++++
 2 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/app/services/ocr/vl_model.py b/app/services/ocr/vl_model.py
index 737a415..f7580ca 100644
--- a/app/services/ocr/vl_model.py
+++ b/app/services/ocr/vl_model.py
@@ -143,10 +143,13 @@ class VisionLanguageOCR:
         docuvision_text = _try_docuvision(image_path)
         if docuvision_text is not None:
             parsed = self._parse_json_from_text(docuvision_text)
-            if parsed is not None:
+            # Only accept the docuvision result if it yielded meaningful content;
+            # an empty-skeleton dict (no items, no merchant) means the text was
+            # garbled and we should fall through to the local VLM instead.
+            if parsed.get("items") or parsed.get("merchant"):
                 parsed["raw_text"] = docuvision_text
                 return self._validate_result(parsed)
-            # If parsing fails, fall through to local VLM
+            # Parsed result has no meaningful content — fall through to local VLM
 
         self._load_model()
 
diff --git a/tests/test_services/test_docuvision_client.py b/tests/test_services/test_docuvision_client.py
index bba0b70..32df008 100644
--- a/tests/test_services/test_docuvision_client.py
+++ b/tests/test_services/test_docuvision_client.py
@@ -85,3 +85,120 @@ def test_try_docuvision_returns_none_without_cf_orch_url(
 
     assert result is None
     mock_client_cls.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# extract_receipt_data docuvision fast-path fallthrough tests
+# ---------------------------------------------------------------------------
+
+
+def test_extract_receipt_data_falls_through_when_docuvision_yields_empty_parse(
+    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
+) -> None:
+    """When docuvision returns garbled text that parses to an empty structure,
+    extract_receipt_data must fall through to the local VLM rather than
+    returning an empty skeleton dict as a successful result."""
+    from app.services.ocr.vl_model import VisionLanguageOCR
+
+    vlm = VisionLanguageOCR()
+
+    # Simulate docuvision returning some text that cannot be meaningfully parsed
+    garbled_text = "not valid json at all @@##!!"
+
+    local_vlm_result = {
+        "merchant": {"name": "Whole Foods"},
+        "transaction": {},
+        "items": [{"name": "Milk", "quantity": 1, "unit_price": 3.99, "total_price": 3.99}],
+        "totals": {"total": 3.99},
+        "confidence": {"overall": 0.9},
+        "raw_text": "Whole Foods\nMilk $3.99",
+    }
+
+    with (
+        patch("app.services.ocr.vl_model._try_docuvision", return_value=garbled_text),
+        patch.object(vlm, "_load_model"),
+        patch.object(vlm, "_parse_json_from_text") as spy_parse,
+        patch.object(vlm, "_validate_result", side_effect=lambda r: r),
+    ):
+        # Route the parser by call order so no real model output is needed:
+        # the first call (docuvision branch) runs the real parser on the
+        # garbled text, and every later call (local-VLM branch) returns the
+        # populated result. The real parser is reached through the class
+        # because the instance attribute is the mock inside this block.
+        # A dict holds the counter so the nested function can mutate it
+        # without a ``nonlocal`` declaration.
+        call_count = {"n": 0}
+
+        def _fake_parse(text: str) -> dict:
+            call_count["n"] += 1
+            if call_count["n"] == 1:
+                # First call: docuvision path — return the real (empty) result
+                return vlm.__class__._parse_json_from_text(vlm, text)
+            # Second call: local VLM path — return populated result
+            return local_vlm_result
+
+        spy_parse.side_effect = _fake_parse
+
+        # Also stub the model inference bits so we don't need a real GPU
+        from unittest.mock import MagicMock
+        import torch
+
+        vlm._model_loaded = True
+        vlm.model = MagicMock()
+        vlm.processor = MagicMock()
+        vlm.processor.return_value = {}
+        vlm.processor.decode.return_value = "Whole Foods\nMilk $3.99"
+        vlm.processor.tokenizer = MagicMock()
+        vlm.model.generate.return_value = [torch.tensor([1, 2, 3])]
+
+        # Provide a minimal image file
+        img_path = tmp_path / "receipt.jpg"
+        from PIL import Image as PILImage
+
+        img = PILImage.new("RGB", (10, 10), color=(255, 255, 255))
+        img.save(img_path)
+
+        result = vlm.extract_receipt_data(str(img_path))
+
+    # The result must NOT be the empty skeleton — it should come from the local VLM path
+    assert result.get("merchant") or result.get("items"), (
+        "extract_receipt_data returned an empty skeleton instead of falling "
+        "through to the local VLM when docuvision parse yielded no content"
+    )
+    # parse was called at least twice (once for docuvision, once for local VLM)
+    assert call_count["n"] >= 2, (
+        "Expected _parse_json_from_text to be called for both the docuvision "
+        f"path and the local VLM path, but it was called {call_count['n']} time(s)"
+    )
+
+
+def test_extract_receipt_data_uses_docuvision_when_parse_succeeds(
+    tmp_path: Path,
+) -> None:
+    """A docuvision response that parses into real content is returned
+    directly, and the local VLM is never loaded."""
+    from app.services.ocr.vl_model import VisionLanguageOCR
+
+    # The parser is mocked below, so this text only has to be a non-None string.
+    fast_path_text = '{"merchant": {"name": "Target"}, "items": [...]}'
+    parsed_receipt = {
+        "merchant": {"name": "Target"},
+        "transaction": {},
+        "items": [{"name": "Shampoo", "quantity": 1, "unit_price": 5.99, "total_price": 5.99}],
+        "totals": {"total": 5.99},
+        "confidence": {"overall": 0.88},
+    }
+    image_path = str(tmp_path / "receipt.jpg")
+    ocr = VisionLanguageOCR()
+
+    with (
+        patch("app.services.ocr.vl_model._try_docuvision", return_value=fast_path_text),
+        patch.object(ocr, "_load_model") as load_spy,
+        patch.object(ocr, "_parse_json_from_text", return_value=parsed_receipt),
+        patch.object(ocr, "_validate_result", side_effect=lambda payload: payload),
+    ):
+        extracted = ocr.extract_receipt_data(image_path)
+
+    assert extracted["raw_text"] == fast_path_text
+    assert extracted["merchant"]["name"] == "Target"
+    load_spy.assert_not_called()