# cf-vision/cf_vision/router.py

# cf_vision/router.py — VisionRouter, the primary consumer API
#
# BSL 1.1 when real inference models are integrated (Dolphin-v2, Claude vision).
# Currently a stub: analyze() raises NotImplementedError unless mock=True.
from __future__ import annotations

import os
from typing import Literal

from cf_vision.models import ImageFrame, ImageElement, BoundingBox

# Canned elements for mock mode: VisionRouter._mock_frame returns these for
# every task other than "barcode", so consumers can be exercised without a
# vision model installed. The bracketed text makes the synthetic origin
# obvious if it ever leaks into real output.
# NOTE(review): bbox values look like fractions of the page size (0-1) rather
# than pixels — confirm against BoundingBox in cf_vision.models.
_MOCK_ELEMENTS = [
    # Title block near the top of the page.
    ImageElement(
        element_type="title",
        text="[Mock document title]",
        confidence=0.99,
        bbox=BoundingBox(x=0.05, y=0.02, width=0.9, height=0.06),
    ),
    # Single body paragraph placed below the title.
    ImageElement(
        element_type="plain_text",
        text="[Mock paragraph — real content requires cf-vision[inference] and a vision model.]",
        confidence=0.95,
        bbox=BoundingBox(x=0.05, y=0.12, width=0.9, height=0.08),
    ),
]


class VisionRouter:
    """
    Routes image analysis requests to local or cloud vision models.

    Local models (Free tier):
      - Dolphin-v2 (ByteDance) — universal document parser, 21 element types
      - pyzbar — barcode / QR code scanning (no GPU required)

    Cloud fallback (Paid tier):
      - Claude vision API — general-purpose image understanding

    Only the mock path is implemented so far: with mock mode off, analyze()
    raises NotImplementedError.

    Usage
    -----
        router = VisionRouter.from_env()
        frame = router.analyze(image_bytes, task="document")
        for element in frame.elements:
            print(element.element_type, element.text)
    """

    # CF_VISION_MOCK values that switch mock mode on. The raw value is
    # stripped and lower-cased before the lookup, so "TRUE" and " 1 " match.
    _TRUTHY_ENV_VALUES = frozenset({"1", "true", "yes", "on"})

    def __init__(
        self,
        mock: bool = False,
        device: str = "auto",
    ) -> None:
        """
        Store the router configuration; no model is loaded here.

        mock:
          When True, analyze() returns synthetic frames instead of raising.
        device:
          Device selector, stored as given. The stub does not read it yet.
        """
        self._mock = mock
        self._device = device

    @classmethod
    def from_env(cls) -> "VisionRouter":
        """
        Construct from CF_VISION_MOCK and CF_VISION_DEVICE env vars.

        CF_VISION_MOCK enables mock mode when it is "1", "true", "yes" or
        "on" (case-insensitive, surrounding whitespace ignored); anything
        else, including unset, leaves mock mode off.

        CF_VISION_DEVICE is passed through as the device; unset, empty or
        whitespace-only falls back to "auto".
        """
        raw_mock = os.environ.get("CF_VISION_MOCK", "")
        mock = raw_mock.strip().lower() in cls._TRUTHY_ENV_VALUES
        # `or "auto"` also covers a variable that is set but blank.
        device = os.environ.get("CF_VISION_DEVICE", "").strip() or "auto"
        return cls(mock=mock, device=device)

    def analyze(
        self,
        image_bytes: bytes,
        task: Literal["document", "barcode", "receipt", "general"] = "document",
        prompt: str = "",
    ) -> ImageFrame:
        """
        Analyse image_bytes and return a structured ImageFrame.

        task:
          "document"  — full document parsing via Dolphin-v2 (all 21 element types)
          "barcode"   — barcode / QR code extraction via pyzbar (lightweight)
          "receipt"   — receipt line-item extraction (Dolphin-v2 + post-processing)
          "general"   — general image understanding via Claude vision (cloud, Paid tier)

        prompt:
          Accepted for forward compatibility; the stub does not read it.

        Returns a synthetic ImageFrame when the router is in mock mode.

        Stub: raises NotImplementedError unless the router is in mock mode
        (mock=True, or CF_VISION_MOCK enabled when built via from_env()).
        Real implementation lands with Kiwi Phase 2 (cf_vision.ocr, cf_vision.barcode).
        """
        if self._mock:
            return self._mock_frame(image_bytes, task)

        raise NotImplementedError(
            "VisionRouter real inference is not yet implemented. "
            "Set CF_VISION_MOCK=1 or mock=True to use synthetic frames. "
            "Real analysis requires: pip install cf-vision[inference]"
        )

    def _mock_frame(self, image_bytes: bytes, task: str) -> ImageFrame:
        """
        Build a synthetic ImageFrame for the given task.

        "barcode" yields a single EAN13 barcode element; every other task
        yields a copy of the canned document elements. image_bytes is not
        inspected and is not stored on the returned frame.
        """
        # Local import keeps this block self-contained without touching the
        # file-level import list.
        import copy

        if task == "barcode":
            elements = [
                ImageElement(
                    element_type="barcode",
                    text="0123456789012",
                    confidence=0.99,
                    metadata={"format": "EAN13"},
                )
            ]
        else:
            # Deep copy so that a caller mutating a returned element cannot
            # alter the module-level template shared by later mock frames.
            elements = copy.deepcopy(_MOCK_ELEMENTS)
        return ImageFrame(
            source="mock",
            image_bytes=None,
            elements=elements,
            model="mock",
        )