- cf_vision/models.py: ImageFrame + ImageElement + BoundingBox (MIT) Full Dolphin-v2 element taxonomy (21 types), convenience accessors (text_blocks, barcodes, tables, full_text) - cf_vision/router.py: VisionRouter — mock + real paths, task routing (document, barcode, receipt, general) - cf_vision/barcode.py: BarcodeScanner — pyzbar wrapper, CPU-only, MIT - cf_vision/ocr.py: DolphinOCR — ByteDance/Dolphin-v2 async stub (BSL 1.1) - cf_vision/receipt.py: ReceiptParser stub — Kiwi Phase 2 target (BSL 1.1) - cf_vision/camera.py: CameraCapture — OpenCV single-frame capture (MIT) - pyproject.toml: inference / barcode / camera optional extras - .env.example: HF_TOKEN, CF_VISION_DEVICE, CF_VISION_MOCK - README: module map, ImageFrame API reference, consumer roadmap - tests: 6 passing (ImageFrame accessors, VisionRouter mock/real) Extracted from circuitforge_core.vision per cf-core#36.
109 lines · 3.5 KiB · Python
# cf_vision/ocr.py — Dolphin-v2 document parser
#
# BSL 1.1: real inference. Requires [inference] extras + ~8GB VRAM.
# Stub: raises NotImplementedError until Kiwi Phase 2 wires in the model.
#
# Model: ByteDance/Dolphin-v2
# Handles 21 element types: title, plain_text, table, figure, barcode,
# handwriting, formula, signature, watermark, and more.
# Reference: https://huggingface.co/ByteDance/Dolphin-v2
from __future__ import annotations

import asyncio
import logging
import os
from functools import partial

from cf_vision.models import ImageFrame

# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)

# HuggingFace repo ID for the Dolphin checkpoint.
# NOTE(review): surrounding docs say "Dolphin-v2" but the repo ID has no
# "-v2" suffix — confirm which checkpoint is intended.
_DOLPHIN_MODEL_ID = "ByteDance/Dolphin"


class DolphinOCR:
|
||
"""
|
||
Async wrapper around Dolphin-v2 for structured document parsing.
|
||
|
||
Loads the model lazily on first call. Runs in a thread pool executor
|
||
so it never blocks the asyncio event loop (~200ms–2s per page on A100).
|
||
|
||
Usage
|
||
-----
|
||
ocr = DolphinOCR.from_env()
|
||
frame = await ocr.parse_async(image_bytes)
|
||
for element in frame.elements:
|
||
print(element.element_type, element.text[:80])
|
||
|
||
Navigation note: Dolphin-v2 returns elements in reading order
|
||
(top-to-bottom, left-to-right). Use ImageFrame.full_text() for a
|
||
plain concatenation or iterate elements for structured access.
|
||
|
||
Consumer roadmap:
|
||
Kiwi Phase 2 — receipt line-item extraction
|
||
Peregrine — resume document parsing
|
||
Falcon — government form scanning
|
||
Godwit — identity document recovery
|
||
"""
|
||
|
||
def __init__(self, device: str = "auto") -> None:
|
||
self._device = device
|
||
self._model = None
|
||
self._processor = None
|
||
|
||
@classmethod
|
||
def from_env(cls) -> "DolphinOCR":
|
||
return cls(device=os.environ.get("CF_VISION_DEVICE", "auto"))
|
||
|
||
def _load(self) -> None:
|
||
if self._model is not None:
|
||
return
|
||
try:
|
||
from transformers import AutoModelForCausalLM, AutoProcessor
|
||
import torch
|
||
except ImportError as exc:
|
||
raise ImportError(
|
||
"Dolphin-v2 requires [inference] extras: "
|
||
"pip install cf-vision[inference]"
|
||
) from exc
|
||
|
||
device = self._device
|
||
if device == "auto":
|
||
device = "cuda" if _cuda_available() else "cpu"
|
||
|
||
hf_token = os.environ.get("HF_TOKEN") or None
|
||
logger.info("Loading Dolphin-v2 on %s", device)
|
||
self._processor = AutoProcessor.from_pretrained(
|
||
_DOLPHIN_MODEL_ID, token=hf_token
|
||
)
|
||
self._model = AutoModelForCausalLM.from_pretrained(
|
||
_DOLPHIN_MODEL_ID,
|
||
token=hf_token,
|
||
torch_dtype="auto",
|
||
device_map=device,
|
||
)
|
||
|
||
def parse(self, image_bytes: bytes) -> ImageFrame:
|
||
"""
|
||
Parse document image bytes into a structured ImageFrame.
|
||
|
||
Stub: raises NotImplementedError. Real implementation coming in Kiwi Phase 2.
|
||
"""
|
||
self._load()
|
||
raise NotImplementedError(
|
||
"DolphinOCR.parse() is not yet implemented. "
|
||
"Tracking: Kiwi Phase 2 / cf-vision#TBD"
|
||
)
|
||
|
||
async def parse_async(self, image_bytes: bytes) -> ImageFrame:
|
||
"""parse() without blocking the event loop."""
|
||
loop = asyncio.get_event_loop()
|
||
return await loop.run_in_executor(None, self.parse, image_bytes)


def _cuda_available() -> bool:
    """Best-effort CUDA probe; False when torch is not installed."""
    try:
        import torch
    except ImportError:
        return False
    return torch.cuda.is_available()