# cf_vision/ocr.py — Dolphin-v2 document parser # # BSL 1.1: real inference. Requires [inference] extras + ~8GB VRAM. # Stub: raises NotImplementedError until Kiwi Phase 2 wires in the model. # # Model: ByteDance/Dolphin-v2 # Handles 21 element types: title, plain_text, table, figure, barcode, # handwriting, formula, signature, watermark, and more. # Reference: https://huggingface.co/ByteDance/Dolphin-v2 from __future__ import annotations import asyncio import logging import os from functools import partial from cf_vision.models import ImageFrame logger = logging.getLogger(__name__) _DOLPHIN_MODEL_ID = "ByteDance/Dolphin" # HuggingFace model ID class DolphinOCR: """ Async wrapper around Dolphin-v2 for structured document parsing. Loads the model lazily on first call. Runs in a thread pool executor so it never blocks the asyncio event loop (~200ms–2s per page on A100). Usage ----- ocr = DolphinOCR.from_env() frame = await ocr.parse_async(image_bytes) for element in frame.elements: print(element.element_type, element.text[:80]) Navigation note: Dolphin-v2 returns elements in reading order (top-to-bottom, left-to-right). Use ImageFrame.full_text() for a plain concatenation or iterate elements for structured access. Consumer roadmap: Kiwi Phase 2 — receipt line-item extraction Peregrine — resume document parsing Falcon — government form scanning Godwit — identity document recovery """ def __init__(self, device: str = "auto") -> None: self._device = device self._model = None self._processor = None @classmethod def from_env(cls) -> "DolphinOCR": return cls(device=os.environ.get("CF_VISION_DEVICE", "auto")) def _load(self) -> None: if self._model is not None: return try: from transformers import AutoModelForCausalLM, AutoProcessor import torch except ImportError as exc: raise ImportError( "Dolphin-v2 requires [inference] extras: " "pip install cf-vision[inference]" ) from exc device = self._device if device == "auto": device = "cuda" if _cuda_available() else "cpu" hf_token = os.environ.get("HF_TOKEN") or None logger.info("Loading Dolphin-v2 on %s", device) self._processor = AutoProcessor.from_pretrained( _DOLPHIN_MODEL_ID, token=hf_token ) self._model = AutoModelForCausalLM.from_pretrained( _DOLPHIN_MODEL_ID, token=hf_token, torch_dtype="auto", device_map=device, ) def parse(self, image_bytes: bytes) -> ImageFrame: """ Parse document image bytes into a structured ImageFrame. Stub: raises NotImplementedError. Real implementation coming in Kiwi Phase 2. """ self._load() raise NotImplementedError( "DolphinOCR.parse() is not yet implemented. " "Tracking: Kiwi Phase 2 / cf-vision#TBD" ) async def parse_async(self, image_bytes: bytes) -> ImageFrame: """parse() without blocking the event loop.""" loop = asyncio.get_event_loop() return await loop.run_in_executor(None, self.parse, image_bytes) def _cuda_available() -> bool: try: import torch return torch.cuda.is_available() except ImportError: return False