- cf_vision/models.py: ImageFrame + ImageElement + BoundingBox (MIT) Full Dolphin-v2 element taxonomy (21 types), convenience accessors (text_blocks, barcodes, tables, full_text) - cf_vision/router.py: VisionRouter — mock + real paths, task routing (document, barcode, receipt, general) - cf_vision/barcode.py: BarcodeScanner — pyzbar wrapper, CPU-only, MIT - cf_vision/ocr.py: DolphinOCR — ByteDance/Dolphin-v2 async stub (BSL 1.1) - cf_vision/receipt.py: ReceiptParser stub — Kiwi Phase 2 target (BSL 1.1) - cf_vision/camera.py: CameraCapture — OpenCV single-frame capture (MIT) - pyproject.toml: inference / barcode / camera optional extras - .env.example: HF_TOKEN, CF_VISION_DEVICE, CF_VISION_MOCK - README: module map, ImageFrame API reference, consumer roadmap - tests: 6 passing (ImageFrame accessors, VisionRouter mock/real) Extracted from circuitforge_core.vision per cf-core#36.
109 lines · 3.5 KiB · Python
# cf_vision/ocr.py — Dolphin-v2 document parser
#
# BSL 1.1: real inference. Requires [inference] extras + ~8GB VRAM.
# Stub: raises NotImplementedError until Kiwi Phase 2 wires in the model.
#
# Model: ByteDance/Dolphin-v2
# Handles 21 element types: title, plain_text, table, figure, barcode,
# handwriting, formula, signature, watermark, and more.
# Reference: https://huggingface.co/ByteDance/Dolphin-v2
from __future__ import annotations

import asyncio
import logging
import os
from functools import partial

from cf_vision.models import ImageFrame

# Module-level logger; handler/level configuration is left to the application.
logger = logging.getLogger(__name__)

# HuggingFace repo ID for the Dolphin checkpoint.
# NOTE(review): surrounding docs say "Dolphin-v2" but the repo ID has no
# "-v2" suffix — confirm which checkpoint is intended.
_DOLPHIN_MODEL_ID = "ByteDance/Dolphin"


class DolphinOCR:
|
||
"""
|
||
Async wrapper around Dolphin-v2 for structured document parsing.
|
||
|
||
Loads the model lazily on first call. Runs in a thread pool executor
|
||
so it never blocks the asyncio event loop (~200ms–2s per page on A100).
|
||
|
||
Usage
|
||
-----
|
||
ocr = DolphinOCR.from_env()
|
||
frame = await ocr.parse_async(image_bytes)
|
||
for element in frame.elements:
|
||
print(element.element_type, element.text[:80])
|
||
|
||
Navigation note: Dolphin-v2 returns elements in reading order
|
||
(top-to-bottom, left-to-right). Use ImageFrame.full_text() for a
|
||
plain concatenation or iterate elements for structured access.
|
||
|
||
Consumer roadmap:
|
||
Kiwi Phase 2 — receipt line-item extraction
|
||
Peregrine — resume document parsing
|
||
Falcon — government form scanning
|
||
Godwit — identity document recovery
|
||
"""
|
||
|
||
def __init__(self, device: str = "auto") -> None:
|
||
self._device = device
|
||
self._model = None
|
||
self._processor = None
|
||
|
||
@classmethod
|
||
def from_env(cls) -> "DolphinOCR":
|
||
return cls(device=os.environ.get("CF_VISION_DEVICE", "auto"))
|
||
|
||
def _load(self) -> None:
|
||
if self._model is not None:
|
||
return
|
||
try:
|
||
from transformers import AutoModelForCausalLM, AutoProcessor
|
||
import torch
|
||
except ImportError as exc:
|
||
raise ImportError(
|
||
"Dolphin-v2 requires [inference] extras: "
|
||
"pip install cf-vision[inference]"
|
||
) from exc
|
||
|
||
device = self._device
|
||
if device == "auto":
|
||
device = "cuda" if _cuda_available() else "cpu"
|
||
|
||
hf_token = os.environ.get("HF_TOKEN") or None
|
||
logger.info("Loading Dolphin-v2 on %s", device)
|
||
self._processor = AutoProcessor.from_pretrained(
|
||
_DOLPHIN_MODEL_ID, token=hf_token
|
||
)
|
||
self._model = AutoModelForCausalLM.from_pretrained(
|
||
_DOLPHIN_MODEL_ID,
|
||
token=hf_token,
|
||
torch_dtype="auto",
|
||
device_map=device,
|
||
)
|
||
|
||
def parse(self, image_bytes: bytes) -> ImageFrame:
|
||
"""
|
||
Parse document image bytes into a structured ImageFrame.
|
||
|
||
Stub: raises NotImplementedError. Real implementation coming in Kiwi Phase 2.
|
||
"""
|
||
self._load()
|
||
raise NotImplementedError(
|
||
"DolphinOCR.parse() is not yet implemented. "
|
||
"Tracking: Kiwi Phase 2 / cf-vision#TBD"
|
||
)
|
||
|
||
async def parse_async(self, image_bytes: bytes) -> ImageFrame:
|
||
"""parse() without blocking the event loop."""
|
||
loop = asyncio.get_event_loop()
|
||
return await loop.run_in_executor(None, self.parse, image_bytes)


def _cuda_available() -> bool:
    """Best-effort CUDA probe; False when torch is not installed."""
    try:
        import torch
    except ImportError:
        return False
    return torch.cuda.is_available()