cf-vision/cf_vision/ocr.py
pyr0ball 353525c1f4 feat: initial cf-vision scaffold — ImageFrame API, stub inference modules
- cf_vision/models.py: ImageFrame + ImageElement + BoundingBox (MIT)
  Full Dolphin-v2 element taxonomy (21 types), convenience accessors
  (text_blocks, barcodes, tables, full_text)
- cf_vision/router.py: VisionRouter — mock + real paths, task routing
  (document, barcode, receipt, general)
- cf_vision/barcode.py: BarcodeScanner — pyzbar wrapper, CPU-only, MIT
- cf_vision/ocr.py: DolphinOCR — ByteDance/Dolphin-v2 async stub (BSL 1.1)
- cf_vision/receipt.py: ReceiptParser stub — Kiwi Phase 2 target (BSL 1.1)
- cf_vision/camera.py: CameraCapture — OpenCV single-frame capture (MIT)
- pyproject.toml: inference / barcode / camera optional extras
- .env.example: HF_TOKEN, CF_VISION_DEVICE, CF_VISION_MOCK
- README: module map, ImageFrame API reference, consumer roadmap
- tests: 6 passing (ImageFrame accessors, VisionRouter mock/real)

Extracted from circuitforge_core.vision per cf-core#36.
2026-04-06 17:59:00 -07:00

109 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# cf_vision/ocr.py — Dolphin-v2 document parser
#
# BSL 1.1: real inference. Requires [inference] extras + ~8GB VRAM.
# Stub: raises NotImplementedError until Kiwi Phase 2 wires in the model.
#
# Model: ByteDance/Dolphin-v2
# Handles 21 element types: title, plain_text, table, figure, barcode,
# handwriting, formula, signature, watermark, and more.
# Reference: https://huggingface.co/ByteDance/Dolphin-v2
from __future__ import annotations
import asyncio
import logging
import os
from functools import partial
from cf_vision.models import ImageFrame
logger = logging.getLogger(__name__)
# NOTE(review): surrounding comments and docstrings all say "Dolphin-v2", but
# the Hub repo ID below has no "-v2" suffix — confirm "ByteDance/Dolphin" is
# the intended v2 checkpoint before shipping real inference.
_DOLPHIN_MODEL_ID = "ByteDance/Dolphin"  # HuggingFace model ID
class DolphinOCR:
"""
Async wrapper around Dolphin-v2 for structured document parsing.
Loads the model lazily on first call. Runs in a thread pool executor
so it never blocks the asyncio event loop (~200ms2s per page on A100).
Usage
-----
ocr = DolphinOCR.from_env()
frame = await ocr.parse_async(image_bytes)
for element in frame.elements:
print(element.element_type, element.text[:80])
Navigation note: Dolphin-v2 returns elements in reading order
(top-to-bottom, left-to-right). Use ImageFrame.full_text() for a
plain concatenation or iterate elements for structured access.
Consumer roadmap:
Kiwi Phase 2 — receipt line-item extraction
Peregrine — resume document parsing
Falcon — government form scanning
Godwit — identity document recovery
"""
def __init__(self, device: str = "auto") -> None:
self._device = device
self._model = None
self._processor = None
@classmethod
def from_env(cls) -> "DolphinOCR":
return cls(device=os.environ.get("CF_VISION_DEVICE", "auto"))
def _load(self) -> None:
if self._model is not None:
return
try:
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
except ImportError as exc:
raise ImportError(
"Dolphin-v2 requires [inference] extras: "
"pip install cf-vision[inference]"
) from exc
device = self._device
if device == "auto":
device = "cuda" if _cuda_available() else "cpu"
hf_token = os.environ.get("HF_TOKEN") or None
logger.info("Loading Dolphin-v2 on %s", device)
self._processor = AutoProcessor.from_pretrained(
_DOLPHIN_MODEL_ID, token=hf_token
)
self._model = AutoModelForCausalLM.from_pretrained(
_DOLPHIN_MODEL_ID,
token=hf_token,
torch_dtype="auto",
device_map=device,
)
def parse(self, image_bytes: bytes) -> ImageFrame:
"""
Parse document image bytes into a structured ImageFrame.
Stub: raises NotImplementedError. Real implementation coming in Kiwi Phase 2.
"""
self._load()
raise NotImplementedError(
"DolphinOCR.parse() is not yet implemented. "
"Tracking: Kiwi Phase 2 / cf-vision#TBD"
)
async def parse_async(self, image_bytes: bytes) -> ImageFrame:
"""parse() without blocking the event loop."""
loop = asyncio.get_event_loop()
return await loop.run_in_executor(None, self.parse, image_bytes)
def _cuda_available() -> bool:
try:
import torch
return torch.cuda.is_available()
except ImportError:
return False