feat: initial cf-vision scaffold — ImageFrame API, stub inference modules
- cf_vision/models.py: ImageFrame + ImageElement + BoundingBox (MIT) Full Dolphin-v2 element taxonomy (21 types), convenience accessors (text_blocks, barcodes, tables, full_text) - cf_vision/router.py: VisionRouter — mock + real paths, task routing (document, barcode, receipt, general) - cf_vision/barcode.py: BarcodeScanner — pyzbar wrapper, CPU-only, MIT - cf_vision/ocr.py: DolphinOCR — ByteDance/Dolphin-v2 async stub (BSL 1.1) - cf_vision/receipt.py: ReceiptParser stub — Kiwi Phase 2 target (BSL 1.1) - cf_vision/camera.py: CameraCapture — OpenCV single-frame capture (MIT) - pyproject.toml: inference / barcode / camera optional extras - .env.example: HF_TOKEN, CF_VISION_DEVICE, CF_VISION_MOCK - README: module map, ImageFrame API reference, consumer roadmap - tests: 6 passing (ImageFrame accessors, VisionRouter mock/real) Extracted from circuitforge_core.vision per cf-core#36.
This commit is contained in:
commit
353525c1f4
13 changed files with 764 additions and 0 deletions
20
.env.example
Normal file
20
.env.example
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
# cf-vision environment — copy to .env and fill in values
|
||||||
|
# cf-vision does not auto-load .env; consumers load it in their own startup.
|
||||||
|
|
||||||
|
# ── Dolphin-v2 document parser ────────────────────────────────────────────────
|
||||||
|
# HuggingFace model: ByteDance/Dolphin-v2
|
||||||
|
# Requires ~8GB VRAM. Downloaded and cached automatically on first use.
|
||||||
|
# Get a token at https://huggingface.co/settings/tokens
|
||||||
|
HF_TOKEN=
|
||||||
|
|
||||||
|
# ── Compute ───────────────────────────────────────────────────────────────────
|
||||||
|
# auto (detect GPU), cuda, cpu
|
||||||
|
CF_VISION_DEVICE=auto
|
||||||
|
|
||||||
|
# ── Mock mode ─────────────────────────────────────────────────────────────────
|
||||||
|
# Set to 1 to use synthetic ImageFrame responses — no GPU or camera required.
|
||||||
|
CF_VISION_MOCK=
|
||||||
|
|
||||||
|
# ── OCR confidence threshold ──────────────────────────────────────────────────
|
||||||
|
# Results below this are marked low-confidence in the ImageFrame output.
|
||||||
|
CF_VISION_CONFIDENCE_THRESHOLD=0.7
|
||||||
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
.env
|
||||||
|
__pycache__/
|
||||||
|
*.egg-info/
|
||||||
|
.pytest_cache/
|
||||||
100
README.md
Normal file
100
README.md
Normal file
|
|
@ -0,0 +1,100 @@
|
||||||
|
# cf-vision
|
||||||
|
|
||||||
|
CircuitForge vision pipeline. Produces `ImageFrame` objects from image sources — documents, barcodes, receipts, camera captures — using Dolphin-v2 (local, Free tier) or Claude vision (cloud, Paid tier).
|
||||||
|
|
||||||
|
**Status:** Stub. `VisionRouter` and `BarcodeScanner` API surface locked; real Dolphin-v2 inference lands with Kiwi Phase 2.
|
||||||
|
|
||||||
|
## Install
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Stub / mock mode (no GPU required)
|
||||||
|
pip install -e ../cf-vision
|
||||||
|
|
||||||
|
# Real document OCR (Dolphin-v2, ~8GB VRAM)
|
||||||
|
pip install -e "../cf-vision[inference]"
|
||||||
|
|
||||||
|
# Barcode / QR scanning (CPU, no GPU)
|
||||||
|
pip install -e "../cf-vision[barcode]"
|
||||||
|
|
||||||
|
# Camera capture
|
||||||
|
pip install -e "../cf-vision[camera]"
|
||||||
|
```
|
||||||
|
|
||||||
|
Copy `.env.example` to `.env` and fill in `HF_TOKEN` for Dolphin-v2.
|
||||||
|
|
||||||
|
## Quick start
|
||||||
|
|
||||||
|
```python
|
||||||
|
from cf_vision.router import VisionRouter
|
||||||
|
|
||||||
|
# Mock mode (no hardware needed)
|
||||||
|
router = VisionRouter(mock=True)
|
||||||
|
frame = router.analyze(image_bytes, task="document")
|
||||||
|
for element in frame.elements:
|
||||||
|
print(element.element_type, element.text)
|
||||||
|
|
||||||
|
# Real mode (requires [inference] extras)
|
||||||
|
router = VisionRouter.from_env() # reads CF_VISION_MOCK, CF_VISION_DEVICE
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## ImageFrame
|
||||||
|
|
||||||
|
The primary output type. MIT licensed.
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class ImageFrame:
|
||||||
|
source: Literal["camera", "upload", "url", "mock"]
|
||||||
|
image_bytes: bytes | None
|
||||||
|
elements: list[ImageElement]
|
||||||
|
width_px: int
|
||||||
|
height_px: int
|
||||||
|
model: str # "dolphin-v2", "pyzbar", "claude", "mock"
|
||||||
|
|
||||||
|
def text_blocks() -> list[ImageElement]
|
||||||
|
def barcodes() -> list[ImageElement]
|
||||||
|
def tables() -> list[ImageElement]
|
||||||
|
def full_text(separator="\n") -> str
|
||||||
|
```
|
||||||
|
|
||||||
|
### ImageElement
|
||||||
|
|
||||||
|
```python
|
||||||
|
@dataclass
|
||||||
|
class ImageElement:
|
||||||
|
element_type: ElementType # one of 21 Dolphin-v2 types
|
||||||
|
text: str
|
||||||
|
confidence: float # 0.0–1.0
|
||||||
|
bbox: BoundingBox | None
|
||||||
|
metadata: dict # e.g. {"format": "EAN13"} for barcodes
|
||||||
|
```
|
||||||
|
|
||||||
|
### ElementType (21 types from Dolphin-v2)
|
||||||
|
|
||||||
|
`title` · `plain_text` · `text_block` · `header` · `footer` · `table` · `table_caption` · `table_footnote` · `figure` · `figure_caption` · `isolate_formula` · `formula_caption` · `inline_formula` · `page_number` · `seal` · `handwriting` · `barcode` · `qr_code` · `signature` · `watermark` · `abandon`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Module structure
|
||||||
|
|
||||||
|
| Module | License | Purpose |
|
||||||
|
|--------|---------|---------|
|
||||||
|
| `cf_vision.models` | MIT | `ImageFrame`, `ImageElement`, `BoundingBox` |
|
||||||
|
| `cf_vision.router` | BSL 1.1* | `VisionRouter` — routes to local or cloud model |
|
||||||
|
| `cf_vision.barcode` | MIT | `BarcodeScanner` — pyzbar wrapper, no GPU |
|
||||||
|
| `cf_vision.ocr` | BSL 1.1 | `DolphinOCR` — Dolphin-v2 async wrapper |
|
||||||
|
| `cf_vision.receipt` | BSL 1.1 | `ReceiptParser` — line-item extraction (stub) |
|
||||||
|
| `cf_vision.camera` | MIT | `CameraCapture` — OpenCV frame capture |
|
||||||
|
|
||||||
|
*BSL applies to inference modules. Models + barcode + camera = MIT.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Consumed by
|
||||||
|
|
||||||
|
- `Circuit-Forge/kiwi` — barcode scan + receipt OCR (Phase 2, primary consumer)
|
||||||
|
- `Circuit-Forge/peregrine` — resume document parsing
|
||||||
|
- `Circuit-Forge/falcon` (planned) — government form scanning
|
||||||
|
- `Circuit-Forge/godwit` (planned) — emergency identity document capture
|
||||||
7
cf_vision/__init__.py
Normal file
7
cf_vision/__init__.py
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
"""
|
||||||
|
cf-vision — CircuitForge vision pipeline.
|
||||||
|
|
||||||
|
Primary API surface:
|
||||||
|
from cf_vision.models import ImageFrame
|
||||||
|
from cf_vision.router import VisionRouter
|
||||||
|
"""
|
||||||
85
cf_vision/barcode.py
Normal file
85
cf_vision/barcode.py
Normal file
|
|
@ -0,0 +1,85 @@
|
||||||
|
# cf_vision/barcode.py — barcode and QR code scanning
|
||||||
|
#
|
||||||
|
# MIT licensed. Uses pyzbar (libzbar wrapper) — no GPU required.
|
||||||
|
# Requires [barcode] extras: pip install cf-vision[barcode]
|
||||||
|
#
|
||||||
|
# Primary consumer: Kiwi (pantry item lookup by UPC/EAN barcode scan)
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from cf_vision.models import BoundingBox, ImageElement, ImageFrame
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Barcode symbologies the stack expects to see from pyzbar.
# NOTE(review): this alias is not referenced at runtime anywhere in this
# module, and pyzbar reports QR codes as "QRCODE" (see BarcodeScanner.scan),
# not "QR_CODE" — confirm the intended spelling before using it for checks.
BarcodeFormat = Literal[
    "EAN13", "EAN8", "UPCA", "UPCE", "CODE128", "CODE39",
    "QR_CODE", "PDF417", "DATAMATRIX", "ITF", "CODABAR",
]
|
||||||
|
|
||||||
|
|
||||||
|
class BarcodeScanner:
    """
    CPU-only barcode and QR code scanner built on pyzbar.

    Decodes every symbol pyzbar finds in an image (~5ms per image, no GPU)
    and wraps the results in the standard ImageFrame contract.

    Usage
    -----
        scanner = BarcodeScanner()
        frame = scanner.scan(image_bytes)
        for b in frame.barcodes():
            print(b.text, b.metadata["format"])

    Requires: pip install cf-vision[barcode]
    """

    def scan(self, image_bytes: bytes) -> ImageFrame:
        """
        Scan image_bytes for barcodes and QR codes.

        Returns an ImageFrame with element_type "barcode" or "qr_code" for
        each detected code. Elements include decoded text and bounding box.
        """
        try:
            import io

            from PIL import Image
            from pyzbar.pyzbar import decode as pyzbar_decode
        except ImportError as exc:
            raise ImportError(
                "pyzbar and Pillow are required for barcode scanning. "
                "Install with: pip install cf-vision[barcode]"
            ) from exc

        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        width, height = image.size

        elements = [
            self._to_element(symbol, width, height)
            for symbol in pyzbar_decode(image)
        ]

        return ImageFrame(
            source="upload",
            image_bytes=image_bytes,
            elements=elements,
            width_px=width,
            height_px=height,
            model="pyzbar",
        )

    @staticmethod
    def _to_element(symbol, width: int, height: int) -> ImageElement:
        """Convert one decoded pyzbar symbol into an ImageElement with a normalised bbox."""
        fmt = symbol.type.upper()
        rect = symbol.rect
        return ImageElement(
            element_type="qr_code" if fmt == "QRCODE" else "barcode",
            text=symbol.data.decode("utf-8", errors="replace"),
            confidence=1.0,  # pyzbar doesn't give confidence scores
            bbox=BoundingBox(
                x=rect.left / width,
                y=rect.top / height,
                width=rect.width / width,
                height=rect.height / height,
            ),
            metadata={"format": fmt},
        )
|
||||||
79
cf_vision/camera.py
Normal file
79
cf_vision/camera.py
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
# cf_vision/camera.py — camera capture and preprocessing
|
||||||
|
#
|
||||||
|
# MIT licensed. Uses OpenCV for capture; no GPU required for capture itself.
|
||||||
|
# Requires [camera] extras: pip install cf-vision[camera]
|
||||||
|
#
|
||||||
|
# Planned consumers:
|
||||||
|
# Kiwi — live barcode scan from phone camera
|
||||||
|
# Godwit — fingerprint/ID document capture for emergency identity recovery
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CameraCapture:
    """
    Single-frame camera capture with preprocessing.

    Captures one frame from a camera device, normalises it to JPEG bytes
    suitable for VisionRouter.analyze() or BarcodeScanner.scan().

    Usage
    -----
        capture = CameraCapture(device_index=0)
        jpeg_bytes = await capture.capture_async()
        frame = router.analyze(jpeg_bytes, task="barcode")

    Requires: pip install cf-vision[camera]
    """

    def __init__(
        self,
        device_index: int = 0,
        width: int = 1280,
        height: int = 720,
        jpeg_quality: int = 92,
    ) -> None:
        # Requested resolution only — the driver may deliver a different
        # size if the device does not support it.
        self._device_index = device_index
        self._width = width
        self._height = height
        self._jpeg_quality = jpeg_quality  # 0–100, passed to cv2.IMWRITE_JPEG_QUALITY

    def capture(self) -> bytes:
        """
        Capture one frame from the configured device and return JPEG bytes.

        Raises
        ------
        ImportError   if OpenCV ([camera] extras) is not installed.
        RuntimeError  if the device returns no frame or JPEG encoding fails.
        """
        # (Docstring corrected: this method is implemented — it is not a
        # NotImplementedError stub as previously claimed.)
        try:
            import cv2
        except ImportError as exc:
            raise ImportError(
                "OpenCV is required for camera capture. "
                "Install with: pip install cf-vision[camera]"
            ) from exc

        cap = cv2.VideoCapture(self._device_index)
        try:
            cap.set(cv2.CAP_PROP_FRAME_WIDTH, self._width)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self._height)
            ok, frame = cap.read()
            if not ok or frame is None:
                raise RuntimeError(
                    f"Camera device {self._device_index} did not return a frame."
                )
            ok, buf = cv2.imencode(
                ".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, self._jpeg_quality]
            )
            if not ok:
                raise RuntimeError("JPEG encoding failed")
            return bytes(buf)
        finally:
            # Always release the device, even on failure, so the camera
            # is usable by the next caller.
            cap.release()

    async def capture_async(self) -> bytes:
        """capture() without blocking the event loop."""
        # get_running_loop() rather than the deprecated get_event_loop():
        # this coroutine only ever executes inside a running loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.capture)
|
||||||
90
cf_vision/models.py
Normal file
90
cf_vision/models.py
Normal file
|
|
@ -0,0 +1,90 @@
|
||||||
|
# cf_vision/models.py — ImageFrame API contract
|
||||||
|
#
|
||||||
|
# MIT licensed. All consumers (Kiwi, Peregrine, Falcon, Godwit) import
|
||||||
|
# ImageFrame from here so the shape is consistent across the stack.
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
# Closed set of element types cf-vision can emit on ImageElement.element_type.
# Dolphin-v2 element taxonomy (21 types); "barcode" and "qr_code" are also
# produced by cf_vision.barcode's pyzbar scanner.
ElementType = Literal[
    "title", "plain_text", "abandon", "figure", "figure_caption",
    "table", "table_caption", "table_footnote", "isolate_formula",
    "formula_caption", "text_block", "inline_formula", "header",
    "footer", "page_number", "seal", "handwriting", "barcode",
    "qr_code", "signature", "watermark",
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class BoundingBox:
    """
    Location of a detected element within the source image.

    Coordinates are normalised fractions (0.0–1.0) of the image width and
    height by default; when ``absolute`` is True they are raw pixel values.
    (The barcode scanner, for example, always emits normalised boxes.)
    """
    x: float  # left edge, 0.0–1.0 (normalised) or pixels if absolute=True
    y: float  # top edge, same units as x
    width: float   # horizontal extent from x, same units as x
    height: float  # vertical extent from y, same units as y
    absolute: bool = False  # True when coordinates are in pixels
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ImageElement:
    """
    A single structured element extracted from an image.

    Produced by cf_vision.ocr (Dolphin-v2) or cf_vision.barcode.
    Consumers iterate over ImageFrame.elements to reconstruct document structure.
    """
    element_type: ElementType  # one of the 21 Dolphin-v2 taxonomy values
    text: str  # extracted text, or empty for non-text types
    confidence: float  # 0.0–1.0; the pyzbar backend always reports 1.0
    bbox: BoundingBox | None = None  # None when position is unknown
    metadata: dict = field(default_factory=dict)  # e.g. {"format": "EAN13"} for barcodes
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ImageFrame:
    """
    A fully analysed image — the primary output type of cf-vision.

    Produced by VisionRouter.analyze() and consumed by products that need
    structured content from image sources (receipts, barcodes, documents,
    camera captures).

    Fields
    ------
    source       How the image arrived: "camera" | "upload" | "url" | "mock"
    image_bytes  Original image bytes (JPEG/PNG). None when source="mock".
    elements     Ordered list of extracted elements (top-to-bottom, left-to-right).
    width_px     Source image dimensions, or 0 if unknown.
    height_px
    model        Model that produced the elements, e.g. "dolphin-v2", "mock".
    """
    source: Literal["camera", "upload", "url", "mock"]
    image_bytes: bytes | None
    elements: list[ImageElement] = field(default_factory=list)
    width_px: int = 0
    height_px: int = 0
    model: str = "stub"

    # Element types whose ``text`` attribute carries document prose.
    # Not annotated, so the dataclass machinery does not treat it as a field.
    _TEXT_TYPES = frozenset({
        "title", "plain_text", "text_block", "header", "footer",
        "table_caption", "figure_caption", "handwriting",
    })

    # ── Convenience accessors ─────────────────────────────────────────────────

    def text_blocks(self) -> list[ImageElement]:
        """All elements that carry text, in document order."""
        return [el for el in self.elements if el.element_type in self._TEXT_TYPES]

    def barcodes(self) -> list[ImageElement]:
        """All barcode and QR code elements."""
        wanted = {"barcode", "qr_code"}
        return [el for el in self.elements if el.element_type in wanted]

    def tables(self) -> list[ImageElement]:
        """All table elements."""
        return [el for el in self.elements if el.element_type == "table"]

    def full_text(self, separator: str = "\n") -> str:
        """Concatenated text from all text-bearing elements."""
        pieces = [el.text for el in self.text_blocks() if el.text]
        return separator.join(pieces)
|
||||||
109
cf_vision/ocr.py
Normal file
109
cf_vision/ocr.py
Normal file
|
|
@ -0,0 +1,109 @@
|
||||||
|
# cf_vision/ocr.py — Dolphin-v2 document parser
|
||||||
|
#
|
||||||
|
# BSL 1.1: real inference. Requires [inference] extras + ~8GB VRAM.
|
||||||
|
# Stub: raises NotImplementedError until Kiwi Phase 2 wires in the model.
|
||||||
|
#
|
||||||
|
# Model: ByteDance/Dolphin-v2
|
||||||
|
# Handles 21 element types: title, plain_text, table, figure, barcode,
|
||||||
|
# handwriting, formula, signature, watermark, and more.
|
||||||
|
# Reference: https://huggingface.co/ByteDance/Dolphin-v2
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
|
from cf_vision.models import ImageFrame
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# HuggingFace model ID passed to AutoProcessor/AutoModelForCausalLM in _load().
# NOTE(review): the module header, README, and .env.example all say
# "ByteDance/Dolphin-v2" — confirm whether this ID should be
# "ByteDance/Dolphin-v2" or "ByteDance/Dolphin".
_DOLPHIN_MODEL_ID = "ByteDance/Dolphin"  # HuggingFace model ID
|
||||||
|
|
||||||
|
|
||||||
|
class DolphinOCR:
    """
    Async wrapper around Dolphin-v2 for structured document parsing.

    Loads the model lazily on first call. Runs in a thread pool executor
    so it never blocks the asyncio event loop (~200ms–2s per page on A100).

    Usage
    -----
        ocr = DolphinOCR.from_env()
        frame = await ocr.parse_async(image_bytes)
        for element in frame.elements:
            print(element.element_type, element.text[:80])

    Navigation note: Dolphin-v2 returns elements in reading order
    (top-to-bottom, left-to-right). Use ImageFrame.full_text() for a
    plain concatenation or iterate elements for structured access.

    Consumer roadmap:
        Kiwi Phase 2 — receipt line-item extraction
        Peregrine — resume document parsing
        Falcon — government form scanning
        Godwit — identity document recovery
    """

    def __init__(self, device: str = "auto") -> None:
        # device: "auto" (resolved to cuda/cpu at load time), "cuda", or "cpu"
        self._device = device
        self._model = None       # populated lazily by _load()
        self._processor = None   # populated lazily by _load()

    @classmethod
    def from_env(cls) -> "DolphinOCR":
        """Alternate constructor reading CF_VISION_DEVICE (default "auto")."""
        return cls(device=os.environ.get("CF_VISION_DEVICE", "auto"))

    def _load(self) -> None:
        """Load the processor and model once; subsequent calls are no-ops."""
        if self._model is not None:
            return
        try:
            from transformers import AutoModelForCausalLM, AutoProcessor
            import torch  # noqa: F401 — verifies the [inference] extra is present
        except ImportError as exc:
            raise ImportError(
                "Dolphin-v2 requires [inference] extras: "
                "pip install cf-vision[inference]"
            ) from exc

        device = self._device
        if device == "auto":
            device = "cuda" if _cuda_available() else "cpu"

        # An empty HF_TOKEN collapses to None so transformers falls back
        # to anonymous access instead of sending an empty token.
        hf_token = os.environ.get("HF_TOKEN") or None
        logger.info("Loading Dolphin-v2 on %s", device)
        self._processor = AutoProcessor.from_pretrained(
            _DOLPHIN_MODEL_ID, token=hf_token
        )
        self._model = AutoModelForCausalLM.from_pretrained(
            _DOLPHIN_MODEL_ID,
            token=hf_token,
            torch_dtype="auto",
            device_map=device,
        )

    def parse(self, image_bytes: bytes) -> ImageFrame:
        """
        Parse document image bytes into a structured ImageFrame.

        Stub: raises NotImplementedError. Real implementation coming in Kiwi Phase 2.
        Raises ImportError first if the [inference] extras are missing.
        """
        self._load()
        raise NotImplementedError(
            "DolphinOCR.parse() is not yet implemented. "
            "Tracking: Kiwi Phase 2 / cf-vision#TBD"
        )

    async def parse_async(self, image_bytes: bytes) -> ImageFrame:
        """parse() without blocking the event loop."""
        # get_running_loop() rather than the deprecated get_event_loop():
        # this coroutine only ever executes inside a running loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.parse, image_bytes)
|
||||||
|
|
||||||
|
|
||||||
|
def _cuda_available() -> bool:
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
return torch.cuda.is_available()
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
44
cf_vision/receipt.py
Normal file
44
cf_vision/receipt.py
Normal file
|
|
@ -0,0 +1,44 @@
|
||||||
|
# cf_vision/receipt.py — receipt line-item extraction
|
||||||
|
#
|
||||||
|
# BSL 1.1: real inference. Dolphin-v2 + post-processing.
|
||||||
|
# Stub: raises NotImplementedError until Kiwi Phase 2.
|
||||||
|
#
|
||||||
|
# Planned pipeline:
|
||||||
|
# DolphinOCR.parse(image_bytes) → ImageFrame with table/text elements
|
||||||
|
# ReceiptParser.extract(frame) → list[LineItem]
|
||||||
|
# ProductResolver.resolve(items) → matched pantry items (Kiwi-specific)
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class LineItem:
    """A single line item extracted from a receipt."""
    name: str  # product name as printed on the receipt
    quantity: float = 1.0  # defaults to a single unit when not printed
    unit: str = ""  # "g", "ml", "oz", "each", etc.
    price: float | None = None  # line price; None when not extracted
    barcode: str | None = None  # UPC/EAN, if a barcode lookup matched (see pipeline step 4)
    confidence: float = 0.0  # extraction confidence, 0.0–1.0
|
||||||
|
|
||||||
|
|
||||||
|
class ReceiptParser:
    """
    Turn a receipt ImageFrame into structured LineItem records.

    Stub: raises NotImplementedError until Kiwi Phase 2.
    Consumer: Kiwi Phase 2 pantry auto-population from receipt photos.

    Real pipeline:
        1. DolphinOCR produces an ImageFrame with table rows and text blocks
        2. ReceiptParser identifies the items section (skip header/footer/totals)
        3. Per-row NLP extracts name, quantity, unit, price
        4. Optional: barcode lookup if any barcode elements present
    """

    def extract(self, frame: "ImageFrame") -> list[LineItem]:  # type: ignore[name-defined]
        message = (
            "ReceiptParser.extract() is not yet implemented. "
            "Tracking: Kiwi Phase 2 / cf-vision#TBD"
        )
        raise NotImplementedError(message)
|
||||||
107
cf_vision/router.py
Normal file
107
cf_vision/router.py
Normal file
|
|
@ -0,0 +1,107 @@
|
||||||
|
# cf_vision/router.py — VisionRouter, the primary consumer API
|
||||||
|
#
|
||||||
|
# BSL 1.1 when real inference models are integrated (Dolphin-v2, Claude vision).
|
||||||
|
# Currently a stub: analyze() raises NotImplementedError unless mock=True.
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
from cf_vision.models import ImageFrame, ImageElement, BoundingBox
|
||||||
|
|
||||||
|
# Template elements returned for non-barcode tasks in mock mode.
# _mock_frame() hands out a shallow copy (list(_MOCK_ELEMENTS)), so the
# ImageElement instances themselves are shared across mock frames.
_MOCK_ELEMENTS = [
    ImageElement(
        element_type="title",
        text="[Mock document title]",
        confidence=0.99,
        bbox=BoundingBox(x=0.05, y=0.02, width=0.9, height=0.06),
    ),
    ImageElement(
        element_type="plain_text",
        text="[Mock paragraph — real content requires cf-vision[inference] and a vision model.]",
        confidence=0.95,
        bbox=BoundingBox(x=0.05, y=0.12, width=0.9, height=0.08),
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
class VisionRouter:
    """
    Routes image analysis requests to local or cloud vision models.

    Local models (Free tier):
      - Dolphin-v2 (ByteDance) — universal document parser, 21 element types
      - pyzbar — barcode / QR code scanning (no GPU required)

    Cloud fallback (Paid tier):
      - Claude vision API — general-purpose image understanding

    Usage
    -----
        router = VisionRouter.from_env()
        frame = router.analyze(image_bytes, task="document")
        for element in frame.elements:
            print(element.element_type, element.text)
    """

    def __init__(
        self,
        mock: bool = False,
        device: str = "auto",
    ) -> None:
        # mock=True short-circuits analyze() to synthetic frames — no models,
        # no GPU, no camera required.
        self._mock = mock
        # "auto" | "cuda" | "cpu"; consumed once real inference is wired in.
        self._device = device

    @classmethod
    def from_env(cls) -> "VisionRouter":
        """Construct from CF_VISION_MOCK and CF_VISION_DEVICE env vars."""
        mock = os.environ.get("CF_VISION_MOCK", "") == "1"
        device = os.environ.get("CF_VISION_DEVICE", "auto")
        return cls(mock=mock, device=device)

    def analyze(
        self,
        image_bytes: bytes,
        task: Literal["document", "barcode", "receipt", "general"] = "document",
        prompt: str = "",
    ) -> ImageFrame:
        """
        Analyse image_bytes and return a structured ImageFrame.

        task:
          "document" — full document parsing via Dolphin-v2 (all 21 element types)
          "barcode"  — barcode / QR code extraction via pyzbar (lightweight)
          "receipt"  — receipt line-item extraction (Dolphin-v2 + post-processing)
          "general"  — general image understanding via Claude vision (cloud, Paid tier)

        Stub: raises NotImplementedError unless CF_VISION_MOCK=1 or mock=True.
        Real implementation lands with Kiwi Phase 2 (cf_vision.ocr, cf_vision.barcode).
        """
        if self._mock:
            return self._mock_frame(image_bytes, task)

        raise NotImplementedError(
            "VisionRouter real inference is not yet implemented. "
            "Set CF_VISION_MOCK=1 or mock=True to use synthetic frames. "
            "Real analysis requires: pip install cf-vision[inference]"
        )

    def _mock_frame(self, image_bytes: bytes, task: str) -> ImageFrame:
        """Build a synthetic ImageFrame for mock mode."""
        # ImageElement (and BoundingBox) are already imported at module level;
        # the previous local re-import was redundant and has been removed.
        if task == "barcode":
            elements = [
                ImageElement(
                    element_type="barcode",
                    text="0123456789012",
                    confidence=0.99,
                    metadata={"format": "EAN13"},
                )
            ]
        else:
            # Shallow copy so callers can extend their frame's element list
            # without mutating the shared template.
            elements = list(_MOCK_ELEMENTS)
        return ImageFrame(
            source="mock",
            image_bytes=None,
            elements=elements,
            model="mock",
        )
|
||||||
48
pyproject.toml
Normal file
48
pyproject.toml
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
[build-system]
|
||||||
|
requires = ["setuptools>=68"]
|
||||||
|
build-backend = "setuptools.build_meta"
|
||||||
|
|
||||||
|
[project]
|
||||||
|
name = "cf-vision"
|
||||||
|
version = "0.1.0"
|
||||||
|
description = "CircuitForge vision pipeline — ImageFrame API, OCR, barcode, receipt extraction"
|
||||||
|
readme = "README.md"
|
||||||
|
requires-python = ">=3.11"
|
||||||
|
license = {text = "MIT"}
|
||||||
|
dependencies = [
|
||||||
|
"pydantic>=2.0",
|
||||||
|
]
|
||||||
|
|
||||||
|
[project.optional-dependencies]
|
||||||
|
# Real inference backends — not required for stub/mock mode
|
||||||
|
inference = [
|
||||||
|
"torch>=2.0",
|
||||||
|
"torchvision>=0.15",
|
||||||
|
"numpy>=1.24",
|
||||||
|
"Pillow>=10.0",
|
||||||
|
"transformers>=4.40",
|
||||||
|
"python-dotenv>=1.0",
|
||||||
|
]
|
||||||
|
# Barcode / QR scanning
|
||||||
|
barcode = [
|
||||||
|
"pyzbar>=0.1.9",
|
||||||
|
"Pillow>=10.0",
|
||||||
|
]
|
||||||
|
# Camera capture
|
||||||
|
camera = [
|
||||||
|
"opencv-python>=4.8",
|
||||||
|
]
|
||||||
|
dev = [
|
||||||
|
"pytest>=8.0",
|
||||||
|
"pytest-asyncio>=0.23",
|
||||||
|
"Pillow>=10.0",
|
||||||
|
"numpy>=1.24",
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
where = ["."]
|
||||||
|
include = ["cf_vision*"]
|
||||||
|
|
||||||
|
[tool.pytest.ini_options]
|
||||||
|
testpaths = ["tests"]
|
||||||
|
asyncio_mode = "auto"
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
71
tests/test_models.py
Normal file
71
tests/test_models.py
Normal file
|
|
@ -0,0 +1,71 @@
|
||||||
|
"""Tests for ImageFrame API contract."""
|
||||||
|
import pytest
|
||||||
|
from cf_vision.models import BoundingBox, ImageElement, ImageFrame
|
||||||
|
|
||||||
|
|
||||||
|
def test_imageframe_text_blocks():
    """Only text-bearing element types survive text_blocks(); order is kept."""
    elements = [
        ImageElement(element_type="title", text="My Doc", confidence=0.99),
        ImageElement(element_type="barcode", text="123456", confidence=1.0),
        ImageElement(element_type="plain_text", text="Body text.", confidence=0.9),
    ]
    frame = ImageFrame(source="mock", image_bytes=None, elements=elements, model="mock")
    blocks = frame.text_blocks()
    assert len(blocks) == 2
    assert {b.element_type for b in blocks} <= {"title", "plain_text"}
|
||||||
|
|
||||||
|
|
||||||
|
def test_imageframe_barcodes():
    """barcodes() returns the barcode element with its decoded text intact."""
    barcode_el = ImageElement(
        element_type="barcode",
        text="0123456789012",
        confidence=1.0,
        metadata={"format": "EAN13"},
    )
    frame = ImageFrame(source="mock", image_bytes=None, elements=[barcode_el], model="mock")
    found = frame.barcodes()
    assert len(found) == 1
    assert found[0].text == "0123456789012"
|
||||||
|
|
||||||
|
|
||||||
|
def test_imageframe_full_text():
    """full_text() joins text elements with the default newline separator."""
    elements = [
        ImageElement(element_type="title", text="Title", confidence=0.99),
        ImageElement(element_type="plain_text", text="Paragraph.", confidence=0.9),
    ]
    frame = ImageFrame(source="mock", image_bytes=None, elements=elements, model="mock")
    assert frame.full_text() == "Title\nParagraph."
|
||||||
|
|
||||||
|
|
||||||
|
def test_visionrouter_mock():
    """Mock router yields a synthetic, non-empty frame without any hardware."""
    from cf_vision.router import VisionRouter

    frame = VisionRouter(mock=True).analyze(b"fake_image_bytes", task="document")
    assert frame.source == "mock"
    assert frame.elements
|
||||||
|
|
||||||
|
|
||||||
|
def test_visionrouter_mock_barcode():
    """Mock barcode task returns exactly one barcode element."""
    from cf_vision.router import VisionRouter

    frame = VisionRouter(mock=True).analyze(b"fake", task="barcode")
    [code] = frame.barcodes()
    assert code.element_type == "barcode"
|
||||||
|
|
||||||
|
|
||||||
|
def test_visionrouter_real_raises():
    """The real (non-mock) path is still a stub and must raise."""
    from cf_vision.router import VisionRouter

    router = VisionRouter(mock=False)
    with pytest.raises(NotImplementedError):
        router.analyze(b"fake")
|
||||||
Loading…
Reference in a new issue