- cf_vision/models.py: ImageFrame + ImageElement + BoundingBox (MIT) Full Dolphin-v2 element taxonomy (21 types), convenience accessors (text_blocks, barcodes, tables, full_text) - cf_vision/router.py: VisionRouter — mock + real paths, task routing (document, barcode, receipt, general) - cf_vision/barcode.py: BarcodeScanner — pyzbar wrapper, CPU-only, MIT - cf_vision/ocr.py: DolphinOCR — ByteDance/Dolphin-v2 async stub (BSL 1.1) - cf_vision/receipt.py: ReceiptParser stub — Kiwi Phase 2 target (BSL 1.1) - cf_vision/camera.py: CameraCapture — OpenCV single-frame capture (MIT) - pyproject.toml: inference / barcode / camera optional extras - .env.example: HF_TOKEN, CF_VISION_DEVICE, CF_VISION_MOCK - README: module map, ImageFrame API reference, consumer roadmap - tests: 6 passing (ImageFrame accessors, VisionRouter mock/real) Extracted from circuitforge_core.vision per cf-core#36.
90 lines
3.4 KiB
Python
90 lines
3.4 KiB
Python
# cf_vision/models.py — ImageFrame API contract
#
# MIT licensed. All consumers (Kiwi, Peregrine, Falcon, Godwit) import
# ImageFrame from here so the shape is consistent across the stack.
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass, field
|
||
from typing import Literal
|
||
|
||
# Closed set of element kinds an ImageElement may carry
# (Dolphin-v2 element taxonomy, 21 types).
ElementType = Literal[
    "title",
    "plain_text",
    "abandon",
    "figure",
    "figure_caption",
    "table",
    "table_caption",
    "table_footnote",
    "isolate_formula",
    "formula_caption",
    "text_block",
    "inline_formula",
    "header",
    "footer",
    "page_number",
    "seal",
    "handwriting",
    "barcode",
    "qr_code",
    "signature",
    "watermark",
]
|
||
|
||
|
||
@dataclass
class BoundingBox:
    """
    Location and size of a detected element, relative to the source image.

    Coordinates are normalised fractions (0.0–1.0) by default; set
    ``absolute=True`` when the values are expressed in pixels instead.
    """

    x: float  # left edge, 0.0–1.0 (normalised) or pixels if absolute=True
    y: float  # top edge
    width: float  # NOTE(review): presumably same unit convention as x/y — confirm
    height: float
    absolute: bool = False  # True when coordinates are in pixels
|
||
|
||
|
||
@dataclass
class ImageElement:
    """
    A single structured element extracted from an image.

    Produced by cf_vision.ocr (Dolphin-v2) or cf_vision.barcode.
    Consumers iterate over ImageFrame.elements to reconstruct document
    structure.
    """

    element_type: ElementType
    text: str  # extracted text, or empty for non-text types
    confidence: float  # 0.0–1.0
    bbox: BoundingBox | None = None  # None when position is unknown
    # Free-form extra data; default_factory gives each instance its own dict.
    metadata: dict = field(default_factory=dict)
|
||
|
||
|
||
@dataclass
class ImageFrame:
    """
    A fully analysed image from cf-vision.

    Produced by VisionRouter.analyze() and consumed by products that need
    structured content from image sources (receipts, barcodes, documents,
    camera captures).

    Fields
    ------
    source       How the image arrived: "camera" | "upload" | "url" | "mock"
    image_bytes  Original image bytes (JPEG/PNG). None when source="mock".
    elements     Ordered list of extracted elements (top-to-bottom, left-to-right).
    width_px     Source image dimensions, or 0 if unknown.
    height_px
    model        Model that produced the elements, e.g. "dolphin-v2", "mock".
                 Defaults to "stub".
    """

    source: Literal["camera", "upload", "url", "mock"]
    image_bytes: bytes | None
    elements: list[ImageElement] = field(default_factory=list)
    width_px: int = 0
    height_px: int = 0
    model: str = "stub"

    # Element types that text_blocks() selects. Deliberately unannotated so the
    # dataclass machinery does not treat it as a field; built once rather than
    # on every call.
    _TEXT_TYPES = frozenset({
        "title", "plain_text", "text_block", "header", "footer",
        "table_caption", "figure_caption", "handwriting",
    })

    # ── Convenience accessors ─────────────────────────────────────────────────

    def text_blocks(self) -> list[ImageElement]:
        """
        Return the text-bearing elements, in document order.

        Selected types: title, plain_text, text_block, header, footer,
        table_caption, figure_caption and handwriting. Elements are included
        even when their ``text`` is empty.
        """
        return [e for e in self.elements if e.element_type in self._TEXT_TYPES]

    def barcodes(self) -> list[ImageElement]:
        """Return all barcode and QR code elements, in document order."""
        return [e for e in self.elements if e.element_type in ("barcode", "qr_code")]

    def tables(self) -> list[ImageElement]:
        """Return all table elements, in document order."""
        return [e for e in self.elements if e.element_type == "table"]

    def full_text(self, separator: str = "\n") -> str:
        """
        Return the text of every element from text_blocks(), joined by *separator*.

        Elements whose text is empty are skipped, so no blank entries appear
        in the result.
        """
        return separator.join(e.text for e in self.text_blocks() if e.text)
|