# cf_vision/models.py — ImageFrame API contract # # MIT licensed. All consumers (Kiwi, Peregrine, Falcon, Godwit) import # ImageFrame from here so the shape is consistent across the stack. from __future__ import annotations from dataclasses import dataclass, field from typing import Literal ElementType = Literal[ # Dolphin-v2 element taxonomy (21 types) "title", "plain_text", "abandon", "figure", "figure_caption", "table", "table_caption", "table_footnote", "isolate_formula", "formula_caption", "text_block", "inline_formula", "header", "footer", "page_number", "seal", "handwriting", "barcode", "qr_code", "signature", "watermark", ] @dataclass class BoundingBox: """Pixel coordinates of a detected element, relative to the source image.""" x: float # left edge, 0.0–1.0 (normalised) or pixels if absolute=True y: float # top edge width: float height: float absolute: bool = False # True when coordinates are in pixels @dataclass class ImageElement: """ A single structured element extracted from an image. Produced by cf_vision.ocr (Dolphin-v2) or cf_vision.barcode. Consumers iterate over ImageFrame.elements to reconstruct document structure. """ element_type: ElementType text: str # extracted text, or empty for non-text types confidence: float # 0.0–1.0 bbox: BoundingBox | None = None # None when position is unknown metadata: dict = field(default_factory=dict) @dataclass class ImageFrame: """ A fully analysed image from cf-vision. Produced by VisionRouter.analyze() and consumed by products that need structured content from image sources (receipts, barcodes, documents, camera captures). Fields ------ source How the image arrived: "camera" | "upload" | "url" | "mock" image_bytes Original image bytes (JPEG/PNG). None when source="mock". elements Ordered list of extracted elements (top-to-bottom, left-to-right). width_px Source image dimensions, or 0 if unknown. height_px model Model that produced the elements, e.g. "dolphin-v2", "mock". """ source: Literal["camera", "upload", "url", "mock"] image_bytes: bytes | None elements: list[ImageElement] = field(default_factory=list) width_px: int = 0 height_px: int = 0 model: str = "stub" # ── Convenience accessors ───────────────────────────────────────────────── def text_blocks(self) -> list[ImageElement]: """All elements that carry text, in document order.""" text_types = { "title", "plain_text", "text_block", "header", "footer", "table_caption", "figure_caption", "handwriting", } return [e for e in self.elements if e.element_type in text_types] def barcodes(self) -> list[ImageElement]: """All barcode and QR code elements.""" return [e for e in self.elements if e.element_type in ("barcode", "qr_code")] def tables(self) -> list[ImageElement]: """All table elements.""" return [e for e in self.elements if e.element_type == "table"] def full_text(self, separator: str = "\n") -> str: """Concatenated text from all text-bearing elements.""" return separator.join(e.text for e in self.text_blocks() if e.text)