# cf-vision/cf_vision/models.py

# cf_vision/models.py — ImageFrame API contract
#
# MIT licensed. All consumers (Kiwi, Peregrine, Falcon, Godwit) import
# ImageFrame from here so the shape is consistent across the stack.
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Literal

# Closed set of element-type tags. ImageElement.element_type is annotated with
# this alias, and the ImageFrame accessors filter elements by these strings.
# The 21 string values below are the complete set accepted by the alias.
ElementType = Literal[
    # Dolphin-v2 element taxonomy (21 types)
    "title", "plain_text", "abandon", "figure", "figure_caption",
    "table", "table_caption", "table_footnote", "isolate_formula",
    "formula_caption", "text_block", "inline_formula", "header",
    "footer", "page_number", "seal", "handwriting", "barcode",
    "qr_code", "signature", "watermark",
]


@dataclass
class BoundingBox:
    """
    Rectangle locating a detected element, relative to the source image.

    Coordinates are normalised (0.0–1.0) by default; they are pixel values
    only when ``absolute`` is True. The class itself performs no validation
    or conversion between the two conventions.
    """
    x: float      # left edge, 0.0–1.0 (normalised) or pixels if absolute=True
    y: float      # top edge
    width: float  # horizontal extent; presumably in the same units as x — TODO confirm
    height: float  # vertical extent; presumably in the same units as y — TODO confirm
    absolute: bool = False   # True when coordinates are in pixels


@dataclass
class ImageElement:
    """
    A single structured element extracted from an image.

    Produced by cf_vision.ocr (Dolphin-v2) or cf_vision.barcode.
    Consumers iterate over ImageFrame.elements to reconstruct document structure.

    The three leading fields are required; ``bbox`` and ``metadata`` are
    optional and default to "unknown position" and an empty dict respectively.
    """
    element_type: ElementType         # one of the ElementType string tags
    text: str                         # extracted text, or empty for non-text types
    confidence: float                 # 0.0–1.0
    bbox: BoundingBox | None = None   # None when position is unknown
    # Extra per-element data. default_factory gives every instance its own
    # dict, so mutating one element's metadata never affects another's.
    metadata: dict = field(default_factory=dict)


@dataclass
class ImageFrame:
    """
    The structured result of analysing one image with cf-vision.

    Instances come out of VisionRouter.analyze() and are handed to products
    that need structured content from image sources (receipts, barcodes,
    documents, camera captures).

    Fields
    ------
    source          How the image arrived: "camera" | "upload" | "url" | "mock"
    image_bytes     Original image bytes (JPEG/PNG). None when source="mock".
    elements        Ordered list of extracted elements (top-to-bottom, left-to-right).
                    Defaults to a fresh empty list per instance.
    width_px        Source image width in pixels, or 0 if unknown.
    height_px       Source image height in pixels, or 0 if unknown.
    model           Model that produced the elements, e.g. "dolphin-v2", "mock".
                    Defaults to "stub".
    """
    source: Literal["camera", "upload", "url", "mock"]
    image_bytes: bytes | None
    elements: list[ImageElement] = field(default_factory=list)
    width_px: int = 0
    height_px: int = 0
    model: str = "stub"

    # ── Convenience accessors ─────────────────────────────────────────────────

    def text_blocks(self) -> list[ImageElement]:
        """Return the text-carrying elements, in the order they appear in ``elements``."""
        # NOTE(review): "table_footnote" and "formula_caption" are not in this
        # set even though the other caption types are — confirm that is intended.
        textual_kinds = frozenset((
            "title", "plain_text", "text_block", "header", "footer",
            "table_caption", "figure_caption", "handwriting",
        ))
        selected = []
        for element in self.elements:
            if element.element_type in textual_kinds:
                selected.append(element)
        return selected

    def barcodes(self) -> list[ImageElement]:
        """Return every element tagged "barcode" or "qr_code", in element order."""
        code_kinds = ("barcode", "qr_code")
        return list(filter(lambda el: el.element_type in code_kinds, self.elements))

    def tables(self) -> list[ImageElement]:
        """Return every element tagged "table", in element order."""
        return list(filter(lambda el: el.element_type == "table", self.elements))

    def full_text(self, separator: str = "\n") -> str:
        """
        Join the text of all text-bearing elements into one string.

        Elements whose ``text`` is empty are skipped, so no blank segments
        appear between separators. Returns "" when nothing qualifies.
        """
        pieces = [block.text for block in self.text_blocks() if block.text]
        return separator.join(pieces)