cf-vision/cf_vision/models.py
pyr0ball 353525c1f4 feat: initial cf-vision scaffold — ImageFrame API, stub inference modules
- cf_vision/models.py: ImageFrame + ImageElement + BoundingBox (MIT)
  Full Dolphin-v2 element taxonomy (21 types), convenience accessors
  (text_blocks, barcodes, tables, full_text)
- cf_vision/router.py: VisionRouter — mock + real paths, task routing
  (document, barcode, receipt, general)
- cf_vision/barcode.py: BarcodeScanner — pyzbar wrapper, CPU-only, MIT
- cf_vision/ocr.py: DolphinOCR — ByteDance/Dolphin-v2 async stub (BSL 1.1)
- cf_vision/receipt.py: ReceiptParser stub — Kiwi Phase 2 target (BSL 1.1)
- cf_vision/camera.py: CameraCapture — OpenCV single-frame capture (MIT)
- pyproject.toml: inference / barcode / camera optional extras
- .env.example: HF_TOKEN, CF_VISION_DEVICE, CF_VISION_MOCK
- README: module map, ImageFrame API reference, consumer roadmap
- tests: 6 passing (ImageFrame accessors, VisionRouter mock/real)

Extracted from circuitforge_core.vision per cf-core#36.
2026-04-06 17:59:00 -07:00

90 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# cf_vision/models.py — ImageFrame API contract
#
# MIT licensed. All consumers (Kiwi, Peregrine, Falcon, Godwit) import
# ImageFrame from here so the shape is consistent across the stack.
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Literal
# Closed set of element labels in the Dolphin-v2 taxonomy (21 types).
# Order is kept stable so typing.get_args(ElementType) is deterministic.
ElementType = Literal[
    "title",
    "plain_text",
    "abandon",
    "figure",
    "figure_caption",
    "table",
    "table_caption",
    "table_footnote",
    "isolate_formula",
    "formula_caption",
    "text_block",
    "inline_formula",
    "header",
    "footer",
    "page_number",
    "seal",
    "handwriting",
    "barcode",
    "qr_code",
    "signature",
    "watermark",
]
@dataclass
class BoundingBox:
    """Location of a detected element, relative to the source image.

    Coordinates are normalised to the 0.0-1.0 range by default; they are
    pixel values only when ``absolute`` is True.
    """

    x: float  # left edge: 0.0-1.0 (normalised), or pixels if absolute=True
    y: float  # top edge, same units as x
    width: float
    height: float
    absolute: bool = False  # True when coordinates are in pixels
@dataclass
class ImageElement:
    """
    A single structured element extracted from an image.

    Produced by cf_vision.ocr (Dolphin-v2) or cf_vision.barcode.
    Consumers iterate over ImageFrame.elements to reconstruct document
    structure.
    """

    element_type: ElementType
    text: str  # extracted text, or empty for non-text types
    confidence: float  # 0.0-1.0
    bbox: BoundingBox | None = None  # None when position is unknown
    # default_factory gives every instance its own dict (no shared state)
    metadata: dict = field(default_factory=dict)
@dataclass
class ImageFrame:
    """
    A fully analysed image from cf-vision.

    Produced by VisionRouter.analyze() and consumed by products that need
    structured content from image sources (receipts, barcodes, documents,
    camera captures).

    Fields
    ------
    source       How the image arrived: "camera" | "upload" | "url" | "mock"
    image_bytes  Original image bytes (JPEG/PNG). None when source="mock".
    elements     Ordered list of extracted elements (top-to-bottom,
                 left-to-right).
    width_px     Source image dimensions, or 0 if unknown.
    height_px
    model        Model that produced the elements, e.g. "dolphin-v2", "mock".
    """

    source: Literal["camera", "upload", "url", "mock"]
    image_bytes: bytes | None
    elements: list[ImageElement] = field(default_factory=list)
    width_px: int = 0
    height_px: int = 0
    model: str = "stub"

    # Element types treated as text-bearing by text_blocks() / full_text().
    # Deliberately left unannotated: @dataclass only turns annotated names
    # into fields, so this stays a plain class-level constant and is built
    # once instead of on every text_blocks() call.
    _TEXT_TYPES = frozenset({
        "title", "plain_text", "text_block", "header", "footer",
        "table_caption", "figure_caption", "handwriting",
    })

    # -- Convenience accessors ------------------------------------------------

    def text_blocks(self) -> list[ImageElement]:
        """Return all elements that carry text, in document order."""
        return [e for e in self.elements if e.element_type in self._TEXT_TYPES]

    def barcodes(self) -> list[ImageElement]:
        """Return all barcode and QR code elements."""
        return [e for e in self.elements if e.element_type in ("barcode", "qr_code")]

    def tables(self) -> list[ImageElement]:
        """Return all table elements."""
        return [e for e in self.elements if e.element_type == "table"]

    def full_text(self, separator: str = "\n") -> str:
        """Return the text of all text-bearing elements joined by *separator*.

        Elements whose text is empty are skipped, so they add no stray
        separators to the result.
        """
        return separator.join(e.text for e in self.text_blocks() if e.text)