From 353525c1f49fbd400f622374bc104319577a9f3c Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Mon, 6 Apr 2026 17:59:00 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20initial=20cf-vision=20scaffold=20?= =?UTF-8?q?=E2=80=94=20ImageFrame=20API,=20stub=20inference=20modules?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - cf_vision/models.py: ImageFrame + ImageElement + BoundingBox (MIT) Full Dolphin-v2 element taxonomy (21 types), convenience accessors (text_blocks, barcodes, tables, full_text) - cf_vision/router.py: VisionRouter — mock + real paths, task routing (document, barcode, receipt, general) - cf_vision/barcode.py: BarcodeScanner — pyzbar wrapper, CPU-only, MIT - cf_vision/ocr.py: DolphinOCR — ByteDance/Dolphin-v2 async stub (BSL 1.1) - cf_vision/receipt.py: ReceiptParser stub — Kiwi Phase 2 target (BSL 1.1) - cf_vision/camera.py: CameraCapture — OpenCV single-frame capture (MIT) - pyproject.toml: inference / barcode / camera optional extras - .env.example: HF_TOKEN, CF_VISION_DEVICE, CF_VISION_MOCK - README: module map, ImageFrame API reference, consumer roadmap - tests: 6 passing (ImageFrame accessors, VisionRouter mock/real) Extracted from circuitforge_core.vision per cf-core#36. 
--- .env.example | 20 ++++++++ .gitignore | 4 ++ README.md | 100 ++++++++++++++++++++++++++++++++++++++ cf_vision/__init__.py | 7 +++ cf_vision/barcode.py | 85 ++++++++++++++++++++++++++++++++ cf_vision/camera.py | 79 ++++++++++++++++++++++++++++++ cf_vision/models.py | 90 ++++++++++++++++++++++++++++++++++ cf_vision/ocr.py | 109 ++++++++++++++++++++++++++++++++++++++++++ cf_vision/receipt.py | 44 +++++++++++++++++ cf_vision/router.py | 107 +++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 48 +++++++++++++++++++ tests/__init__.py | 0 tests/test_models.py | 71 +++++++++++++++++++++++++++ 13 files changed, 764 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 README.md create mode 100644 cf_vision/__init__.py create mode 100644 cf_vision/barcode.py create mode 100644 cf_vision/camera.py create mode 100644 cf_vision/models.py create mode 100644 cf_vision/ocr.py create mode 100644 cf_vision/receipt.py create mode 100644 cf_vision/router.py create mode 100644 pyproject.toml create mode 100644 tests/__init__.py create mode 100644 tests/test_models.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..b941ad5 --- /dev/null +++ b/.env.example @@ -0,0 +1,20 @@ +# cf-vision environment — copy to .env and fill in values +# cf-vision does not auto-load .env; consumers load it in their own startup. + +# ── Dolphin-v2 document parser ──────────────────────────────────────────────── +# HuggingFace model: ByteDance/Dolphin-v2 +# Requires ~8GB VRAM. Download cached automatically on first use. +# Get a token at https://huggingface.co/settings/tokens +HF_TOKEN= + +# ── Compute ─────────────────────────────────────────────────────────────────── +# auto (detect GPU), cuda, cpu +CF_VISION_DEVICE=auto + +# ── Mock mode ───────────────────────────────────────────────────────────────── +# Set to 1 to use synthetic ImageFrame responses — no GPU or camera required. 
+CF_VISION_MOCK= + +# ── OCR confidence threshold ────────────────────────────────────────────────── +# Results below this are marked low-confidence in the ImageFrame output. +CF_VISION_CONFIDENCE_THRESHOLD=0.7 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8ac4c3f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +.env +__pycache__/ +*.egg-info/ +.pytest_cache/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..b69cadd --- /dev/null +++ b/README.md @@ -0,0 +1,100 @@ +# cf-vision + +CircuitForge vision pipeline. Produces `ImageFrame` objects from image sources -- documents, barcodes, receipts, camera captures -- using Dolphin-v2 (local, Free tier) or Claude vision (cloud, Paid tier). + +**Status:** Stub. `VisionRouter` and `BarcodeScanner` API surface locked; real Dolphin-v2 inference lands with Kiwi Phase 2. + +## Install + +```bash +# Stub / mock mode (no GPU required) +pip install -e ../cf-vision + +# Real document OCR (Dolphin-v2, ~8GB VRAM) +pip install -e "../cf-vision[inference]" + +# Barcode / QR scanning (CPU, no GPU) +pip install -e "../cf-vision[barcode]" + +# Camera capture +pip install -e "../cf-vision[camera]" +``` + +Copy `.env.example` to `.env` and fill in `HF_TOKEN` for Dolphin-v2. + +## Quick start + +```python +from cf_vision.router import VisionRouter + +# Mock mode (no hardware needed) +router = VisionRouter(mock=True) +frame = router.analyze(image_bytes, task="document") +for element in frame.elements: + print(element.element_type, element.text) + +# Real mode (requires [inference] extras) +router = VisionRouter.from_env() # reads CF_VISION_MOCK, CF_VISION_DEVICE +``` + +--- + +## ImageFrame + +The primary output type. MIT licensed. 
+ +```python +@dataclass +class ImageFrame: + source: Literal["camera", "upload", "url", "mock"] + image_bytes: bytes | None + elements: list[ImageElement] + width_px: int + height_px: int + model: str # "dolphin-v2", "pyzbar", "claude", "mock" + + def text_blocks() -> list[ImageElement] + def barcodes() -> list[ImageElement] + def tables() -> list[ImageElement] + def full_text(separator="\n") -> str +``` + +### ImageElement + +```python +@dataclass +class ImageElement: + element_type: ElementType # one of 21 Dolphin-v2 types + text: str + confidence: float # 0.0–1.0 + bbox: BoundingBox | None + metadata: dict # e.g. {"format": "EAN13"} for barcodes +``` + +### ElementType (21 types from Dolphin-v2) + +`title` · `plain_text` · `text_block` · `header` · `footer` · `table` · `table_caption` · `table_footnote` · `figure` · `figure_caption` · `isolate_formula` · `formula_caption` · `inline_formula` · `page_number` · `seal` · `handwriting` · `barcode` · `qr_code` · `signature` · `watermark` · `abandon` + +--- + +## Module structure + +| Module | License | Purpose | +|--------|---------|---------| +| `cf_vision.models` | MIT | `ImageFrame`, `ImageElement`, `BoundingBox` | +| `cf_vision.router` | BSL 1.1* | `VisionRouter` — routes to local or cloud model | +| `cf_vision.barcode` | MIT | `BarcodeScanner` — pyzbar wrapper, no GPU | +| `cf_vision.ocr` | BSL 1.1 | `DolphinOCR` — Dolphin-v2 async wrapper | +| `cf_vision.receipt` | BSL 1.1 | `ReceiptParser` — line-item extraction (stub) | +| `cf_vision.camera` | MIT | `CameraCapture` — OpenCV frame capture | + +*BSL applies to inference modules. Models + barcode + camera = MIT. 
class BarcodeScanner:
    """
    Barcode and QR code scanner built on pyzbar.

    Runs entirely on CPU (roughly 5ms per image) — no GPU needed.

    Usage
    -----
        scanner = BarcodeScanner()
        frame = scanner.scan(image_bytes)
        for b in frame.barcodes():
            print(b.text, b.metadata["format"])

    Requires: pip install cf-vision[barcode]
    """

    def scan(self, image_bytes: bytes) -> ImageFrame:
        """
        Decode every barcode and QR code found in ``image_bytes``.

        Returns an ImageFrame whose elements have element_type "barcode"
        or "qr_code", each carrying the decoded text, a normalised
        bounding box, and the symbology in metadata["format"].
        """
        try:
            import io

            from PIL import Image
            from pyzbar.pyzbar import decode as pyzbar_decode
        except ImportError as exc:
            raise ImportError(
                "pyzbar and Pillow are required for barcode scanning. "
                "Install with: pip install cf-vision[barcode]"
            ) from exc

        pil_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        img_w, img_h = pil_img.size

        detected: list[ImageElement] = []
        for sym in pyzbar_decode(pil_img):
            fmt = sym.type.upper()
            region = sym.rect
            detected.append(
                ImageElement(
                    # pyzbar reports QR codes with symbology type "QRCODE"
                    element_type="qr_code" if fmt == "QRCODE" else "barcode",
                    text=sym.data.decode("utf-8", errors="replace"),
                    # pyzbar doesn't give confidence scores
                    confidence=1.0,
                    bbox=BoundingBox(
                        x=region.left / img_w,
                        y=region.top / img_h,
                        width=region.width / img_w,
                        height=region.height / img_h,
                    ),
                    metadata={"format": fmt},
                )
            )

        return ImageFrame(
            source="upload",
            image_bytes=image_bytes,
            elements=detected,
            width_px=img_w,
            height_px=img_h,
            model="pyzbar",
        )
class CameraCapture:
    """
    Single-frame camera capture with preprocessing.

    Captures one frame from a camera device and normalises it to JPEG
    bytes suitable for VisionRouter.analyze() or BarcodeScanner.scan().

    Usage
    -----
        capture = CameraCapture(device_index=0)
        jpeg_bytes = await capture.capture_async()
        frame = router.analyze(jpeg_bytes, task="barcode")

    Requires: pip install cf-vision[camera]
    """

    def __init__(
        self,
        device_index: int = 0,
        width: int = 1280,
        height: int = 720,
        jpeg_quality: int = 92,
    ) -> None:
        """
        Parameters
        ----------
        device_index: OS camera index passed to cv2.VideoCapture.
        width, height: requested capture resolution — the driver may
            deliver a different size; no check is made here.
        jpeg_quality: OpenCV JPEG quality, 0-100.
        """
        self._device_index = device_index
        self._width = width
        self._height = height
        self._jpeg_quality = jpeg_quality

    def capture(self) -> bytes:
        """
        Capture one frame and return it as JPEG bytes.

        Blocks while the camera is opened and read — call capture_async()
        from async code.

        Raises
        ------
        ImportError: OpenCV ([camera] extras) is not installed.
        RuntimeError: the device yields no frame, or JPEG encoding fails.
        """
        try:
            import cv2
        except ImportError as exc:
            raise ImportError(
                "OpenCV is required for camera capture. "
                "Install with: pip install cf-vision[camera]"
            ) from exc

        cap = cv2.VideoCapture(self._device_index)
        try:
            cap.set(cv2.CAP_PROP_FRAME_WIDTH, self._width)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT, self._height)
            ok, frame = cap.read()
            if not ok or frame is None:
                raise RuntimeError(
                    f"Camera device {self._device_index} did not return a frame."
                )
            ok, buf = cv2.imencode(
                ".jpg", frame, [cv2.IMWRITE_JPEG_QUALITY, self._jpeg_quality]
            )
            if not ok:
                raise RuntimeError("JPEG encoding failed")
            return bytes(buf)
        finally:
            # Always release the device, even on failure, so the next open
            # does not find the camera busy.
            cap.release()

    async def capture_async(self) -> bytes:
        """capture() without blocking the event loop."""
        # get_running_loop(), not the deprecated get_event_loop(): this
        # coroutine only ever executes inside a live loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.capture)
# Dolphin-v2 element taxonomy (21 element types).
ElementType = Literal[
    "title", "plain_text", "abandon", "figure", "figure_caption",
    "table", "table_caption", "table_footnote", "isolate_formula",
    "formula_caption", "text_block", "inline_formula", "header",
    "footer", "page_number", "seal", "handwriting", "barcode",
    "qr_code", "signature", "watermark",
]


@dataclass
class BoundingBox:
    """Position of a detected element relative to the source image."""
    x: float       # left edge — 0.0-1.0 normalised, or pixels when absolute=True
    y: float       # top edge
    width: float
    height: float
    absolute: bool = False  # True when the coordinates are pixel values


@dataclass
class ImageElement:
    """
    A single structured element extracted from an image.

    Produced by cf_vision.ocr (Dolphin-v2) or cf_vision.barcode.
    Consumers iterate ImageFrame.elements to reconstruct document structure.
    """
    element_type: ElementType         # one of the 21 Dolphin-v2 types
    text: str                         # extracted text; empty for non-text types
    confidence: float                 # 0.0-1.0
    bbox: BoundingBox | None = None   # None when the position is unknown
    metadata: dict = field(default_factory=dict)  # e.g. {"format": "EAN13"}


@dataclass
class ImageFrame:
    """
    A fully analysed image — the primary cf-vision output type.

    Produced by VisionRouter.analyze() and consumed by products that need
    structured content from image sources (receipts, barcodes, documents,
    camera captures).
    """
    source: Literal["camera", "upload", "url", "mock"]  # how the image arrived
    image_bytes: bytes | None   # original JPEG/PNG bytes; None when source="mock"
    elements: list[ImageElement] = field(default_factory=list)  # reading order
    width_px: int = 0           # 0 when source dimensions are unknown
    height_px: int = 0
    model: str = "stub"         # e.g. "dolphin-v2", "pyzbar", "claude", "mock"

    # ── Convenience accessors ─────────────────────────────────────────────────

    def text_blocks(self) -> list[ImageElement]:
        """All elements that carry text, in document order."""
        textual = {
            "title", "plain_text", "text_block", "header", "footer",
            "table_caption", "figure_caption", "handwriting",
        }
        return [el for el in self.elements if el.element_type in textual]

    def barcodes(self) -> list[ImageElement]:
        """All barcode and QR code elements."""
        return [el for el in self.elements if el.element_type in {"barcode", "qr_code"}]

    def tables(self) -> list[ImageElement]:
        """All table elements."""
        return [el for el in self.elements if el.element_type == "table"]

    def full_text(self, separator: str = "\n") -> str:
        """Concatenated text from every non-empty text-bearing element."""
        pieces = [el.text for el in self.text_blocks() if el.text]
        return separator.join(pieces)
+# Reference: https://huggingface.co/ByteDance/Dolphin-v2 +from __future__ import annotations + +import asyncio +import logging +import os +from functools import partial + +from cf_vision.models import ImageFrame + +logger = logging.getLogger(__name__) + +_DOLPHIN_MODEL_ID = "ByteDance/Dolphin" # HuggingFace model ID + + +class DolphinOCR: + """ + Async wrapper around Dolphin-v2 for structured document parsing. + + Loads the model lazily on first call. Runs in a thread pool executor + so it never blocks the asyncio event loop (~200ms–2s per page on A100). + + Usage + ----- + ocr = DolphinOCR.from_env() + frame = await ocr.parse_async(image_bytes) + for element in frame.elements: + print(element.element_type, element.text[:80]) + + Navigation note: Dolphin-v2 returns elements in reading order + (top-to-bottom, left-to-right). Use ImageFrame.full_text() for a + plain concatenation or iterate elements for structured access. + + Consumer roadmap: + Kiwi Phase 2 — receipt line-item extraction + Peregrine — resume document parsing + Falcon — government form scanning + Godwit — identity document recovery + """ + + def __init__(self, device: str = "auto") -> None: + self._device = device + self._model = None + self._processor = None + + @classmethod + def from_env(cls) -> "DolphinOCR": + return cls(device=os.environ.get("CF_VISION_DEVICE", "auto")) + + def _load(self) -> None: + if self._model is not None: + return + try: + from transformers import AutoModelForCausalLM, AutoProcessor + import torch + except ImportError as exc: + raise ImportError( + "Dolphin-v2 requires [inference] extras: " + "pip install cf-vision[inference]" + ) from exc + + device = self._device + if device == "auto": + device = "cuda" if _cuda_available() else "cpu" + + hf_token = os.environ.get("HF_TOKEN") or None + logger.info("Loading Dolphin-v2 on %s", device) + self._processor = AutoProcessor.from_pretrained( + _DOLPHIN_MODEL_ID, token=hf_token + ) + self._model = 
AutoModelForCausalLM.from_pretrained( + _DOLPHIN_MODEL_ID, + token=hf_token, + torch_dtype="auto", + device_map=device, + ) + + def parse(self, image_bytes: bytes) -> ImageFrame: + """ + Parse document image bytes into a structured ImageFrame. + + Stub: raises NotImplementedError. Real implementation coming in Kiwi Phase 2. + """ + self._load() + raise NotImplementedError( + "DolphinOCR.parse() is not yet implemented. " + "Tracking: Kiwi Phase 2 / cf-vision#TBD" + ) + + async def parse_async(self, image_bytes: bytes) -> ImageFrame: + """parse() without blocking the event loop.""" + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, self.parse, image_bytes) + + +def _cuda_available() -> bool: + try: + import torch + return torch.cuda.is_available() + except ImportError: + return False diff --git a/cf_vision/receipt.py b/cf_vision/receipt.py new file mode 100644 index 0000000..addadc6 --- /dev/null +++ b/cf_vision/receipt.py @@ -0,0 +1,44 @@ +# cf_vision/receipt.py — receipt line-item extraction +# +# BSL 1.1: real inference. Dolphin-v2 + post-processing. +# Stub: raises NotImplementedError until Kiwi Phase 2. +# +# Planned pipeline: +# DolphinOCR.parse(image_bytes) → ImageFrame with table/text elements +# ReceiptParser.extract(frame) → list[LineItem] +# ProductResolver.resolve(items) → matched pantry items (Kiwi-specific) +from __future__ import annotations + +from dataclasses import dataclass, field + + +@dataclass +class LineItem: + """A single line item extracted from a receipt.""" + name: str + quantity: float = 1.0 + unit: str = "" # "g", "ml", "oz", "each", etc. + price: float | None = None + barcode: str | None = None + confidence: float = 0.0 + + +class ReceiptParser: + """ + Extract line items from a receipt ImageFrame. + + Stub: raises NotImplementedError until Kiwi Phase 2. + Consumer: Kiwi Phase 2 pantry auto-population from receipt photos. + + Real pipeline: + 1. 
DolphinOCR produces an ImageFrame with table rows and text blocks + 2. ReceiptParser identifies the items section (skip header/footer/totals) + 3. Per-row NLP extracts name, quantity, unit, price + 4. Optional: barcode lookup if any barcode elements present + """ + + def extract(self, frame: "ImageFrame") -> list[LineItem]: # type: ignore[name-defined] + raise NotImplementedError( + "ReceiptParser.extract() is not yet implemented. " + "Tracking: Kiwi Phase 2 / cf-vision#TBD" + ) diff --git a/cf_vision/router.py b/cf_vision/router.py new file mode 100644 index 0000000..62eb9d7 --- /dev/null +++ b/cf_vision/router.py @@ -0,0 +1,107 @@ +# cf_vision/router.py — VisionRouter, the primary consumer API +# +# BSL 1.1 when real inference models are integrated (Dolphin-v2, Claude vision). +# Currently a stub: analyze() raises NotImplementedError unless mock=True. +from __future__ import annotations + +import os +from typing import Literal + +from cf_vision.models import ImageFrame, ImageElement, BoundingBox + +_MOCK_ELEMENTS = [ + ImageElement( + element_type="title", + text="[Mock document title]", + confidence=0.99, + bbox=BoundingBox(x=0.05, y=0.02, width=0.9, height=0.06), + ), + ImageElement( + element_type="plain_text", + text="[Mock paragraph — real content requires cf-vision[inference] and a vision model.]", + confidence=0.95, + bbox=BoundingBox(x=0.05, y=0.12, width=0.9, height=0.08), + ), +] + + +class VisionRouter: + """ + Routes image analysis requests to local or cloud vision models. 
class VisionRouter:
    """
    Routes image analysis requests to local or cloud vision models.

    Local models (Free tier):
      - Dolphin-v2 (ByteDance) — universal document parser, 21 element types
      - pyzbar — barcode / QR code scanning (no GPU required)

    Cloud fallback (Paid tier):
      - Claude vision API — general-purpose image understanding

    Usage
    -----
        router = VisionRouter.from_env()
        frame = router.analyze(image_bytes, task="document")
        for element in frame.elements:
            print(element.element_type, element.text)
    """

    def __init__(
        self,
        mock: bool = False,
        device: str = "auto",
    ) -> None:
        # mock: return synthetic frames instead of running a model.
        # device: "auto" (detect GPU), "cuda", or "cpu".
        self._mock = mock
        self._device = device

    @classmethod
    def from_env(cls) -> "VisionRouter":
        """Construct from CF_VISION_MOCK and CF_VISION_DEVICE env vars."""
        mock = os.environ.get("CF_VISION_MOCK", "") == "1"
        device = os.environ.get("CF_VISION_DEVICE", "auto")
        return cls(mock=mock, device=device)

    def analyze(
        self,
        image_bytes: bytes,
        task: Literal["document", "barcode", "receipt", "general"] = "document",
        prompt: str = "",
    ) -> ImageFrame:
        """
        Analyse image_bytes and return a structured ImageFrame.

        task:
          "document" — full document parsing via Dolphin-v2 (all 21 element types)
          "barcode"  — barcode / QR code extraction via pyzbar (lightweight)
          "receipt"  — receipt line-item extraction (Dolphin-v2 + post-processing)
          "general"  — general image understanding via Claude vision (cloud, Paid tier)

        Stub: raises NotImplementedError unless CF_VISION_MOCK=1 or mock=True.
        Real implementation lands with Kiwi Phase 2 (cf_vision.ocr, cf_vision.barcode).
        """
        if self._mock:
            return self._mock_frame(image_bytes, task)

        raise NotImplementedError(
            "VisionRouter real inference is not yet implemented. "
            "Set CF_VISION_MOCK=1 or mock=True to use synthetic frames. "
            "Real analysis requires: pip install cf-vision[inference]"
        )

    def _mock_frame(self, image_bytes: bytes, task: str) -> ImageFrame:
        """Build a synthetic ImageFrame for mock mode (no model, no GPU)."""
        from cf_vision.models import BoundingBox, ImageElement, ImageFrame

        # Elements are constructed fresh on every call. A module-level
        # template list would alias the same mutable ImageElement instances
        # into every returned frame, so one caller mutating an element
        # would corrupt all later mock results.
        if task == "barcode":
            elements = [
                ImageElement(
                    element_type="barcode",
                    text="0123456789012",
                    confidence=0.99,
                    metadata={"format": "EAN13"},
                )
            ]
        else:
            elements = [
                ImageElement(
                    element_type="title",
                    text="[Mock document title]",
                    confidence=0.99,
                    bbox=BoundingBox(x=0.05, y=0.02, width=0.9, height=0.06),
                ),
                ImageElement(
                    element_type="plain_text",
                    text="[Mock paragraph — real content requires cf-vision[inference] and a vision model.]",
                    confidence=0.95,
                    bbox=BoundingBox(x=0.05, y=0.12, width=0.9, height=0.08),
                ),
            ]
        return ImageFrame(
            source="mock",
            image_bytes=None,
            elements=elements,
            model="mock",
        )
def test_imageframe_text_blocks():
    """text_blocks() keeps text-bearing types and skips barcodes."""
    from cf_vision.models import ImageElement, ImageFrame

    fr = ImageFrame(
        source="mock",
        image_bytes=None,
        elements=[
            ImageElement(element_type="title", text="My Doc", confidence=0.99),
            ImageElement(element_type="barcode", text="123456", confidence=1.0),
            ImageElement(element_type="plain_text", text="Body text.", confidence=0.9),
        ],
        model="mock",
    )
    found = fr.text_blocks()
    assert len(found) == 2
    assert all(el.element_type in ("title", "plain_text") for el in found)


def test_imageframe_barcodes():
    """barcodes() surfaces decoded codes with their metadata intact."""
    from cf_vision.models import ImageElement, ImageFrame

    fr = ImageFrame(
        source="mock",
        image_bytes=None,
        elements=[
            ImageElement(
                element_type="barcode",
                text="0123456789012",
                confidence=1.0,
                metadata={"format": "EAN13"},
            ),
        ],
        model="mock",
    )
    codes = fr.barcodes()
    assert len(codes) == 1
    assert codes[0].text == "0123456789012"


def test_imageframe_full_text():
    """full_text() joins text elements with the default newline separator."""
    from cf_vision.models import ImageElement, ImageFrame

    fr = ImageFrame(
        source="mock",
        image_bytes=None,
        elements=[
            ImageElement(element_type="title", text="Title", confidence=0.99),
            ImageElement(element_type="plain_text", text="Paragraph.", confidence=0.9),
        ],
        model="mock",
    )
    assert fr.full_text() == "Title\nParagraph."


def test_visionrouter_mock():
    """Mock mode returns a non-empty synthetic frame without any model."""
    from cf_vision.router import VisionRouter

    router = VisionRouter(mock=True)
    fr = router.analyze(b"fake_image_bytes", task="document")
    assert fr.source == "mock"
    assert len(fr.elements) > 0


def test_visionrouter_mock_barcode():
    """Mock barcode task yields exactly one barcode element."""
    from cf_vision.router import VisionRouter

    router = VisionRouter(mock=True)
    fr = router.analyze(b"fake", task="barcode")
    codes = fr.barcodes()
    assert len(codes) == 1
    assert codes[0].element_type == "barcode"


def test_visionrouter_real_raises():
    """Non-mock analyze() is still a stub and must raise."""
    import pytest
    from cf_vision.router import VisionRouter

    router = VisionRouter(mock=False)
    with pytest.raises(NotImplementedError):
        router.analyze(b"fake")