feat: cf-vision managed service (#43)
Some checks are pending
CI / test (push) Waiting to run
Mirror / mirror (push) Waiting to run

SigLIP so400m-patch14-384 as default backend (classify + embed, ~1.4 GB VRAM).
VLM backend (moondream2, LLaVA, Qwen-VL, etc.) as callable alternative for
caption generation and VQA.  Follows the same factory/Protocol/mock pattern
as cf-stt and cf-tts.

New module: circuitforge_core.vision
- backends/base.py  — VisionBackend Protocol, VisionResult, make_vision_backend()
- backends/mock.py  — MockVisionBackend (no GPU, deterministic)
- backends/siglip.py — SigLIPBackend: sigmoid zero-shot classify + L2 embed
- backends/vlm.py   — VLMBackend: AutoModelForVision2Seq caption + prompt classify
- __init__.py       — process singleton; classify(), embed(), caption(), make_backend()
- app.py            — FastAPI service (port 8006): /health /classify /embed /caption

Backend selection: CF_VISION_BACKEND=siglip|vlm, auto-detected from model path.
VLM backend: supports_embed=False, caption()/classify() only.
SigLIP backend: supports_caption=False, classify()/embed() only.

52 new tests, 385 total passing. Closes #43.
This commit is contained in:
pyr0ball 2026-04-09 06:53:43 -07:00
parent 80b0d5fd34
commit 8c1daf3b6c
12 changed files with 1354 additions and 28 deletions

View file

@ -1,3 +1,108 @@
"""
circuitforge_core.vision — Managed vision service module.

Quick start (mock mode — no GPU or model required):
import os; os.environ["CF_VISION_MOCK"] = "1"
from circuitforge_core.vision import classify, embed
result = classify(image_bytes, labels=["cat", "dog", "bird"])
print(result.top(1)) # [("cat", 0.82)]
emb = embed(image_bytes)
print(len(emb.embedding)) # 1152 (so400m hidden dim)
Real inference (SigLIP default, ~1.4 GB VRAM):
export CF_VISION_MODEL=google/siglip-so400m-patch14-384
from circuitforge_core.vision import classify
Full VLM inference (caption + VQA):
export CF_VISION_BACKEND=vlm
export CF_VISION_MODEL=vikhyatk/moondream2
from circuitforge_core.vision import caption
Per-request backend (bypasses process singleton):
from circuitforge_core.vision import make_backend
vlm = make_backend("vikhyatk/moondream2", backend="vlm")
result = vlm.caption(image_bytes, prompt="What text appears in this image?")
cf-orch service profile:
service_type: cf-vision
max_mb: 1536 (siglip-so400m); 2200 (moondream2); 14500 (llava-7b)
max_concurrent: 4 (siglip); 1 (vlm)
shared: true
managed:
exec: python -m circuitforge_core.vision.app
args: --model <path> --backend siglip --port {port} --gpu-id {gpu_id}
port: 8006
health: /health
"""
from __future__ import annotations
import os
from circuitforge_core.vision.backends.base import (
VisionBackend,
VisionResult,
make_vision_backend,
)
from circuitforge_core.vision.backends.mock import MockVisionBackend
_backend: VisionBackend | None = None


def _get_backend() -> VisionBackend:
    """Return the process-wide backend, creating it lazily on first use.

    The model path comes from CF_VISION_MODEL (default "mock"); mock mode is
    forced when that path is "mock" or when CF_VISION_MOCK=1.
    """
    global _backend
    if _backend is not None:
        return _backend
    path = os.environ.get("CF_VISION_MODEL", "mock")
    use_mock = path == "mock" or os.environ.get("CF_VISION_MOCK", "") == "1"
    _backend = make_vision_backend(path, mock=use_mock)
    return _backend
def classify(image: bytes, labels: list[str]) -> VisionResult:
    """Zero-shot image classification using the process-level backend."""
    backend = _get_backend()
    return backend.classify(image, labels)
def embed(image: bytes) -> VisionResult:
    """Image embedding using the process-level backend (SigLIP only)."""
    backend = _get_backend()
    return backend.embed(image)
def caption(image: bytes, prompt: str = "") -> VisionResult:
    """Image captioning / VQA using the process-level backend (VLM only)."""
    backend = _get_backend()
    return backend.caption(image, prompt)
def make_backend(
    model_path: str,
    backend: str | None = None,
    mock: bool | None = None,
    device: str = "cuda",
    dtype: str = "float16",
) -> VisionBackend:
    """
    Create a one-off VisionBackend without affecting the process singleton.

    Useful when a product needs both SigLIP (routing) and a VLM (captioning)
    in the same process, or when testing different models side-by-side.

    model_path: HuggingFace model ID or local path.
    backend:    "siglip", "vlm", or None (falls back to CF_VISION_BACKEND,
                then auto-detection from the model path).
    mock:       force mock mode; None defers to the CF_VISION_MOCK env var.
    device, dtype: forwarded to real backends; ignored by the mock backend.
    """
    return make_vision_backend(
        model_path, backend=backend, mock=mock, device=device, dtype=dtype
    )


# Public API of circuitforge_core.vision.
__all__ = [
    "VisionBackend",
    "VisionResult",
    "MockVisionBackend",
    "classify",
    "embed",
    "caption",
    "make_backend",
]

View file

@ -0,0 +1,245 @@
"""
circuitforge_core.vision.app cf-vision FastAPI service.
Managed by cf-orch as a process-type service. cf-orch starts this via:
python -m circuitforge_core.vision.app \
--model google/siglip-so400m-patch14-384 \
--backend siglip \
--port 8006 \
--gpu-id 0
For VLM inference (caption/VQA):
python -m circuitforge_core.vision.app \
--model vikhyatk/moondream2 \
--backend vlm \
--port 8006 \
--gpu-id 0
Endpoints:
GET /health {"status": "ok", "model": "...", "vram_mb": n,
"supports_embed": bool, "supports_caption": bool}
POST /classify VisionClassifyResponse (multipart: image + labels)
POST /embed VisionEmbedResponse (multipart: image)
POST /caption VisionCaptionResponse (multipart: image + prompt)
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
import uvicorn
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from pydantic import BaseModel
from circuitforge_core.vision.backends.base import make_vision_backend
logger = logging.getLogger(__name__)
# ── Response models ───────────────────────────────────────────────────────────
class VisionClassifyResponse(BaseModel):
    # scores[i] is the score for labels[i]; labels echo the request order.
    labels: list[str]
    scores: list[float]
    model: str


class VisionEmbedResponse(BaseModel):
    # Flat embedding vector; length is the backend's hidden dimension.
    embedding: list[float]
    model: str


class VisionCaptionResponse(BaseModel):
    # Generated caption / VQA answer text.
    caption: str
    model: str


class HealthResponse(BaseModel):
    # Payload for GET /health; capability flags describe the loaded backend.
    status: str
    model: str
    vram_mb: int
    backend: str
    supports_embed: bool
    supports_caption: bool
# ── App factory ───────────────────────────────────────────────────────────────
def create_app(
    model_path: str,
    backend: str = "siglip",
    device: str = "cuda",
    dtype: str = "float16",
    mock: bool = False,
) -> FastAPI:
    """
    Build the cf-vision FastAPI app around a single backend instance.

    The backend (and therefore the model) is loaded eagerly, once, when the
    app is created; every endpoint closes over that instance.

    model_path: HuggingFace model ID or local path.
    backend:    "siglip" (classify + embed) or "vlm" (caption + classify).
    device, dtype: forwarded to the real backends; ignored by mock.
    mock:       use MockVisionBackend (no GPU or model files needed).
    """
    app = FastAPI(title="cf-vision", version="0.1.0")
    # Single backend instance shared by all endpoints below.
    _backend = make_vision_backend(
        model_path, backend=backend, device=device, dtype=dtype, mock=mock
    )
    logger.info(
        "cf-vision ready: model=%r backend=%r vram=%dMB",
        _backend.model_name, backend, _backend.vram_mb,
    )

    @app.get("/health", response_model=HealthResponse)
    async def health() -> HealthResponse:
        # Capability flags let callers discover what this backend supports.
        # NOTE(review): `backend` echoes the *requested* name; in mock mode
        # the actual instance is MockVisionBackend.
        return HealthResponse(
            status="ok",
            model=_backend.model_name,
            vram_mb=_backend.vram_mb,
            backend=backend,
            supports_embed=_backend.supports_embed,
            supports_caption=_backend.supports_caption,
        )

    @app.post("/classify", response_model=VisionClassifyResponse)
    async def classify(
        image: UploadFile = File(..., description="Image file (JPEG, PNG, WEBP, ...)"),
        labels: str = Form(
            ...,
            description=(
                "Candidate labels — either a JSON array "
                '(["cat","dog"]) or comma-separated (cat,dog)'
            ),
        ),
    ) -> VisionClassifyResponse:
        # Validate inputs first: 400 for client errors, 500 for backend failures.
        image_bytes = await image.read()
        if not image_bytes:
            raise HTTPException(status_code=400, detail="Empty image file")
        parsed_labels = _parse_labels(labels)
        if not parsed_labels:
            raise HTTPException(status_code=400, detail="At least one label is required")
        try:
            result = _backend.classify(image_bytes, parsed_labels)
        except Exception as exc:
            logger.exception("classify failed")
            raise HTTPException(status_code=500, detail=str(exc)) from exc
        return VisionClassifyResponse(
            labels=result.labels, scores=result.scores, model=result.model
        )

    @app.post("/embed", response_model=VisionEmbedResponse)
    async def embed_image(
        image: UploadFile = File(..., description="Image file (JPEG, PNG, WEBP, ...)"),
    ) -> VisionEmbedResponse:
        # 501 (Not Implemented) when the loaded backend cannot embed.
        if not _backend.supports_embed:
            raise HTTPException(
                status_code=501,
                detail=(
                    f"Backend '{backend}' does not support embedding. "
                    "Use backend=siglip for embed()."
                ),
            )
        image_bytes = await image.read()
        if not image_bytes:
            raise HTTPException(status_code=400, detail="Empty image file")
        try:
            result = _backend.embed(image_bytes)
        except Exception as exc:
            logger.exception("embed failed")
            raise HTTPException(status_code=500, detail=str(exc)) from exc
        return VisionEmbedResponse(embedding=result.embedding or [], model=result.model)

    @app.post("/caption", response_model=VisionCaptionResponse)
    async def caption_image(
        image: UploadFile = File(..., description="Image file (JPEG, PNG, WEBP, ...)"),
        prompt: str = Form(
            "",
            description="Optional instruction / question for the VLM",
        ),
    ) -> VisionCaptionResponse:
        # 501 (Not Implemented) when the loaded backend cannot caption.
        if not _backend.supports_caption:
            raise HTTPException(
                status_code=501,
                detail=(
                    f"Backend '{backend}' does not support caption generation. "
                    "Use backend=vlm for caption()."
                ),
            )
        image_bytes = await image.read()
        if not image_bytes:
            raise HTTPException(status_code=400, detail="Empty image file")
        try:
            result = _backend.caption(image_bytes, prompt=prompt)
        except Exception as exc:
            logger.exception("caption failed")
            raise HTTPException(status_code=500, detail=str(exc)) from exc
        return VisionCaptionResponse(caption=result.caption or "", model=result.model)

    return app
# ── Label parsing ─────────────────────────────────────────────────────────────
def _parse_labels(raw: str) -> list[str]:
"""Accept JSON array or comma-separated label string."""
stripped = raw.strip()
if stripped.startswith("["):
try:
parsed = json.loads(stripped)
if isinstance(parsed, list):
return [str(x) for x in parsed]
except json.JSONDecodeError:
pass
return [lbl.strip() for lbl in stripped.split(",") if lbl.strip()]
# ── CLI entry point ───────────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point: parse args, configure logging, build and serve the app."""
    parser = argparse.ArgumentParser(description="cf-vision — CircuitForge vision service")
    parser.add_argument(
        "--model",
        default="google/siglip-so400m-patch14-384",
        help="HuggingFace model ID or local path",
    )
    parser.add_argument(
        "--backend", default="siglip", choices=["siglip", "vlm"],
        help="Vision backend: siglip (classify+embed) or vlm (caption+classify)",
    )
    parser.add_argument("--port", type=int, default=8006)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0)
    parser.add_argument("--device", default="cuda", choices=["cuda", "cpu"])
    parser.add_argument("--dtype", default="float16",
                        choices=["float16", "bfloat16", "float32"])
    parser.add_argument("--mock", action="store_true",
                        help="Run with mock backend (no GPU, for testing)")
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s %(message)s",
    )
    # Pin this process to the requested GPU *before* the model is loaded.
    # setdefault: an externally-set CUDA_VISIBLE_DEVICES takes precedence.
    if args.device == "cuda" and not args.mock:
        os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))
    # CF_VISION_MOCK=1 forces mock mode even without --mock (CI convenience).
    mock = args.mock or os.environ.get("CF_VISION_MOCK", "") == "1"
    app = create_app(
        model_path=args.model,
        backend=args.backend,
        device=args.device,
        dtype=args.dtype,
        mock=mock,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,4 @@
from circuitforge_core.vision.backends.base import VisionBackend, VisionResult, make_vision_backend
from circuitforge_core.vision.backends.mock import MockVisionBackend
__all__ = ["VisionBackend", "VisionResult", "make_vision_backend", "MockVisionBackend"]

View file

@ -0,0 +1,150 @@
# circuitforge_core/vision/backends/base.py — VisionBackend Protocol + factory
#
# MIT licensed. The Protocol and mock are always importable without GPU deps.
# Real backends require optional extras:
# pip install -e "circuitforge-core[vision-siglip]" # SigLIP (default, ~1.4 GB VRAM)
# pip install -e "circuitforge-core[vision-vlm]" # Full VLM (e.g. moondream, LLaVA)
from __future__ import annotations
import os
from dataclasses import dataclass, field
from typing import Protocol, runtime_checkable
# ── Result type ───────────────────────────────────────────────────────────────
@dataclass(frozen=True)
class VisionResult:
    """
    Standard result from any VisionBackend call.

    classify() — labels + scores populated; embedding/caption may be None.
    embed()    — embedding populated; labels/scores empty.
    caption()  — caption populated; labels/scores empty; embedding None.
    """

    labels: list[str] = field(default_factory=list)
    scores: list[float] = field(default_factory=list)
    embedding: list[float] | None = None
    caption: str | None = None
    model: str = ""

    def top(self, n: int = 1) -> list[tuple[str, float]]:
        """Return the top-n (label, score) pairs, highest score first."""
        ranked = list(zip(self.labels, self.scores))
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:n]
# ── Protocol ──────────────────────────────────────────────────────────────────
@runtime_checkable
class VisionBackend(Protocol):
    """
    Abstract interface for vision backends.

    All backends load their model once at construction time.

    SigLIP backends implement classify() and embed() but raise NotImplementedError
    for caption(). VLM backends implement caption() and a prompt-based classify()
    but raise NotImplementedError for embed(). Check supports_embed /
    supports_caption before calling the optional methods.
    """

    def classify(self, image: bytes, labels: list[str]) -> VisionResult:
        """
        Zero-shot image classification.

        image: raw encoded image bytes (JPEG/PNG/...).
        labels: candidate text descriptions; scores are returned in the same order.
        SigLIP uses sigmoid similarity; VLM prompts for each label.
        """
        ...

    def embed(self, image: bytes) -> VisionResult:
        """
        Return an image embedding vector.

        Available on SigLIP backends. Raises NotImplementedError on VLM backends.
        embedding is a list of floats with length == model hidden dim.
        """
        ...

    def caption(self, image: bytes, prompt: str = "") -> VisionResult:
        """
        Generate a text description of the image.

        Available on VLM backends. Raises NotImplementedError on SigLIP backends.
        prompt is an optional instruction; defaults to a generic description request.
        """
        ...

    @property
    def model_name(self) -> str:
        """Identifier for the loaded model (HuggingFace ID or path stem)."""
        ...

    @property
    def vram_mb(self) -> int:
        """Approximate VRAM footprint in MB. Used by cf-orch service registry."""
        ...

    @property
    def supports_embed(self) -> bool:
        """True if embed() is implemented (SigLIP backends)."""
        ...

    @property
    def supports_caption(self) -> bool:
        """True if caption() is implemented (VLM backends)."""
        ...
# ── Factory ───────────────────────────────────────────────────────────────────
def make_vision_backend(
    model_path: str,
    backend: str | None = None,
    mock: bool | None = None,
    device: str = "cuda",
    dtype: str = "float16",
) -> VisionBackend:
    """
    Return a VisionBackend for the given model.

    mock=True or CF_VISION_MOCK=1 -> MockVisionBackend (no GPU, no model file needed)
    backend="siglip"              -> SigLIPBackend (default; classify + embed)
    backend="vlm"                 -> VLMBackend (caption + prompt-based classify)

    Auto-detection: if model_path looks like a generative VLM (moondream,
    LLaVA, Qwen-VL, ...) -> VLMBackend; otherwise -> SigLIPBackend.

    Backend names are trimmed and matched case-insensitively, so
    CF_VISION_BACKEND=SigLIP works.  device and dtype are forwarded to the
    real backends and ignored by mock.
    """
    use_mock = mock if mock is not None else os.environ.get("CF_VISION_MOCK", "") == "1"
    if use_mock:
        # Imported lazily so the mock path never pulls in torch/transformers.
        from circuitforge_core.vision.backends.mock import MockVisionBackend

        return MockVisionBackend(model_name=model_path)
    # Normalise so env-var typos in casing/whitespace don't hard-fail.
    resolved = (backend or os.environ.get("CF_VISION_BACKEND", "")).strip().lower()
    if not resolved:
        # Auto-detect from model path.
        resolved = "vlm" if _looks_like_vlm(model_path) else "siglip"
    if resolved == "siglip":
        from circuitforge_core.vision.backends.siglip import SigLIPBackend

        return SigLIPBackend(model_path=model_path, device=device, dtype=dtype)
    if resolved == "vlm":
        from circuitforge_core.vision.backends.vlm import VLMBackend

        return VLMBackend(model_path=model_path, device=device, dtype=dtype)
    raise ValueError(
        f"Unknown vision backend {resolved!r}. "
        "Expected 'siglip' or 'vlm'. Set CF_VISION_BACKEND or pass backend= explicitly."
    )
def _looks_like_vlm(model_path: str) -> bool:
"""Heuristic: names associated with generative VLMs."""
_vlm_hints = ("llava", "moondream", "qwen-vl", "qwenvl", "idefics",
"cogvlm", "internvl", "phi-3-vision", "phi3vision",
"dolphin", "paligemma")
lower = model_path.lower()
return any(h in lower for h in _vlm_hints)

View file

@ -0,0 +1,62 @@
# circuitforge_core/vision/backends/mock.py — MockVisionBackend
#
# Deterministic stub for tests and CI. No GPU, no model files required.
from __future__ import annotations
import math
from circuitforge_core.vision.backends.base import VisionBackend, VisionResult
class MockVisionBackend:
    """
    Mock VisionBackend for testing.

    classify() returns uniform scores (1/n for each of the n labels).
    embed() returns a unit vector of length 512 (all values 1/sqrt(512)).
    caption() returns a canned string.
    """

    def __init__(self, model_name: str = "mock") -> None:
        self._model_name = model_name

    # ── VisionBackend Protocol ─────────────────────────────────────────────────

    def classify(self, image: bytes, labels: list[str]) -> VisionResult:
        # Uniform distribution; max(..., 1) guards the empty-labels case.
        uniform = 1.0 / max(len(labels), 1)
        return VisionResult(
            labels=list(labels),
            scores=[uniform for _ in labels],
            model=self._model_name,
        )

    def embed(self, image: bytes) -> VisionResult:
        # 512 identical components of 1/sqrt(512) — an L2 unit vector.
        dim = 512
        component = 1.0 / math.sqrt(dim)
        return VisionResult(embedding=[component] * dim, model=self._model_name)

    def caption(self, image: bytes, prompt: str = "") -> VisionResult:
        return VisionResult(
            caption="A mock image description for testing purposes.",
            model=self._model_name,
        )

    @property
    def model_name(self) -> str:
        return self._model_name

    @property
    def vram_mb(self) -> int:
        # The mock loads nothing, so it occupies no VRAM.
        return 0

    @property
    def supports_embed(self) -> bool:
        return True

    @property
    def supports_caption(self) -> bool:
        return True


# Verify protocol compliance at import time (catches missing methods early).
assert isinstance(MockVisionBackend(), VisionBackend)

View file

@ -0,0 +1,138 @@
# circuitforge_core/vision/backends/siglip.py — SigLIPBackend
#
# Requires: pip install -e "circuitforge-core[vision-siglip]"
# Default model: google/siglip-so400m-patch14-384 (~1.4 GB VRAM)
#
# SigLIP uses sigmoid cross-entropy rather than softmax over labels, so each
# score is an independent 01 probability. This is better than CLIP for
# multi-label classification and document routing.
from __future__ import annotations
import io
from circuitforge_core.vision.backends.base import VisionResult
_DEFAULT_MODEL = "google/siglip-so400m-patch14-384"
# VRAM footprints by model variant (MB, fp16).
_VRAM_TABLE: dict[str, int] = {
"siglip-so400m-patch14-384": 1440,
"siglip-so400m-patch14-224": 1440,
"siglip-base-patch16-224": 340,
"siglip-large-patch16-256": 690,
}
def _estimate_vram(model_path: str) -> int:
lower = model_path.lower()
for key, mb in _VRAM_TABLE.items():
if key in lower:
return mb
return 1500 # conservative default for unknown so400m variants
class SigLIPBackend:
    """
    Image classification + embedding via Google SigLIP.

    classify() returns sigmoid similarity scores for each candidate label —
    independent probabilities, not a softmax distribution.
    embed() returns the model's image embedding (L2-normalised).
    caption() raises NotImplementedError — use VLMBackend for generation.
    """

    def __init__(
        self,
        model_path: str = _DEFAULT_MODEL,
        device: str = "cuda",
        dtype: str = "float16",
    ) -> None:
        # Import lazily so the Protocol/mock stay importable without GPU deps.
        try:
            import torch
            from transformers import AutoProcessor, AutoModel
        except ImportError as exc:
            raise ImportError(
                "SigLIPBackend requires torch and transformers. "
                "Install with: pip install -e 'circuitforge-core[vision-siglip]'"
            ) from exc
        import torch as _torch  # NOTE(review): redundant — `torch` is already bound above
        self._device = device
        self._dtype_str = dtype
        self._torch_dtype = (
            _torch.float16 if dtype == "float16"
            else _torch.bfloat16 if dtype == "bfloat16"
            else _torch.float32
        )
        self._model_path = model_path
        self._vram_mb = _estimate_vram(model_path)
        self._processor = AutoProcessor.from_pretrained(model_path)
        self._model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=self._torch_dtype,
        ).to(device)
        # Inference mode: train(False) is equivalent to model.eval() (disables
        # dropout etc.); gradients are disabled per-call via torch.no_grad().
        self._model.train(False)

    # ── VisionBackend Protocol ─────────────────────────────────────────────────

    def classify(self, image: bytes, labels: list[str]) -> VisionResult:
        """Zero-shot sigmoid classification — scores are independent per label."""
        import torch
        from PIL import Image

        pil_img = Image.open(io.BytesIO(image)).convert("RGB")
        # padding="max_length" matches SigLIP's training setup (per the HF
        # model docs) — other padding modes degrade scores.
        inputs = self._processor(
            text=labels,
            images=pil_img,
            return_tensors="pt",
            padding="max_length",
        ).to(self._device)
        # NOTE(review): .to(device) moves tensors but does not cast
        # pixel_values to self._torch_dtype — confirm the installed
        # transformers version handles fp16 models with float32 inputs.
        with torch.no_grad():
            outputs = self._model(**inputs)
        # logits_per_image: (1, num_labels) — raw SigLIP logits
        logits = outputs.logits_per_image[0]
        # Sigmoid (not softmax): each label is scored independently in [0, 1].
        scores = torch.sigmoid(logits).cpu().float().tolist()
        return VisionResult(labels=list(labels), scores=scores, model=self.model_name)

    def embed(self, image: bytes) -> VisionResult:
        """Return the image embedding from get_image_features, L2-normalised."""
        import torch
        from PIL import Image

        pil_img = Image.open(io.BytesIO(image)).convert("RGB")
        inputs = self._processor(images=pil_img, return_tensors="pt").to(self._device)
        with torch.no_grad():
            image_features = self._model.get_image_features(**inputs)
        # L2-normalise so dot-product == cosine similarity
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        embedding = image_features[0].cpu().float().tolist()
        return VisionResult(embedding=embedding, model=self.model_name)

    def caption(self, image: bytes, prompt: str = "") -> VisionResult:
        # SigLIP is a dual-encoder with no text decoder — generation is impossible.
        raise NotImplementedError(
            "SigLIPBackend does not support caption generation. "
            "Use backend='vlm' (VLMBackend) for image-to-text generation."
        )

    @property
    def model_name(self) -> str:
        # Path stem only: "google/siglip-so400m-..." -> "siglip-so400m-...".
        return self._model_path.split("/")[-1]

    @property
    def vram_mb(self) -> int:
        return self._vram_mb

    @property
    def supports_embed(self) -> bool:
        return True

    @property
    def supports_caption(self) -> bool:
        return False

View file

@ -0,0 +1,181 @@
# circuitforge_core/vision/backends/vlm.py — VLMBackend
#
# Requires: pip install -e "circuitforge-core[vision-vlm]"
#
# Supports any HuggingFace AutoModelForVision2Seq-compatible VLM.
# Validated models (VRAM fp16):
# vikhyatk/moondream2 ~2 GB — fast, lightweight, good for documents
# llava-hf/llava-1.5-7b-hf ~14 GB — strong general VQA
# Qwen/Qwen2-VL-7B-Instruct ~16 GB — multilingual, structured output friendly
#
# VLMBackend implements caption() (generative) and a prompt-based classify()
# that asks the model to pick from a list. embed() raises NotImplementedError.
from __future__ import annotations
import io
from circuitforge_core.vision.backends.base import VisionResult
# VRAM estimates (MB, fp16) keyed by lowercase model name fragment.
_VRAM_TABLE: dict[str, int] = {
"moondream2": 2000,
"moondream": 2000,
"llava-1.5-7b": 14000,
"llava-7b": 14000,
"qwen2-vl-7b": 16000,
"qwen-vl-7b": 16000,
"llava-1.5-13b": 26000,
"phi-3-vision": 8000,
"phi3-vision": 8000,
"paligemma": 6000,
"idefics": 12000,
"cogvlm": 14000,
}
_CLASSIFY_PROMPT_TMPL = (
"Choose the single best label for this image from the following options: "
"{labels}. Reply with ONLY the label text, nothing else."
)
def _estimate_vram(model_path: str) -> int:
lower = model_path.lower()
for key, mb in _VRAM_TABLE.items():
if key in lower:
return mb
return 8000 # safe default for unknown 7B-class VLMs
class VLMBackend:
    """
    Generative vision-language model backend.

    caption() generates free-form text from an image + optional prompt.
    classify() prompts the model to select from candidate labels.
    embed() raises NotImplementedError — use SigLIPBackend for embeddings.
    """

    def __init__(
        self,
        model_path: str,
        device: str = "cuda",
        dtype: str = "float16",
        max_new_tokens: int = 512,
    ) -> None:
        # Import lazily so the Protocol/mock stay importable without GPU deps.
        try:
            import torch
            from transformers import AutoProcessor, AutoModelForVision2Seq
        except ImportError as exc:
            raise ImportError(
                "VLMBackend requires torch and transformers. "
                "Install with: pip install -e 'circuitforge-core[vision-vlm]'"
            ) from exc
        import torch as _torch  # NOTE(review): redundant — `torch` is already bound above
        self._device = device
        self._max_new_tokens = max_new_tokens
        self._model_path = model_path
        self._vram_mb = _estimate_vram(model_path)
        torch_dtype = (
            _torch.float16 if dtype == "float16"
            else _torch.bfloat16 if dtype == "bfloat16"
            else _torch.float32
        )
        # SECURITY: trust_remote_code=True executes Python shipped in the model
        # repo (needed by e.g. moondream2) — only load models you trust.
        self._processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        self._model = AutoModelForVision2Seq.from_pretrained(
            model_path,
            torch_dtype=torch_dtype,
            trust_remote_code=True,
        ).to(device)
        # Inference mode — train(False) is equivalent to model.eval()
        # (disables dropout/batchnorm training behaviour).
        self._model.train(False)

    # ── VisionBackend Protocol ─────────────────────────────────────────────────

    def caption(self, image: bytes, prompt: str = "") -> VisionResult:
        """Generate a text description of the image."""
        import torch
        from PIL import Image

        pil_img = Image.open(io.BytesIO(image)).convert("RGB")
        effective_prompt = prompt or "Describe this image in detail."
        inputs = self._processor(
            text=effective_prompt,
            images=pil_img,
            return_tensors="pt",
        ).to(self._device)
        with torch.no_grad():
            # Greedy decoding (do_sample=False) for reproducible captions.
            generated_ids = self._model.generate(
                **inputs,
                max_new_tokens=self._max_new_tokens,
                do_sample=False,
            )
        # Strip the input prompt tokens from the generated output
        # NOTE(review): assumes the model echoes input ids at the front of the
        # output — true for decoder-only VLMs; encoder-decoder models would
        # need input_len = 0. Confirm per model family.
        input_len = inputs["input_ids"].shape[1]
        output_ids = generated_ids[0][input_len:]
        text = self._processor.decode(output_ids, skip_special_tokens=True).strip()
        return VisionResult(caption=text, model=self.model_name)

    def classify(self, image: bytes, labels: list[str]) -> VisionResult:
        """
        Prompt-based zero-shot classification.

        Asks the VLM to choose a label from the provided list. The returned
        scores are binary (1.0 for the selected label, 0.0 for others) since
        VLMs don't expose per-label logits the same way SigLIP does.
        For soft scores, use SigLIPBackend.
        """
        labels_str = ", ".join(f'"{lbl}"' for lbl in labels)
        prompt = _CLASSIFY_PROMPT_TMPL.format(labels=labels_str)
        result = self.caption(image, prompt=prompt)
        # Normalise the free-form reply (surrounding quotes/whitespace).
        raw = (result.caption or "").strip().strip('"').strip("'")
        matched = _match_label(raw, labels)
        scores = [1.0 if lbl == matched else 0.0 for lbl in labels]
        return VisionResult(labels=list(labels), scores=scores, model=self.model_name)

    def embed(self, image: bytes) -> VisionResult:
        raise NotImplementedError(
            "VLMBackend does not support image embeddings. "
            "Use backend='siglip' (SigLIPBackend) for embed()."
        )

    @property
    def model_name(self) -> str:
        # Path stem only: "vikhyatk/moondream2" -> "moondream2".
        return self._model_path.split("/")[-1]

    @property
    def vram_mb(self) -> int:
        return self._vram_mb

    @property
    def supports_embed(self) -> bool:
        return False

    @property
    def supports_caption(self) -> bool:
        return True
# ── Helpers ───────────────────────────────────────────────────────────────────
def _match_label(raw: str, labels: list[str]) -> str:
"""Return the best matching label from the VLM's free-form response."""
raw_lower = raw.lower()
for lbl in labels:
if lbl.lower() == raw_lower:
return lbl
for lbl in labels:
if raw_lower.startswith(lbl.lower()) or lbl.lower().startswith(raw_lower):
return lbl
for lbl in labels:
if lbl.lower() in raw_lower or raw_lower in lbl.lower():
return lbl
return labels[0] if labels else raw

View file

@ -1,26 +0,0 @@
# circuitforge_core/vision/router.py — shim
#
# The vision module has been extracted to the standalone cf-vision repo.
# This shim re-exports VisionRouter so existing imports continue to work.
# New code should import directly from cf_vision:
#
# from cf_vision.router import VisionRouter
# from cf_vision.models import ImageFrame
#
# Install: pip install -e ../cf-vision
from __future__ import annotations
try:
from cf_vision.router import VisionRouter # noqa: F401
from cf_vision.models import ImageFrame # noqa: F401
except ImportError:
# cf-vision not installed — fall back to the stub so products that don't
# need vision yet don't hard-fail on import.
class VisionRouter: # type: ignore[no-redef]
"""Stub — install cf-vision: pip install -e ../cf-vision"""
def analyze(self, image_bytes: bytes, prompt: str = "", task: str = "document"):
raise ImportError(
"cf-vision is not installed. "
"Run: pip install -e ../cf-vision"
)

View file

@ -49,6 +49,23 @@ tts-service = [
    "uvicorn[standard]>=0.29",
    "python-multipart>=0.0.9",
]
vision-siglip = [
"torch>=2.0",
"transformers>=4.40",
"Pillow>=10.0",
]
vision-vlm = [
"torch>=2.0",
"transformers>=4.40",
"Pillow>=10.0",
"accelerate>=0.27",
]
vision-service = [
"circuitforge-core[vision-siglip]",
"fastapi>=0.110",
"uvicorn[standard]>=0.29",
"python-multipart>=0.0.9",
]
dev = [
    "circuitforge-core[manage]",
    "pytest>=8.0",

View file

View file

@ -0,0 +1,203 @@
"""
Tests for the cf-vision FastAPI service (mock backend).
All tests use the mock backend — no GPU or model files required.
"""
from __future__ import annotations
import json
import io
import pytest
from fastapi.testclient import TestClient
from circuitforge_core.vision.app import create_app, _parse_labels
# ── Fixtures ──────────────────────────────────────────────────────────────────
@pytest.fixture(scope="module")
def siglip_client() -> TestClient:
    """Client backed by mock-siglip (supports classify + embed, not caption).

    module-scoped: the app (and its mock backend) is built once per module.
    """
    app = create_app(model_path="mock-siglip", backend="siglip", mock=True)
    return TestClient(app)


@pytest.fixture(scope="module")
def vlm_client() -> TestClient:
    """Client backed by mock-vlm (mock supports all; VLM contract tested separately)."""
    app = create_app(model_path="mock-vlm", backend="vlm", mock=True)
    return TestClient(app)
FAKE_IMAGE = b"\x89PNG\r\n\x1a\n" + b"\x00" * 100
def _image_upload(data: bytes = FAKE_IMAGE) -> tuple[str, tuple]:
return ("image", ("test.png", io.BytesIO(data), "image/png"))
# ── /health ───────────────────────────────────────────────────────────────────
def test_health_ok(siglip_client: TestClient) -> None:
    """Health endpoint returns 200 with the core status fields."""
    resp = siglip_client.get("/health")
    assert resp.status_code == 200
    body = resp.json()
    assert body["status"] == "ok"
    assert "model" in body
    assert "vram_mb" in body
    assert "backend" in body


def test_health_backend_field(siglip_client: TestClient) -> None:
    """Health echoes the backend name the app was created with."""
    resp = siglip_client.get("/health")
    assert resp.json()["backend"] == "siglip"


def test_health_supports_fields(siglip_client: TestClient) -> None:
    """Capability flags are always present in the health payload."""
    body = siglip_client.get("/health").json()
    assert "supports_embed" in body
    assert "supports_caption" in body
# ── /classify ─────────────────────────────────────────────────────────────────
def test_classify_json_labels(siglip_client: TestClient) -> None:
    """Labels sent as a JSON array round-trip in order with one score each."""
    resp = siglip_client.post(
        "/classify",
        files=[_image_upload()],
        data={"labels": json.dumps(["cat", "dog", "bird"])},
    )
    assert resp.status_code == 200
    body = resp.json()
    assert body["labels"] == ["cat", "dog", "bird"]
    assert len(body["scores"]) == 3


def test_classify_csv_labels(siglip_client: TestClient) -> None:
    """Comma-separated labels are parsed and whitespace-trimmed."""
    resp = siglip_client.post(
        "/classify",
        files=[_image_upload()],
        data={"labels": "cat, dog, bird"},
    )
    assert resp.status_code == 200
    assert resp.json()["labels"] == ["cat", "dog", "bird"]


def test_classify_single_label(siglip_client: TestClient) -> None:
    """A single bare label is accepted and scored."""
    resp = siglip_client.post(
        "/classify",
        files=[_image_upload()],
        data={"labels": "document"},
    )
    assert resp.status_code == 200
    body = resp.json()
    assert body["labels"] == ["document"]
    assert len(body["scores"]) == 1


def test_classify_empty_labels_4xx(siglip_client: TestClient) -> None:
    # Empty labels should yield a 4xx — either our 400 or FastAPI's 422
    # depending on how the empty string is handled by the form layer.
    resp = siglip_client.post(
        "/classify",
        files=[_image_upload()],
        data={"labels": ""},
    )
    assert resp.status_code in (400, 422)


def test_classify_empty_image_400(siglip_client: TestClient) -> None:
    """A zero-byte upload is rejected with 400 before reaching the backend."""
    resp = siglip_client.post(
        "/classify",
        files=[("image", ("empty.png", io.BytesIO(b""), "image/png"))],
        data={"labels": "cat"},
    )
    assert resp.status_code == 400


def test_classify_model_in_response(siglip_client: TestClient) -> None:
    """Every classify response names the model that produced it."""
    resp = siglip_client.post(
        "/classify",
        files=[_image_upload()],
        data={"labels": "cat"},
    )
    assert "model" in resp.json()
# ── /embed ────────────────────────────────────────────────────────────────────
def test_embed_returns_vector(siglip_client: TestClient) -> None:
resp = siglip_client.post("/embed", files=[_image_upload()])
assert resp.status_code == 200
body = resp.json()
assert "embedding" in body
assert isinstance(body["embedding"], list)
assert len(body["embedding"]) > 0
def test_embed_empty_image_400(siglip_client: TestClient) -> None:
resp = siglip_client.post(
"/embed",
files=[("image", ("empty.png", io.BytesIO(b""), "image/png"))],
)
assert resp.status_code == 400
def test_embed_model_in_response(siglip_client: TestClient) -> None:
    """A successful /embed response carries the model identifier."""
    resp = siglip_client.post("/embed", files=[_image_upload()])
    # Guard against a 4xx/5xx error body that happens to contain "model".
    assert resp.status_code == 200
    assert "model" in resp.json()
# ── /caption ──────────────────────────────────────────────────────────────────
def test_caption_returns_text(vlm_client: TestClient) -> None:
    """/caption returns a string caption even when the prompt is empty."""
    response = vlm_client.post(
        "/caption",
        files=[_image_upload()],
        data={"prompt": ""},
    )
    assert response.status_code == 200
    payload = response.json()
    assert "caption" in payload
    assert isinstance(payload["caption"], str)
def test_caption_with_prompt(vlm_client: TestClient) -> None:
    """A non-empty VQA-style prompt is accepted by /caption."""
    response = vlm_client.post(
        "/caption",
        files=[_image_upload()],
        data={"prompt": "What text appears here?"},
    )
    assert response.status_code == 200
def test_caption_empty_image_400(vlm_client: TestClient) -> None:
    """A zero-byte image upload to /caption is rejected with 400."""
    empty_file = ("image", ("empty.png", io.BytesIO(b""), "image/png"))
    response = vlm_client.post(
        "/caption",
        files=[empty_file],
        data={"prompt": ""},
    )
    assert response.status_code == 400
# ── Label parser ──────────────────────────────────────────────────────────────
def test_parse_labels_json_array() -> None:
    """A JSON array string parses to its element list."""
    parsed = _parse_labels('["cat", "dog"]')
    assert parsed == ["cat", "dog"]
def test_parse_labels_csv() -> None:
    """A comma-separated string parses to one label per item."""
    parsed = _parse_labels("cat, dog, bird")
    assert parsed == ["cat", "dog", "bird"]
def test_parse_labels_single() -> None:
    """A bare word parses to a one-element list."""
    parsed = _parse_labels("document")
    assert parsed == ["document"]
def test_parse_labels_empty() -> None:
    """An empty string parses to an empty list."""
    parsed = _parse_labels("")
    assert parsed == []
def test_parse_labels_whitespace_trimmed() -> None:
    """Surrounding whitespace is stripped from each label."""
    parsed = _parse_labels("  cat , dog  ")
    assert parsed == ["cat", "dog"]

View file

@ -0,0 +1,247 @@
"""
Tests for cf-vision backends (mock) and factory routing.
Real SigLIP/VLM backends are not tested here — they require GPU + model downloads.
The mock backend exercises the full Protocol surface so we can verify the contract
without hardware dependencies.
"""
from __future__ import annotations
import math
import os
import pytest
from circuitforge_core.vision.backends.base import (
VisionBackend,
VisionResult,
make_vision_backend,
)
from circuitforge_core.vision.backends.mock import MockVisionBackend
# ── Fixtures ──────────────────────────────────────────────────────────────────
# PNG magic bytes followed by zero padding.  Not a decodable image — the mock
# backend never inspects pixel data, so the header alone is enough input.
FAKE_IMAGE = b"\x89PNG\r\n\x1a\n" + bytes(100)
@pytest.fixture()
def mock_backend() -> MockVisionBackend:
    """Fresh deterministic mock backend for each test (no GPU required)."""
    return MockVisionBackend(model_name="test-mock")
# ── Protocol compliance ───────────────────────────────────────────────────────
def test_mock_is_vision_backend(mock_backend: MockVisionBackend) -> None:
    """The mock passes an isinstance check against the VisionBackend Protocol."""
    assert isinstance(mock_backend, VisionBackend)
def test_mock_model_name(mock_backend: MockVisionBackend) -> None:
    """model_name echoes the constructor argument."""
    name = mock_backend.model_name
    assert name == "test-mock"
def test_mock_vram_mb(mock_backend: MockVisionBackend) -> None:
    """The mock reports zero VRAM usage."""
    usage = mock_backend.vram_mb
    assert usage == 0
def test_mock_supports_embed(mock_backend: MockVisionBackend) -> None:
    """The mock advertises embedding support."""
    flag = mock_backend.supports_embed
    assert flag is True
def test_mock_supports_caption(mock_backend: MockVisionBackend) -> None:
    """The mock advertises caption support."""
    flag = mock_backend.supports_caption
    assert flag is True
# ── classify() ───────────────────────────────────────────────────────────────
def test_classify_returns_vision_result(mock_backend: MockVisionBackend) -> None:
    """classify() produces a VisionResult instance."""
    outcome = mock_backend.classify(FAKE_IMAGE, ["cat", "dog", "bird"])
    assert isinstance(outcome, VisionResult)
def test_classify_labels_preserved(mock_backend: MockVisionBackend) -> None:
    """Input labels come back on the result in the same order."""
    requested = ["cat", "dog", "bird"]
    outcome = mock_backend.classify(FAKE_IMAGE, requested)
    assert outcome.labels == requested
def test_classify_scores_length_matches_labels(mock_backend: MockVisionBackend) -> None:
    """There is exactly one score per input label."""
    requested = ["cat", "dog", "bird"]
    outcome = mock_backend.classify(FAKE_IMAGE, requested)
    assert len(outcome.scores) == len(requested)
def test_classify_uniform_scores(mock_backend: MockVisionBackend) -> None:
    """The mock spreads probability mass evenly across the labels."""
    outcome = mock_backend.classify(FAKE_IMAGE, ["cat", "dog", "bird"])
    assert all(math.isclose(value, 1.0 / 3, abs_tol=1e-9) for value in outcome.scores)
def test_classify_single_label(mock_backend: MockVisionBackend) -> None:
    """A lone label receives the entire probability mass."""
    outcome = mock_backend.classify(FAKE_IMAGE, ["document"])
    assert outcome.labels == ["document"]
    assert math.isclose(outcome.scores[0], 1.0, abs_tol=1e-9)
def test_classify_model_name_in_result(mock_backend: MockVisionBackend) -> None:
    """classify() stamps the backend's model name onto the result."""
    outcome = mock_backend.classify(FAKE_IMAGE, ["x"])
    assert outcome.model == "test-mock"
# ── embed() ──────────────────────────────────────────────────────────────────
def test_embed_returns_vision_result(mock_backend: MockVisionBackend) -> None:
    """embed() produces a VisionResult instance."""
    outcome = mock_backend.embed(FAKE_IMAGE)
    assert isinstance(outcome, VisionResult)
def test_embed_returns_embedding(mock_backend: MockVisionBackend) -> None:
    """embed() populates a 512-dimensional embedding."""
    outcome = mock_backend.embed(FAKE_IMAGE)
    vector = outcome.embedding
    assert vector is not None
    assert len(vector) == 512
def test_embed_is_unit_vector(mock_backend: MockVisionBackend) -> None:
    """The mock embedding is L2-normalized (unit magnitude)."""
    outcome = mock_backend.embed(FAKE_IMAGE)
    magnitude = math.hypot(*outcome.embedding)
    assert math.isclose(magnitude, 1.0, abs_tol=1e-6)
def test_embed_labels_empty(mock_backend: MockVisionBackend) -> None:
    """Embedding results carry no classification labels or scores."""
    outcome = mock_backend.embed(FAKE_IMAGE)
    assert outcome.labels == []
    assert outcome.scores == []
def test_embed_model_name_in_result(mock_backend: MockVisionBackend) -> None:
    """embed() stamps the backend's model name onto the result."""
    outcome = mock_backend.embed(FAKE_IMAGE)
    assert outcome.model == "test-mock"
# ── caption() ────────────────────────────────────────────────────────────────
def test_caption_returns_vision_result(mock_backend: MockVisionBackend) -> None:
    """caption() produces a VisionResult instance."""
    outcome = mock_backend.caption(FAKE_IMAGE)
    assert isinstance(outcome, VisionResult)
def test_caption_returns_string(mock_backend: MockVisionBackend) -> None:
    """caption() yields a non-empty string."""
    outcome = mock_backend.caption(FAKE_IMAGE)
    text = outcome.caption
    assert isinstance(text, str)
    assert len(text) > 0
def test_caption_with_prompt(mock_backend: MockVisionBackend) -> None:
    """An explicit VQA prompt still yields a caption."""
    outcome = mock_backend.caption(FAKE_IMAGE, prompt="What is in this image?")
    assert outcome.caption is not None
def test_caption_model_name_in_result(mock_backend: MockVisionBackend) -> None:
    """caption() stamps the backend's model name onto the result."""
    outcome = mock_backend.caption(FAKE_IMAGE)
    assert outcome.model == "test-mock"
# ── VisionResult helpers ──────────────────────────────────────────────────────
def test_top_returns_sorted_pairs() -> None:
    """top(n) returns (label, score) pairs ordered by descending score."""
    result = VisionResult(labels=["cat", "dog", "bird"], scores=[0.3, 0.6, 0.1])
    best_two = result.top(2)
    assert best_two == [("dog", 0.6), ("cat", 0.3)]
def test_top_default_n1() -> None:
    """Called with no argument, top() returns only the single best pair."""
    result = VisionResult(labels=["cat", "dog"], scores=[0.4, 0.9])
    best = result.top()
    assert best == [("dog", 0.9)]
# ── Factory routing ───────────────────────────────────────────────────────────
def test_factory_mock_flag() -> None:
    """Explicit mock=True always routes to the mock backend."""
    chosen = make_vision_backend("any-model", mock=True)
    assert isinstance(chosen, MockVisionBackend)
def test_factory_mock_env(monkeypatch: pytest.MonkeyPatch) -> None:
    """CF_VISION_MOCK=1 routes to the mock backend without the flag."""
    monkeypatch.setenv("CF_VISION_MOCK", "1")
    chosen = make_vision_backend("any-model")
    assert isinstance(chosen, MockVisionBackend)
def test_factory_mock_model_name() -> None:
    """The requested model path is preserved on the mock backend."""
    requested = "google/siglip-so400m-patch14-384"
    chosen = make_vision_backend(requested, mock=True)
    assert chosen.model_name == requested
def test_factory_unknown_backend_raises() -> None:
    """An unrecognized backend name fails fast with ValueError."""
    with pytest.raises(ValueError, match="Unknown vision backend"):
        make_vision_backend("any-model", backend="nonexistent", mock=False)
def test_factory_vlm_autodetect_moondream(monkeypatch: pytest.MonkeyPatch) -> None:
    """Auto-detection should select the VLM backend for moondream model paths.

    The real backend cannot load in CI (no torch/GPU), so we only verify
    that routing does NOT fail with "Unknown vision backend": an ImportError
    from missing heavy dependencies is acceptable, a ValueError is not.
    """
    monkeypatch.setenv("CF_VISION_MOCK", "0")
    try:
        make_vision_backend("vikhyatk/moondream2", mock=False)
    except ImportError:
        # Missing torch/transformers in CI — routing itself still succeeded.
        pass
    except ValueError as exc:
        pytest.fail(f"Should not raise ValueError for known backend: {exc}")
def test_factory_siglip_autodetect(monkeypatch: pytest.MonkeyPatch) -> None:
    """Auto-detection selects siglip for non-VLM model paths (no ValueError)."""
    monkeypatch.setenv("CF_VISION_MOCK", "0")
    # Order matters: ValueError must be caught before the broad Exception,
    # otherwise a routing failure would be silently swallowed.
    try:
        make_vision_backend("google/siglip-so400m-patch14-384", mock=False)
    except ValueError as exc:
        pytest.fail(f"Should not raise ValueError for known backend: {exc}")
    except Exception:
        pass  # ImportError / model-loading failures are expected off-GPU.
# ── Process singleton ─────────────────────────────────────────────────────────
def test_module_classify_mock(monkeypatch: pytest.MonkeyPatch) -> None:
    """Module-level classify() lazily builds a mock backend under CF_VISION_MOCK."""
    monkeypatch.setenv("CF_VISION_MOCK", "1")
    import circuitforge_core.vision as vision_mod

    vision_mod._backend = None  # drop any backend cached by earlier tests
    outcome = vision_mod.classify(FAKE_IMAGE, ["cat", "dog"])
    assert outcome.labels == ["cat", "dog"]
    assert len(outcome.scores) == 2
def test_module_embed_mock(monkeypatch: pytest.MonkeyPatch) -> None:
    """Module-level embed() works through the mock singleton."""
    monkeypatch.setenv("CF_VISION_MOCK", "1")
    import circuitforge_core.vision as vision_mod

    vision_mod._backend = None  # drop any backend cached by earlier tests
    outcome = vision_mod.embed(FAKE_IMAGE)
    assert outcome.embedding is not None
def test_module_caption_mock(monkeypatch: pytest.MonkeyPatch) -> None:
    """Module-level caption() works through the mock singleton."""
    monkeypatch.setenv("CF_VISION_MOCK", "1")
    import circuitforge_core.vision as vision_mod

    vision_mod._backend = None  # drop any backend cached by earlier tests
    outcome = vision_mod.caption(FAKE_IMAGE, prompt="Describe")
    assert outcome.caption is not None
def test_module_make_backend_returns_fresh_instance() -> None:
    """make_backend() constructs a new backend per call, never the cached singleton.

    The unused ``monkeypatch`` fixture parameter was removed: this test passes
    mock=True explicitly, so no environment manipulation is needed.
    """
    import circuitforge_core.vision as vision_mod

    b1 = vision_mod.make_backend("m1", mock=True)
    b2 = vision_mod.make_backend("m2", mock=True)
    assert b1 is not b2
    assert b1.model_name != b2.model_name