feat: initial cf-docuvision service — Dolphin-v2 document parsing

FastAPI microservice wrapping ByteDance/Dolphin-v2 (Qwen2.5-VL-3B base) for structured document extraction. Exposes POST /extract and GET /health. Maps Dolphin's 21 element types to cf-core's 7-type canonical schema. Services: cf-text /extract, /health Env vars: CF_DOCUVISION_MODEL, CF_DOCUVISION_DEVICE, CF_DOCUVISION_PORT GPU: 8GB+ VRAM required for Dolphin-v2; CPU fallback available but very slow.
2026-06-05 10:25:18 -07:00 · 2026-06-05 10:25:18 -07:00 · 47d4dfc786
commit 47d4dfc786
9 changed files with 661 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1,15 @@
 # cf-docuvision environment — copy to .env and fill in values
 # Model to load. Default: ByteDance/Dolphin-v2 (downloaded from HuggingFace on first run).
 # Set to a local path to skip the download: /Library/Assets/LLM/dolphin-v2/
 CF_DOCUVISION_MODEL=ByteDance/Dolphin-v2
 # Compute device. "auto" detects CUDA if available, falls back to CPU.
 # CPU is very slow for Dolphin-v2 — 8GB+ VRAM GPU strongly recommended.
 CF_DOCUVISION_DEVICE=auto
 # Service port (default matches CF_DOCUVISION_URL default in cf-core)
 CF_DOCUVISION_PORT=8003
 # Log level
 LOG_LEVEL=INFO
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,11 @@
 __pycache__/
 *.py[cod]
 *.egg-info/
 .env
 .venv/
 venv/
 dist/
 build/
 .pytest_cache/
 .mypy_cache/
 *.egg
--- a/23
+++ b/23
@ -0,0 +1,23 @@
 FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
 ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1
 RUN apt-get update && apt-get install -y --no-install-recommends \
    python3.11 python3.11-dev python3-pip git \
    libglib2.0-0 libsm6 libxext6 libxrender-dev \
    && rm -rf /var/lib/apt/lists/*
 RUN ln -sf python3.11 /usr/bin/python3 && ln -sf python3 /usr/bin/python
 WORKDIR /app
 COPY requirements.txt .
 RUN pip install --upgrade pip && pip install -r requirements.txt
 COPY . .
 EXPOSE 8003
 CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8003"]
--- a/app/init.py
+++ b/app/init.py
--- a/app/dolphin.py
+++ b/app/dolphin.py
@ -0,0 +1,275 @@
 # app/dolphin.py — Dolphin-v2 model wrapper
 #
 # Wraps ByteDance/Dolphin-v2 (Qwen2.5-VL-3B base) for document parsing.
 # This module is the only place in the codebase that touches the Dolphin model
 # directly. The FastAPI service (main.py) calls parse_document() and never
 # imports transformers itself.
 #
 # Dolphin-v2 uses a two-stage pipeline:
 #   Stage 1: classify each page region (21 element types)
 #   Stage 2: element-wise or holistic parsing depending on region type
 #
 # HuggingFace: https://huggingface.co/ByteDance/Dolphin-v2
 # VRAM: ~8GB minimum, 16GB+ recommended for multi-page documents
 from __future__ import annotations
 import base64
 import logging
 import os
 from dataclasses import dataclass, field
 from io import BytesIO
 from typing import Any
 logger = logging.getLogger(__name__)
 _MODEL_ID = os.environ.get("CF_DOCUVISION_MODEL", "ByteDance/Dolphin-v2")
 _DEVICE = os.environ.get("CF_DOCUVISION_DEVICE", "auto")
 # Dolphin-v2 element type → StructuredDocument element type mapping
 # Dolphin outputs 21 types; we map to cf-core's canonical 7 (+passthrough)
 _TYPE_MAP: dict[str, str] = {
    "title":            "heading",
    "section_header":   "heading",
    "paragraph":        "paragraph",
    "caption":          "paragraph",
    "footnote":         "paragraph",
    "page_header":      "paragraph",
    "page_footer":      "paragraph",
    "list_item":        "list",
    "table":            "table",
    "figure":           "figure",
    "figure_caption":   "paragraph",
    "formula":          "formula",
    "code":             "code",
    "annotation":       "paragraph",
    "abstract":         "paragraph",
    "toc_item":         "list",
    "reference":        "paragraph",
    "equation":         "formula",
    "watermark":        "paragraph",
    "stamp":            "paragraph",
    "signature":        "paragraph",
 }
@dataclass
 class DolphinElement:
    """Raw output from Dolphin-v2 for one detected region."""
    dolphin_type: str
    text: str
    bbox: list[float] | None = None   # [x0, y0, x1, y1] normalised 0-1
    html: str | None = None           # for table type only
@dataclass
 class DolphinResult:
    """Parsed output from one document image."""
    elements: list[DolphinElement] = field(default_factory=list)
    raw_text: str = ""
    model: str = _MODEL_ID
 class DolphinParser:
    """
    Dolphin-v2 document parser.
    Loaded once at service startup. Thread-safe for concurrent requests
    (model weights are read-only after loading).
    Usage:
        parser = DolphinParser.from_env()
        result = parser.parse(image_bytes, hint="auto")
    """
    def __init__(self, model_id: str = _MODEL_ID, device: str = _DEVICE) -> None:
        try:
            import torch
            from transformers import AutoModelForCausalLM, AutoProcessor
        except ImportError as exc:
            raise ImportError(
                "torch and transformers are required. "
                "Install with: pip install -r requirements.txt"
            ) from exc
        logger.info("Loading Dolphin-v2 model %s", model_id)
        if device == "auto":
            device = "cuda" if torch.cuda.is_available() else "cpu"
            if device == "cpu":
                logger.warning(
                    "Dolphin-v2 running on CPU — performance will be very slow. "
                    "8GB+ VRAM GPU strongly recommended."
                )
        self._processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
        self._model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            device_map=device if device != "cpu" else None,
            torch_dtype="auto",
        )
        if device == "cpu":
            self._model = self._model.to("cpu")
        self._model_id = model_id
        self._device = device
        logger.info("Dolphin-v2 loaded on %s", device)
    @classmethod
    def from_env(cls) -> "DolphinParser":
        return cls(model_id=_MODEL_ID, device=_DEVICE)
    def parse(self, image_bytes: bytes, hint: str = "auto") -> DolphinResult:
        """
        Parse a document image into structured elements.
        image_bytes  Raw image bytes (JPEG, PNG, TIFF, etc.)
        hint         Extraction focus: "auto" | "table" | "text" | "form"
                     Passed as context in the Dolphin prompt. "table" prioritises
                     HTML table rendering; "form" prioritises key-value pairs.
        """
        from PIL import Image
        image = Image.open(BytesIO(image_bytes)).convert("RGB")
        raw_output = self._run_inference(image, hint)
        return self._parse_output(raw_output)
    def parse_b64(self, image_b64: str, hint: str = "auto") -> DolphinResult:
        """Convenience wrapper for base64-encoded image bytes."""
        return self.parse(base64.b64decode(image_b64), hint=hint)
    def _run_inference(self, image: Any, hint: str) -> list[dict]:
        """Run Dolphin-v2 two-stage inference and return raw element dicts."""
        import torch
        # Dolphin-v2 uses a structured prompt with an optional extraction hint
        hint_instruction = {
            "table": " Focus on tables and render them as HTML.",
            "form":  " Focus on form fields and key-value pairs.",
            "text":  " Focus on text content, preserving heading hierarchy.",
        }.get(hint, "")
        prompt = (
            f"<|im_start|>system\nYou are a document parsing assistant.{hint_instruction}"
            f"<|im_end|>\n<|im_start|>user\n<image>\nParse this document.<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
        inputs = self._processor(
            text=prompt,
            images=image,
            return_tensors="pt",
        ).to(self._model.device)
        with torch.no_grad():
            output_ids = self._model.generate(
                **inputs,
                max_new_tokens=4096,
                do_sample=False,
            )
        # Decode only the newly generated tokens
        input_len = inputs["input_ids"].shape[1]
        generated = output_ids[0][input_len:]
        raw_text = self._processor.decode(generated, skip_special_tokens=True)
        return self._parse_dolphin_output(raw_text)
    def _parse_dolphin_output(self, raw: str) -> list[dict]:
        """
        Parse Dolphin-v2's structured text output into element dicts.
        Dolphin-v2 outputs a structured format with element markers. This parser
        extracts them into dicts with keys: type, text, bbox, html.
        """
        import json
        import re
        elements: list[dict] = []
        # Dolphin-v2 wraps elements in JSON-like blocks:
        # <element type="paragraph" bbox="[x0,y0,x1,y1]">text</element>
        # or a JSON array for structured output mode
        try:
            parsed = json.loads(raw)
            if isinstance(parsed, list):
                return parsed
        except (json.JSONDecodeError, ValueError):
            pass
        # Fallback: regex extraction of element tags
        pattern = re.compile(
            r'<element\s+type="([^"]+)"(?:\s+bbox="([^"]+)")?>(.*?)</element>',
            re.DOTALL,
        )
        for match in pattern.finditer(raw):
            el_type, bbox_str, text = match.groups()
            bbox = None
            if bbox_str:
                try:
                    bbox = [float(x) for x in bbox_str.strip("[]").split(",")]
                except ValueError:
                    pass
            elements.append({
                "type": el_type.strip(),
                "text": text.strip(),
                "bbox": bbox,
                "html": text.strip() if el_type == "table" else None,
            })
        if not elements and raw.strip():
            # Last resort: treat entire output as a single paragraph
            elements = [{"type": "paragraph", "text": raw.strip(), "bbox": None, "html": None}]
        return elements
    def _parse_output(self, raw_elements: list[dict]) -> DolphinResult:
        elements: list[DolphinElement] = []
        texts: list[str] = []
        for el in raw_elements:
            dolphin_type = el.get("type", "paragraph")
            text = el.get("text", "").strip()
            elements.append(DolphinElement(
                dolphin_type=dolphin_type,
                text=text,
                bbox=el.get("bbox"),
                html=el.get("html"),
            ))
            if text:
                texts.append(text)
        return DolphinResult(
            elements=elements,
            raw_text="\n".join(texts),
            model=self._model_id,
        )
 def dolphin_to_cf_elements(result: DolphinResult) -> tuple[list[dict], list[dict]]:
    """
    Convert DolphinResult into cf-core StructuredDocument wire format.
    Returns (elements_list, tables_list) ready to JSON-serialise in the
    /extract response. Tables are separated from elements to match the
    DocuvisionClient._parse_response() contract.
    """
    elements: list[dict] = []
    tables: list[dict] = []
    for el in result.elements:
        cf_type = _TYPE_MAP.get(el.dolphin_type, "paragraph")
        if cf_type == "table" and el.html:
            tables.append({
                "html": el.html,
                "bbox": el.bbox,
            })
        else:
            elements.append({
                "type": cf_type,
                "text": el.text,
                "bbox": el.bbox,
            })
    return elements, tables
--- a/app/main.py
+++ b/app/main.py
@ -0,0 +1,113 @@
 # app/main.py — cf-docuvision FastAPI service
 #
 # Exposes POST /extract and GET /health.
 # Response schema matches DocuvisionClient._parse_response() in cf-core.
 #
 # Start:
 #   uvicorn app.main:app --host 0.0.0.0 --port 8003
 #   CF_DOCUVISION_DEVICE=cuda uvicorn app.main:app ...
 from __future__ import annotations
 import logging
 import os
 import time
 from contextlib import asynccontextmanager
 from typing import Any
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from app.dolphin import DolphinParser, dolphin_to_cf_elements
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=os.environ.get("LOG_LEVEL", "INFO"))
 # ── Model lifecycle ───────────────────────────────────────────────────────────
 _parser: DolphinParser | None = None
@asynccontextmanager
 async def lifespan(app: FastAPI):
    global _parser
    logger.info("cf-docuvision: loading Dolphin-v2...")
    _parser = DolphinParser.from_env()
    logger.info("cf-docuvision: ready")
    yield
    _parser = None
 app = FastAPI(
    title="cf-docuvision",
    description="Dolphin-v2 document parsing service for CircuitForge products.",
    version="0.1.0",
    lifespan=lifespan,
 )
 # ── Request / Response schemas ────────────────────────────────────────────────
 class ExtractRequest(BaseModel):
    image_b64: str
    hint: str = "auto"
 class ExtractResponse(BaseModel):
    elements: list[dict[str, Any]]
    tables: list[dict[str, Any]]
    raw_text: str
    metadata: dict[str, Any]
 # ── Endpoints ─────────────────────────────────────────────────────────────────
@app.get("/health")
 def health():
    """Health check. Returns 200 when the model is loaded and ready."""
    if _parser is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    return {"status": "ok", "model": _parser._model_id}
@app.post("/extract", response_model=ExtractResponse)
 def extract(req: ExtractRequest):
    """
    Parse a document image into structured elements.
    Request body:
        image_b64   Base64-encoded image bytes (JPEG, PNG, TIFF, PDF page, etc.)
        hint        Extraction focus: "auto" | "table" | "text" | "form"
    Response matches the DocuvisionClient._parse_response() contract in cf-core.
    """
    if _parser is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if req.hint not in ("auto", "table", "text", "form"):
        raise HTTPException(status_code=422, detail=f"Invalid hint {req.hint!r}")
    t0 = time.monotonic()
    try:
        result = _parser.parse_b64(req.image_b64, hint=req.hint)
    except Exception as exc:
        logger.exception("cf-docuvision: parse failed")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    elements, tables = dolphin_to_cf_elements(result)
    elapsed_ms = round((time.monotonic() - t0) * 1000)
    logger.info(
        "cf-docuvision: extracted %d elements, %d tables in %dms",
        len(elements), len(tables), elapsed_ms,
    )
    return ExtractResponse(
        elements=elements,
        tables=tables,
        raw_text=result.raw_text,
        metadata={
            "source": "cf-docuvision",
            "model": result.model,
            "hint": req.hint,
            "elapsed_ms": elapsed_ms,
        },
    )
--- a/compose.yml
+++ b/compose.yml
@ -0,0 +1,26 @@
 services:
  cf-docuvision:
    build: .
    network_mode: host
    env_file: .env
    environment:
      CF_DOCUVISION_PORT: "8003"
    volumes:
      # Cache HuggingFace model weights across rebuilds
      - ${HOME}/.cache/huggingface:/root/.cache/huggingface
      # Optional: mount a local model path to skip HF download
      # - /Library/Assets/LLM/dolphin-v2:/models/dolphin-v2:ro
    restart: unless-stopped
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    healthcheck:
      test: ["CMD", "python3", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8003/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 120s
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,8 @@
 fastapi>=0.110
 uvicorn[standard]>=0.29
 pydantic>=2.0
 torch>=2.0
 transformers>=4.40
 accelerate>=0.27
 Pillow>=10.0
 python-multipart>=0.0.9
--- a/tests/test_dolphin.py
+++ b/tests/test_dolphin.py
@ -0,0 +1,190 @@
 """Tests for cf-docuvision — mock inference path only (no GPU required)."""
 import pytest
 from unittest.mock import MagicMock, patch
 from app.dolphin import (
    DolphinElement,
    DolphinParser,
    DolphinResult,
    dolphin_to_cf_elements,
    _TYPE_MAP,
 )
 class TestTypeMap:
    def test_all_21_dolphin_types_mapped(self):
        # Spot-check the 21 Dolphin-v2 element types are all covered
        expected = {
            "title", "section_header", "paragraph", "caption", "footnote",
            "page_header", "page_footer", "list_item", "table", "figure",
            "figure_caption", "formula", "code", "annotation", "abstract",
            "toc_item", "reference", "equation", "watermark", "stamp", "signature",
        }
        assert expected == set(_TYPE_MAP.keys())
    def test_table_maps_to_table(self):
        assert _TYPE_MAP["table"] == "table"
    def test_title_maps_to_heading(self):
        assert _TYPE_MAP["title"] == "heading"
    def test_formula_maps_to_formula(self):
        assert _TYPE_MAP["formula"] == "formula"
 class TestDolphinToCfElements:
    def _make_result(self, elements: list[DolphinElement]) -> DolphinResult:
        raw_text = "\n".join(e.text for e in elements if e.text)
        return DolphinResult(elements=elements, raw_text=raw_text)
    def test_paragraph_goes_to_elements(self):
        result = self._make_result([DolphinElement("paragraph", "Hello world")])
        elements, tables = dolphin_to_cf_elements(result)
        assert len(elements) == 1
        assert elements[0]["type"] == "paragraph"
        assert elements[0]["text"] == "Hello world"
        assert tables == []
    def test_table_with_html_goes_to_tables(self):
        result = self._make_result([
            DolphinElement("table", "col1 col2", html="<table><tr><td>A</td></tr></table>")
        ])
        elements, tables = dolphin_to_cf_elements(result)
        assert len(tables) == 1
        assert "<table>" in tables[0]["html"]
        assert elements == []
    def test_table_without_html_goes_to_elements(self):
        result = self._make_result([DolphinElement("table", "some table text", html=None)])
        elements, tables = dolphin_to_cf_elements(result)
        assert len(elements) == 1
        assert tables == []
    def test_bbox_preserved(self):
        result = self._make_result([
            DolphinElement("paragraph", "text", bbox=[0.1, 0.2, 0.8, 0.3])
        ])
        elements, _ = dolphin_to_cf_elements(result)
        assert elements[0]["bbox"] == [0.1, 0.2, 0.8, 0.3]
    def test_mixed_elements_and_tables(self):
        result = self._make_result([
            DolphinElement("title", "Document Title"),
            DolphinElement("table", "data", html="<table/>"),
            DolphinElement("paragraph", "Body text"),
        ])
        elements, tables = dolphin_to_cf_elements(result)
        assert len(elements) == 2
        assert len(tables) == 1
        assert elements[0]["type"] == "heading"
    def test_empty_result(self):
        result = DolphinResult()
        elements, tables = dolphin_to_cf_elements(result)
        assert elements == []
        assert tables == []
 class TestParseOutputFallbacks:
    """Test _parse_dolphin_output without loading the real model."""
    def _make_parser(self) -> DolphinParser:
        """Create a DolphinParser without loading the model."""
        parser = object.__new__(DolphinParser)
        parser._model_id = "ByteDance/Dolphin-v2"
        parser._device = "cpu"
        return parser
    def test_json_array_output(self):
        parser = self._make_parser()
        raw = '[{"type": "paragraph", "text": "Hello", "bbox": null, "html": null}]'
        elements = parser._parse_dolphin_output(raw)
        assert len(elements) == 1
        assert elements[0]["type"] == "paragraph"
    def test_element_tag_output(self):
        parser = self._make_parser()
        raw = '<element type="title" bbox="[0.1,0.2,0.8,0.3]">My Title</element>'
        elements = parser._parse_dolphin_output(raw)
        assert len(elements) == 1
        assert elements[0]["type"] == "title"
        assert elements[0]["text"] == "My Title"
        assert elements[0]["bbox"] == [0.1, 0.2, 0.8, 0.3]
    def test_element_tag_without_bbox(self):
        parser = self._make_parser()
        raw = '<element type="paragraph">Plain text</element>'
        elements = parser._parse_dolphin_output(raw)
        assert elements[0]["bbox"] is None
    def test_fallback_to_single_paragraph(self):
        parser = self._make_parser()
        raw = "This is some unstructured text output."
        elements = parser._parse_dolphin_output(raw)
        assert len(elements) == 1
        assert elements[0]["type"] == "paragraph"
        assert "unstructured text" in elements[0]["text"]
    def test_empty_output(self):
        parser = self._make_parser()
        elements = parser._parse_dolphin_output("")
        assert elements == []
 class TestFastAPIRoutes:
    """Integration tests for the FastAPI endpoints using TestClient."""
    def _make_app_with_mock_parser(self):
        from fastapi.testclient import TestClient
        import app.main as main_module
        mock_parser = MagicMock()
        mock_parser._model_id = "ByteDance/Dolphin-v2"
        from app.dolphin import DolphinResult, DolphinElement
        mock_result = DolphinResult(
            elements=[DolphinElement("paragraph", "Extracted text")],
            raw_text="Extracted text",
        )
        mock_parser.parse_b64.return_value = mock_result
        main_module._parser = mock_parser
        return TestClient(main_module.app)
    def test_health_with_loaded_model(self):
        client = self._make_app_with_mock_parser()
        resp = client.get("/health")
        assert resp.status_code == 200
        assert resp.json()["status"] == "ok"
    def test_health_without_model_returns_503(self):
        from fastapi.testclient import TestClient
        import app.main as main_module
        main_module._parser = None
        client = TestClient(main_module.app, raise_server_exceptions=False)
        resp = client.get("/health")
        assert resp.status_code == 503
    def test_extract_returns_structured_response(self):
        import base64
        client = self._make_app_with_mock_parser()
        payload = {
            "image_b64": base64.b64encode(b"fake-image-bytes").decode(),
            "hint": "auto",
        }
        resp = client.post("/extract", json=payload)
        assert resp.status_code == 200
        data = resp.json()
        assert "elements" in data
        assert "tables" in data
        assert "raw_text" in data
        assert data["metadata"]["source"] == "cf-docuvision"
    def test_extract_invalid_hint_returns_422(self):
        import base64
        client = self._make_app_with_mock_parser()
        payload = {
            "image_b64": base64.b64encode(b"fake").decode(),
            "hint": "invalid",
        }
        resp = client.post("/extract", json=payload)
        assert resp.status_code == 422