cf-docuvision/tests/test_dolphin.py
pyr0ball 47d4dfc786 feat: initial cf-docuvision service — Dolphin-v2 document parsing
FastAPI microservice wrapping ByteDance/Dolphin-v2 (Qwen2.5-VL-3B base)
for structured document extraction. Exposes POST /extract and GET /health.
Maps Dolphin's 21 element types to cf-core's 7-type canonical schema.

Services: cf-text /extract, /health
Env vars: CF_DOCUVISION_MODEL, CF_DOCUVISION_DEVICE, CF_DOCUVISION_PORT
GPU: 8GB+ VRAM required for Dolphin-v2; CPU fallback available but very slow.
2026-06-05 10:25:18 -07:00

190 lines
6.9 KiB
Python

"""Tests for cf-docuvision — mock inference path only (no GPU required)."""
import pytest
from unittest.mock import MagicMock, patch
from app.dolphin import (
DolphinElement,
DolphinParser,
DolphinResult,
dolphin_to_cf_elements,
_TYPE_MAP,
)
class TestTypeMap:
    """Checks on the ``_TYPE_MAP`` lookup table imported from ``app.dolphin``."""

    def test_all_21_dolphin_types_mapped(self):
        """The keys of ``_TYPE_MAP`` must be exactly the 21 names listed here."""
        dolphin_types = {
            "abstract",
            "annotation",
            "caption",
            "code",
            "equation",
            "figure",
            "figure_caption",
            "footnote",
            "formula",
            "list_item",
            "page_footer",
            "page_header",
            "paragraph",
            "reference",
            "section_header",
            "signature",
            "stamp",
            "table",
            "title",
            "toc_item",
            "watermark",
        }
        # Exact equality: a missing key and an unexpected extra key both fail.
        assert set(_TYPE_MAP) == dolphin_types

    def test_table_maps_to_table(self):
        """``table`` is expected to keep its own name."""
        mapped = _TYPE_MAP["table"]
        assert mapped == "table"

    def test_title_maps_to_heading(self):
        """``title`` is expected to map to ``heading``."""
        mapped = _TYPE_MAP["title"]
        assert mapped == "heading"

    def test_formula_maps_to_formula(self):
        """``formula`` is expected to keep its own name."""
        mapped = _TYPE_MAP["formula"]
        assert mapped == "formula"
class TestDolphinToCfElements:
    """Checks on how ``dolphin_to_cf_elements`` splits a result into elements and tables."""

    def _make_result(self, elements: list[DolphinElement]) -> DolphinResult:
        """Wrap *elements* in a DolphinResult.

        ``raw_text`` is the newline-joined text of every element whose text is
        non-empty.
        """
        non_empty_texts = [item.text for item in elements if item.text]
        return DolphinResult(elements=elements, raw_text="\n".join(non_empty_texts))

    def test_paragraph_goes_to_elements(self):
        """A lone paragraph yields one element and no tables."""
        parsed = self._make_result([DolphinElement("paragraph", "Hello world")])
        cf_elements, cf_tables = dolphin_to_cf_elements(parsed)
        assert len(cf_elements) == 1
        first = cf_elements[0]
        assert first["type"] == "paragraph"
        assert first["text"] == "Hello world"
        assert cf_tables == []

    def test_table_with_html_goes_to_tables(self):
        """A table that carries HTML is expected in the tables list only."""
        table = DolphinElement(
            "table", "col1 col2", html="<table><tr><td>A</td></tr></table>"
        )
        parsed = self._make_result([table])
        cf_elements, cf_tables = dolphin_to_cf_elements(parsed)
        assert len(cf_tables) == 1
        assert "<table>" in cf_tables[0]["html"]
        assert cf_elements == []

    def test_table_without_html_goes_to_elements(self):
        """A table with ``html=None`` is expected in the elements list instead."""
        table = DolphinElement("table", "some table text", html=None)
        parsed = self._make_result([table])
        cf_elements, cf_tables = dolphin_to_cf_elements(parsed)
        assert len(cf_elements) == 1
        assert cf_tables == []

    def test_bbox_preserved(self):
        """The bounding box given on the input element appears on the output element."""
        boxed = DolphinElement("paragraph", "text", bbox=[0.1, 0.2, 0.8, 0.3])
        parsed = self._make_result([boxed])
        cf_elements, _ = dolphin_to_cf_elements(parsed)
        assert cf_elements[0]["bbox"] == [0.1, 0.2, 0.8, 0.3]

    def test_mixed_elements_and_tables(self):
        """Title and paragraph land in elements; the HTML table lands in tables."""
        mixed = [
            DolphinElement("title", "Document Title"),
            DolphinElement("table", "data", html="<table/>"),
            DolphinElement("paragraph", "Body text"),
        ]
        parsed = self._make_result(mixed)
        cf_elements, cf_tables = dolphin_to_cf_elements(parsed)
        assert len(cf_elements) == 2
        assert len(cf_tables) == 1
        # The title comes first and is reported under the "heading" type.
        assert cf_elements[0]["type"] == "heading"

    def test_empty_result(self):
        """A default-constructed DolphinResult converts to two empty lists."""
        parsed = DolphinResult()
        cf_elements, cf_tables = dolphin_to_cf_elements(parsed)
        assert cf_elements == []
        assert cf_tables == []
class TestParseOutputFallbacks:
    """Exercise ``_parse_dolphin_output`` on a parser built without its constructor."""

    def _make_parser(self) -> DolphinParser:
        """Return a DolphinParser instance whose ``__init__`` was never run.

        Only ``_model_id`` and ``_device`` are set on the instance.
        """
        # object.__new__ allocates the instance without calling __init__.
        bare_parser = object.__new__(DolphinParser)
        bare_parser._model_id = "ByteDance/Dolphin-v2"
        bare_parser._device = "cpu"
        return bare_parser

    def test_json_array_output(self):
        """A JSON array with one object parses to one paragraph element."""
        model_output = '[{"type": "paragraph", "text": "Hello", "bbox": null, "html": null}]'
        parsed = self._make_parser()._parse_dolphin_output(model_output)
        assert len(parsed) == 1
        assert parsed[0]["type"] == "paragraph"

    def test_element_tag_output(self):
        """An ``<element>`` tag yields its type, inner text and bbox."""
        model_output = '<element type="title" bbox="[0.1,0.2,0.8,0.3]">My Title</element>'
        parsed = self._make_parser()._parse_dolphin_output(model_output)
        assert len(parsed) == 1
        first = parsed[0]
        assert first["type"] == "title"
        assert first["text"] == "My Title"
        assert first["bbox"] == [0.1, 0.2, 0.8, 0.3]

    def test_element_tag_without_bbox(self):
        """An ``<element>`` tag with no bbox attribute yields ``bbox`` of None."""
        model_output = '<element type="paragraph">Plain text</element>'
        parsed = self._make_parser()._parse_dolphin_output(model_output)
        assert parsed[0]["bbox"] is None

    def test_fallback_to_single_paragraph(self):
        """Plain unstructured text comes back as a single paragraph element."""
        model_output = "This is some unstructured text output."
        parsed = self._make_parser()._parse_dolphin_output(model_output)
        assert len(parsed) == 1
        first = parsed[0]
        assert first["type"] == "paragraph"
        assert "unstructured text" in first["text"]

    def test_empty_output(self):
        """An empty string parses to an empty list."""
        parsed = self._make_parser()._parse_dolphin_output("")
        assert parsed == []
class TestFastAPIRoutes:
    """Integration tests for the FastAPI endpoints using TestClient.

    Every test replaces the module-level ``app.main._parser`` with a mock or
    with ``None``. The value found before each test is restored afterwards so
    that this shared state does not leak into tests that run later.
    """

    def setup_method(self):
        """Record the parser currently installed on ``app.main``."""
        # Imported here rather than at module level, matching how the tests
        # themselves import the app.
        import app.main as main_module

        self._main_module = main_module
        self._saved_parser = getattr(main_module, "_parser", None)

    def teardown_method(self):
        """Put back the parser that was installed before the test ran."""
        self._main_module._parser = self._saved_parser

    @staticmethod
    def _b64(raw: bytes) -> str:
        """Return *raw* base64-encoded as text, as sent in ``image_b64``."""
        import base64

        return base64.b64encode(raw).decode()

    def _make_app_with_mock_parser(self):
        """Install a mock parser on ``app.main`` and return a TestClient for the app.

        The mock's ``parse_b64`` returns a DolphinResult holding one paragraph
        element with the text "Extracted text".
        """
        from fastapi.testclient import TestClient
        import app.main as main_module

        mock_parser = MagicMock()
        mock_parser._model_id = "ByteDance/Dolphin-v2"
        mock_parser.parse_b64.return_value = DolphinResult(
            elements=[DolphinElement("paragraph", "Extracted text")],
            raw_text="Extracted text",
        )
        main_module._parser = mock_parser
        return TestClient(main_module.app)

    def test_health_with_loaded_model(self):
        """With a parser installed, GET /health answers 200 and status "ok"."""
        client = self._make_app_with_mock_parser()
        resp = client.get("/health")
        assert resp.status_code == 200
        assert resp.json()["status"] == "ok"

    def test_health_without_model_returns_503(self):
        """With ``_parser`` set to None, GET /health answers 503."""
        from fastapi.testclient import TestClient
        import app.main as main_module

        main_module._parser = None
        # Server-side exceptions are turned into responses instead of being
        # re-raised into the test.
        client = TestClient(main_module.app, raise_server_exceptions=False)
        resp = client.get("/health")
        assert resp.status_code == 503

    def test_extract_returns_structured_response(self):
        """POST /extract answers 200 with elements, tables, raw_text and metadata."""
        client = self._make_app_with_mock_parser()
        payload = {
            "image_b64": self._b64(b"fake-image-bytes"),
            "hint": "auto",
        }
        resp = client.post("/extract", json=payload)
        assert resp.status_code == 200
        data = resp.json()
        assert "elements" in data
        assert "tables" in data
        assert "raw_text" in data
        assert data["metadata"]["source"] == "cf-docuvision"

    def test_extract_invalid_hint_returns_422(self):
        """POST /extract with the hint "invalid" answers 422."""
        client = self._make_app_with_mock_parser()
        payload = {
            "image_b64": self._b64(b"fake"),
            "hint": "invalid",
        }
        resp = client.post("/extract", json=payload)
        assert resp.status_code == 422