FastAPI microservice wrapping ByteDance/Dolphin-v2 (Qwen2.5-VL-3B base) for structured document extraction. Exposes POST /extract and GET /health. Maps Dolphin's 21 element types to cf-core's 7-type canonical schema. Services: cf-text /extract, /health. Env vars: CF_DOCUVISION_MODEL, CF_DOCUVISION_DEVICE, CF_DOCUVISION_PORT. GPU: 8GB+ VRAM required for Dolphin-v2; CPU fallback available but very slow.
190 lines
6.9 KiB
Python
190 lines
6.9 KiB
Python
"""Tests for cf-docuvision — mock inference path only (no GPU required)."""
|
|
import pytest
|
|
from unittest.mock import MagicMock, patch
|
|
from app.dolphin import (
|
|
DolphinElement,
|
|
DolphinParser,
|
|
DolphinResult,
|
|
dolphin_to_cf_elements,
|
|
_TYPE_MAP,
|
|
)
|
|
|
|
|
|
class TestTypeMap:
    """Checks on the ``_TYPE_MAP`` table that translates Dolphin type names."""

    def test_all_21_dolphin_types_mapped(self):
        """The key set of ``_TYPE_MAP`` must equal exactly these 21 names."""
        dolphin_types = (
            "title", "section_header", "paragraph", "caption", "footnote",
            "page_header", "page_footer", "list_item", "table", "figure",
            "figure_caption", "formula", "code", "annotation", "abstract",
            "toc_item", "reference", "equation", "watermark", "stamp", "signature",
        )
        # Exact equality: a missing key and an unexpected extra key both fail.
        assert set(dolphin_types) == set(_TYPE_MAP)

    def test_table_maps_to_table(self):
        """``table`` is mapped to ``table``."""
        mapped = _TYPE_MAP["table"]
        assert mapped == "table"

    def test_title_maps_to_heading(self):
        """``title`` is mapped to ``heading``."""
        mapped = _TYPE_MAP["title"]
        assert mapped == "heading"

    def test_formula_maps_to_formula(self):
        """``formula`` is mapped to ``formula``."""
        mapped = _TYPE_MAP["formula"]
        assert mapped == "formula"
|
|
|
|
|
|
class TestDolphinToCfElements:
    """Exercise ``dolphin_to_cf_elements`` on hand-built ``DolphinResult`` objects."""

    def _make_result(self, elements: list[DolphinElement]) -> DolphinResult:
        """Wrap *elements* in a ``DolphinResult``.

        ``raw_text`` is the newline-joined text of every element whose text
        is truthy.
        """
        non_empty_texts = [item.text for item in elements if item.text]
        return DolphinResult(elements=elements, raw_text="\n".join(non_empty_texts))

    def test_paragraph_goes_to_elements(self):
        """A lone paragraph comes back as one element and no tables."""
        paragraph = DolphinElement("paragraph", "Hello world")
        cf_elements, cf_tables = dolphin_to_cf_elements(self._make_result([paragraph]))
        assert len(cf_elements) == 1
        assert cf_elements[0]["type"] == "paragraph"
        assert cf_elements[0]["text"] == "Hello world"
        assert cf_tables == []

    def test_table_with_html_goes_to_tables(self):
        """A table that carries HTML is returned in the tables list only."""
        html_table = DolphinElement(
            "table", "col1 col2", html="<table><tr><td>A</td></tr></table>"
        )
        cf_elements, cf_tables = dolphin_to_cf_elements(self._make_result([html_table]))
        assert len(cf_tables) == 1
        assert "<table>" in cf_tables[0]["html"]
        assert cf_elements == []

    def test_table_without_html_goes_to_elements(self):
        """A table with ``html=None`` is returned as an element, not a table."""
        plain_table = DolphinElement("table", "some table text", html=None)
        cf_elements, cf_tables = dolphin_to_cf_elements(self._make_result([plain_table]))
        assert len(cf_elements) == 1
        assert cf_tables == []

    def test_bbox_preserved(self):
        """The bbox given to the Dolphin element appears unchanged in the output."""
        boxed = DolphinElement("paragraph", "text", bbox=[0.1, 0.2, 0.8, 0.3])
        cf_elements, _ = dolphin_to_cf_elements(self._make_result([boxed]))
        assert cf_elements[0]["bbox"] == [0.1, 0.2, 0.8, 0.3]

    def test_mixed_elements_and_tables(self):
        """Title, HTML table and paragraph split into two elements and one table."""
        page = [
            DolphinElement("title", "Document Title"),
            DolphinElement("table", "data", html="<table/>"),
            DolphinElement("paragraph", "Body text"),
        ]
        cf_elements, cf_tables = dolphin_to_cf_elements(self._make_result(page))
        assert len(cf_elements) == 2
        assert len(cf_tables) == 1
        # The title is the first element and is reported with type "heading".
        assert cf_elements[0]["type"] == "heading"

    def test_empty_result(self):
        """A default-constructed ``DolphinResult`` yields two empty lists."""
        cf_elements, cf_tables = dolphin_to_cf_elements(DolphinResult())
        assert cf_elements == []
        assert cf_tables == []
|
|
|
|
|
|
class TestParseOutputFallbacks:
    """Feed raw strings to ``_parse_dolphin_output`` on a parser built without ``__init__``."""

    def _make_parser(self) -> DolphinParser:
        """Allocate a ``DolphinParser`` while skipping its ``__init__``.

        Only ``_model_id`` and ``_device`` are set by hand on the bare instance.
        """
        bare = object.__new__(DolphinParser)
        bare._model_id = "ByteDance/Dolphin-v2"
        bare._device = "cpu"
        return bare

    def test_json_array_output(self):
        """A JSON array with one object parses to one paragraph element."""
        model_output = '[{"type": "paragraph", "text": "Hello", "bbox": null, "html": null}]'
        parsed = self._make_parser()._parse_dolphin_output(model_output)
        assert len(parsed) == 1
        assert parsed[0]["type"] == "paragraph"

    def test_element_tag_output(self):
        """An ``<element>`` tag yields its type, inner text and bbox."""
        model_output = '<element type="title" bbox="[0.1,0.2,0.8,0.3]">My Title</element>'
        parsed = self._make_parser()._parse_dolphin_output(model_output)
        assert len(parsed) == 1
        assert parsed[0]["type"] == "title"
        assert parsed[0]["text"] == "My Title"
        assert parsed[0]["bbox"] == [0.1, 0.2, 0.8, 0.3]

    def test_element_tag_without_bbox(self):
        """An ``<element>`` tag with no bbox attribute gives ``bbox`` of ``None``."""
        model_output = '<element type="paragraph">Plain text</element>'
        parsed = self._make_parser()._parse_dolphin_output(model_output)
        assert parsed[0]["bbox"] is None

    def test_fallback_to_single_paragraph(self):
        """Plain unstructured text comes back as a single paragraph element."""
        model_output = "This is some unstructured text output."
        parsed = self._make_parser()._parse_dolphin_output(model_output)
        assert len(parsed) == 1
        assert parsed[0]["type"] == "paragraph"
        assert "unstructured text" in parsed[0]["text"]

    def test_empty_output(self):
        """An empty string parses to an empty list."""
        parsed = self._make_parser()._parse_dolphin_output("")
        assert parsed == []
|
|
|
|
|
|
class TestFastAPIRoutes:
    """Integration tests for the FastAPI endpoints using TestClient.

    Every test replaces the module-level ``app.main._parser`` (with a mock or
    with ``None``). ``setup_method`` and ``teardown_method`` snapshot and
    restore that global around each test so the replacement cannot leak into
    tests that run afterwards or make results depend on execution order.
    """

    def setup_method(self):
        """Remember the current ``app.main._parser`` before the test touches it."""
        import app.main as main_module

        self._saved_parser = getattr(main_module, "_parser", None)

    def teardown_method(self):
        """Put back the ``app.main._parser`` value captured in ``setup_method``."""
        import app.main as main_module

        main_module._parser = self._saved_parser

    def _make_app_with_mock_parser(self):
        """Install a mock parser in ``app.main`` and return a ``TestClient``.

        The mock's ``parse_b64`` returns a fixed ``DolphinResult`` holding one
        paragraph element with the text ``"Extracted text"``.
        """
        from fastapi.testclient import TestClient
        import app.main as main_module

        mock_parser = MagicMock()
        mock_parser._model_id = "ByteDance/Dolphin-v2"
        mock_parser.parse_b64.return_value = DolphinResult(
            elements=[DolphinElement("paragraph", "Extracted text")],
            raw_text="Extracted text",
        )

        # Restored by teardown_method once the test finishes.
        main_module._parser = mock_parser
        return TestClient(main_module.app)

    def test_health_with_loaded_model(self):
        """GET /health answers 200 with status "ok" when a parser is installed."""
        client = self._make_app_with_mock_parser()
        resp = client.get("/health")
        assert resp.status_code == 200
        assert resp.json()["status"] == "ok"

    def test_health_without_model_returns_503(self):
        """GET /health answers 503 when ``_parser`` is ``None``."""
        from fastapi.testclient import TestClient
        import app.main as main_module

        # Restored by teardown_method once the test finishes.
        main_module._parser = None
        # Have the client return the error response instead of re-raising
        # server-side exceptions into the test.
        client = TestClient(main_module.app, raise_server_exceptions=False)
        resp = client.get("/health")
        assert resp.status_code == 503

    def test_extract_returns_structured_response(self):
        """POST /extract answers 200 with elements, tables, raw_text and metadata."""
        import base64

        client = self._make_app_with_mock_parser()
        payload = {
            "image_b64": base64.b64encode(b"fake-image-bytes").decode(),
            "hint": "auto",
        }
        resp = client.post("/extract", json=payload)
        assert resp.status_code == 200
        data = resp.json()
        assert "elements" in data
        assert "tables" in data
        assert "raw_text" in data
        assert data["metadata"]["source"] == "cf-docuvision"

    def test_extract_invalid_hint_returns_422(self):
        """POST /extract with the hint "invalid" is rejected with 422."""
        import base64

        client = self._make_app_with_mock_parser()
        payload = {
            "image_b64": base64.b64encode(b"fake").decode(),
            "hint": "invalid",
        }
        resp = client.post("/extract", json=payload)
        assert resp.status_code == 422
|