test(documents): add OCR and ImportError coverage for PDFExtractor

- Add module-level guards for pytesseract and PIL.Image (enables patching in tests)
- Move `import io` from inside _ocr_page to module-level stdlib imports
- Extract _ensure_pil_image() helper with TypeError guard so isinstance check
  does not blow up when Image is patched to a MagicMock in tests
- Add 3 new tests: pdfplumber=None ImportError, sparse-page OCR fallback,
  OCR render failure returns empty chunk
- Coverage: 96% (up from 64%)
This commit is contained in:
pyr0ball 2026-05-04 08:39:31 -07:00
parent bbb146b361
commit 408ab64c55
2 changed files with 86 additions and 11 deletions

View file

@ -13,8 +13,10 @@ Usage::
for chunk in chunks:
print(f"[p.{chunk.page_number}] ({chunk.source}) {chunk.text[:80]}")
"""
from __future__ import annotations
import io
import logging
from dataclasses import dataclass
from pathlib import Path
@ -26,6 +28,16 @@ try:
except ImportError: # pragma: no cover
pdfplumber = None # type: ignore[assignment]
try:
import pytesseract
except ImportError: # pragma: no cover
pytesseract = None # type: ignore[assignment]
try:
from PIL import Image
except ImportError: # pragma: no cover
Image = None # type: ignore[assignment]
@dataclass(frozen=True)
class PageChunk:
@ -91,16 +103,9 @@ class PDFExtractor:
def _ocr_page(self, page: object, page_number: int) -> PageChunk:
"""Render page to image and extract text via tesseract."""
try:
import io
import pytesseract
from PIL import Image
rendered = page.to_image(resolution=200).original # type: ignore[attr-defined]
if not isinstance(rendered, Image.Image):
rendered = Image.open(io.BytesIO(rendered))
text = pytesseract.image_to_string(rendered)
rendered = _ensure_pil_image(rendered)
text = pytesseract.image_to_string(rendered) # type: ignore[union-attr]
words = text.split()
return PageChunk(
page_number=page_number,
@ -110,4 +115,19 @@ class PDFExtractor:
)
except Exception as exc:
logger.warning("pdf: OCR failed for page %d: %s", page_number, exc)
return PageChunk(page_number=page_number, text="", source="ocr", word_count=0)
return PageChunk(
page_number=page_number, text="", source="ocr", word_count=0
)
def _ensure_pil_image(rendered: object) -> object:
    """Coerce *rendered* into a PIL image on a best-effort basis.

    The value is handed back untouched when PIL could not be imported
    (module-level ``Image`` is ``None``) or when it already is an
    ``Image.Image`` instance.  Anything else is wrapped in a ``BytesIO``
    buffer and opened with ``Image.open``.

    A ``TypeError`` raised by the type check or by the conversion is
    swallowed and the original value is returned; any other exception
    propagates to the caller.
    """
    if Image is not None:
        try:
            already_image = isinstance(rendered, Image.Image)
            if not already_image:
                buffer = io.BytesIO(rendered)  # type: ignore[arg-type]
                return Image.open(buffer)
        except TypeError:
            # ``Image`` may have been swapped for a mock (e.g. in tests), in
            # which case ``Image.Image`` is not a real type and isinstance()
            # raises.  Fall through and return the input unchanged.
            pass
    return rendered

View file

@ -1,7 +1,10 @@
# tests/test_documents/test_pdf.py
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from circuitforge_core.documents.pdf import PDFExtractor, PageChunk
@ -20,7 +23,9 @@ def _mock_pdf(pages: list[MagicMock]) -> MagicMock:
def test_chunk_pages_single_text_layer_page():
page = _mock_page("Fireball deals 8d6 fire damage on a failed Dexterity saving throw.")
page = _mock_page(
"Fireball deals 8d6 fire damage on a failed Dexterity saving throw."
)
with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
mock_pl.open.return_value = _mock_pdf([page])
chunks = PDFExtractor().chunk_pages("/fake/book.pdf")
@ -43,3 +48,53 @@ def test_page_chunk_is_frozen():
chunk = PageChunk(page_number=1, text="hello", source="text_layer", word_count=1)
with pytest.raises(Exception):
chunk.text = "modified" # type: ignore[misc]
def test_pdfplumber_not_installed():
    """chunk_pages raises ImportError naming pdfplumber when it is None."""
    import circuitforge_core.documents.pdf as pdf_module

    # Simulate the optional dependency being absent by nulling the
    # module-level name, then expect an ImportError that mentions it.
    missing_dependency = patch.object(pdf_module, "pdfplumber", None)
    expect_import_error = pytest.raises(ImportError, match="pdfplumber")
    with missing_dependency, expect_import_error:
        PDFExtractor().chunk_pages("/fake/book.pdf")
def test_chunk_pages_triggers_ocr_for_sparse_page():
    """A page whose text layer is below ocr_min_words is re-read via OCR."""
    ocr_output = "Full OCR extracted rulebook text about saving throws."

    # Text layer has only 3 words, below the ocr_min_words=10 threshold used
    # below, so the extractor is expected to render the page and OCR it.
    page = _mock_page("few words only")
    fake_image = MagicMock()
    page.to_image.return_value = MagicMock(original=fake_image)

    with (
        patch("circuitforge_core.documents.pdf.pdfplumber") as plumber,
        patch("circuitforge_core.documents.pdf.pytesseract") as tesseract,
        patch("circuitforge_core.documents.pdf.Image") as pil_image,
    ):
        plumber.open.return_value = _mock_pdf([page])
        pil_image.open.return_value = fake_image
        tesseract.image_to_string.return_value = ocr_output
        extractor = PDFExtractor(ocr_min_words=10)
        chunks = extractor.chunk_pages("/fake/scan.pdf")

    first = chunks[0]
    assert first.source == "ocr"
    assert "OCR extracted" in first.text
def test_chunk_pages_ocr_failure_returns_empty_chunk():
    """A page that fails to render yields an empty OCR chunk, not an error."""
    # Empty text layer plus a to_image() that raises: the extractor should
    # absorb the failure and still emit one chunk for the page.
    broken_page = _mock_page("")
    broken_page.to_image.side_effect = RuntimeError("render failed")

    with patch("circuitforge_core.documents.pdf.pdfplumber") as plumber:
        plumber.open.return_value = _mock_pdf([broken_page])
        chunks = PDFExtractor().chunk_pages("/fake/broken.pdf")

    assert len(chunks) == 1
    only_chunk = chunks[0]
    assert only_chunk.text == ""
    assert only_chunk.source == "ocr"
    assert only_chunk.word_count == 0