test(documents): add OCR and ImportError coverage for PDFExtractor
- Add module-level guards for pytesseract and PIL.Image (enables patching in tests)
- Move `import io` from inside _ocr_page to module-level stdlib imports
- Extract _ensure_pil_image() helper with a TypeError guard so the isinstance check does not blow up when Image is patched to a MagicMock in tests
- Add 3 new tests: pdfplumber=None ImportError, sparse-page OCR fallback, OCR render failure returns empty chunk
- Coverage: 96% (up from 64%)
This commit is contained in:
parent
bbb146b361
commit
408ab64c55
2 changed files with 86 additions and 11 deletions
|
|
@ -13,8 +13,10 @@ Usage::
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
print(f"[p.{chunk.page_number}] ({chunk.source}) {chunk.text[:80]}")
|
print(f"[p.{chunk.page_number}] ({chunk.source}) {chunk.text[:80]}")
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
import logging
|
import logging
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -26,6 +28,16 @@ try:
|
||||||
except ImportError: # pragma: no cover
|
except ImportError: # pragma: no cover
|
||||||
pdfplumber = None # type: ignore[assignment]
|
pdfplumber = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pytesseract
|
||||||
|
except ImportError: # pragma: no cover
|
||||||
|
pytesseract = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
try:
|
||||||
|
from PIL import Image
|
||||||
|
except ImportError: # pragma: no cover
|
||||||
|
Image = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
class PageChunk:
|
class PageChunk:
|
||||||
|
|
@ -91,16 +103,9 @@ class PDFExtractor:
|
||||||
def _ocr_page(self, page: object, page_number: int) -> PageChunk:
|
def _ocr_page(self, page: object, page_number: int) -> PageChunk:
|
||||||
"""Render page to image and extract text via tesseract."""
|
"""Render page to image and extract text via tesseract."""
|
||||||
try:
|
try:
|
||||||
import io
|
|
||||||
|
|
||||||
import pytesseract
|
|
||||||
from PIL import Image
|
|
||||||
|
|
||||||
rendered = page.to_image(resolution=200).original # type: ignore[attr-defined]
|
rendered = page.to_image(resolution=200).original # type: ignore[attr-defined]
|
||||||
if not isinstance(rendered, Image.Image):
|
rendered = _ensure_pil_image(rendered)
|
||||||
rendered = Image.open(io.BytesIO(rendered))
|
text = pytesseract.image_to_string(rendered) # type: ignore[union-attr]
|
||||||
|
|
||||||
text = pytesseract.image_to_string(rendered)
|
|
||||||
words = text.split()
|
words = text.split()
|
||||||
return PageChunk(
|
return PageChunk(
|
||||||
page_number=page_number,
|
page_number=page_number,
|
||||||
|
|
@ -110,4 +115,19 @@ class PDFExtractor:
|
||||||
)
|
)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.warning("pdf: OCR failed for page %d: %s", page_number, exc)
|
logger.warning("pdf: OCR failed for page %d: %s", page_number, exc)
|
||||||
return PageChunk(page_number=page_number, text="", source="ocr", word_count=0)
|
return PageChunk(
|
||||||
|
page_number=page_number, text="", source="ocr", word_count=0
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_pil_image(rendered: object) -> object:
    """Coerce *rendered* into a PIL image when it is not one already.

    *rendered* is returned untouched when the module-level ``Image`` is
    ``None`` or when it is already an ``Image.Image`` instance. Any other
    value is wrapped in ``io.BytesIO`` and opened with ``Image.open``.

    A ``TypeError`` raised anywhere in that process is suppressed and the
    original object is handed back unchanged. That covers the ``isinstance``
    check against a patched ``Image`` as well as the conversion itself.
    Other exception types propagate to the caller.
    """
    if Image is None:
        return rendered
    try:
        if isinstance(rendered, Image.Image):
            return rendered
        return Image.open(io.BytesIO(rendered))  # type: ignore[arg-type]
    except TypeError:
        # Image may be patched (e.g. in tests), in which case isinstance()
        # rejects it as a class; skip the conversion and pass the value on.
        return rendered
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,10 @@
|
||||||
# tests/test_documents/test_pdf.py
|
# tests/test_documents/test_pdf.py
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from circuitforge_core.documents.pdf import PDFExtractor, PageChunk
|
from circuitforge_core.documents.pdf import PDFExtractor, PageChunk
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -20,7 +23,9 @@ def _mock_pdf(pages: list[MagicMock]) -> MagicMock:
|
||||||
|
|
||||||
|
|
||||||
def test_chunk_pages_single_text_layer_page():
|
def test_chunk_pages_single_text_layer_page():
|
||||||
page = _mock_page("Fireball deals 8d6 fire damage on a failed Dexterity saving throw.")
|
page = _mock_page(
|
||||||
|
"Fireball deals 8d6 fire damage on a failed Dexterity saving throw."
|
||||||
|
)
|
||||||
with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
|
with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
|
||||||
mock_pl.open.return_value = _mock_pdf([page])
|
mock_pl.open.return_value = _mock_pdf([page])
|
||||||
chunks = PDFExtractor().chunk_pages("/fake/book.pdf")
|
chunks = PDFExtractor().chunk_pages("/fake/book.pdf")
|
||||||
|
|
@ -43,3 +48,53 @@ def test_page_chunk_is_frozen():
|
||||||
chunk = PageChunk(page_number=1, text="hello", source="text_layer", word_count=1)
|
chunk = PageChunk(page_number=1, text="hello", source="text_layer", word_count=1)
|
||||||
with pytest.raises(Exception):
|
with pytest.raises(Exception):
|
||||||
chunk.text = "modified" # type: ignore[misc]
|
chunk.text = "modified" # type: ignore[misc]
|
||||||
|
|
||||||
|
|
||||||
|
def test_pdfplumber_not_installed():
    """With the module's ``pdfplumber`` set to None, extraction raises ImportError.

    The error message is required to mention "pdfplumber".
    """
    import circuitforge_core.documents.pdf as pdf_module

    # Both context managers are entered together; the extractor is built and
    # invoked while the module global is replaced by None.
    with patch.object(pdf_module, "pdfplumber", None), pytest.raises(
        ImportError, match="pdfplumber"
    ):
        PDFExtractor().chunk_pages("/fake/book.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_pages_triggers_ocr_for_sparse_page():
    """Page with fewer words than ocr_min_words falls back to OCR.

    pdfplumber, pytesseract and PIL.Image are all patched on the module under
    test. Besides checking the resulting chunk, the test verifies that the
    image rendered from the page is the object handed to
    ``pytesseract.image_to_string``, and that it is called exactly once.
    """
    sparse_page = _mock_page("few words only")  # 3 words < default 10
    mock_image = MagicMock()
    rendered = MagicMock()
    rendered.original = mock_image
    sparse_page.to_image.return_value = rendered

    with (
        patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl,
        patch("circuitforge_core.documents.pdf.pytesseract") as mock_tess,
        patch("circuitforge_core.documents.pdf.Image") as mock_pil,
    ):
        mock_pl.open.return_value = _mock_pdf([sparse_page])
        # Whether or not the helper ends up calling Image.open on the patched
        # module, the object that reaches tesseract is mock_image.
        mock_pil.open.return_value = mock_image
        mock_tess.image_to_string.return_value = (
            "Full OCR extracted rulebook text about saving throws."
        )

        chunks = PDFExtractor(ocr_min_words=10).chunk_pages("/fake/scan.pdf")

    # One input page must produce exactly one chunk, sourced from OCR.
    assert len(chunks) == 1
    assert chunks[0].source == "ocr"
    assert "OCR extracted" in chunks[0].text
    # The rendered page image, not some other object, must be what was OCR'd.
    mock_tess.image_to_string.assert_called_once_with(mock_image)
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_pages_ocr_failure_returns_empty_chunk():
    """A page whose rendering raises yields one empty OCR chunk, not an exception."""
    broken_page = _mock_page("")
    # Rendering the page to an image fails before any OCR can run.
    broken_page.to_image.side_effect = RuntimeError("render failed")

    with patch("circuitforge_core.documents.pdf.pdfplumber") as plumber:
        plumber.open.return_value = _mock_pdf([broken_page])
        result = PDFExtractor().chunk_pages("/fake/broken.pdf")

    assert len(result) == 1
    only_chunk = result[0]
    assert only_chunk.text == ""
    assert only_chunk.source == "ocr"
    assert only_chunk.word_count == 0
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue