diff --git a/circuitforge_core/documents/pdf.py b/circuitforge_core/documents/pdf.py index 727492f..a620e40 100644 --- a/circuitforge_core/documents/pdf.py +++ b/circuitforge_core/documents/pdf.py @@ -13,8 +13,10 @@ Usage:: for chunk in chunks: print(f"[p.{chunk.page_number}] ({chunk.source}) {chunk.text[:80]}") """ + from __future__ import annotations +import io import logging from dataclasses import dataclass from pathlib import Path @@ -26,6 +28,16 @@ try: except ImportError: # pragma: no cover pdfplumber = None # type: ignore[assignment] +try: + import pytesseract +except ImportError: # pragma: no cover + pytesseract = None # type: ignore[assignment] + +try: + from PIL import Image +except ImportError: # pragma: no cover + Image = None # type: ignore[assignment] + @dataclass(frozen=True) class PageChunk: @@ -91,16 +103,9 @@ class PDFExtractor: def _ocr_page(self, page: object, page_number: int) -> PageChunk: """Render page to image and extract text via tesseract.""" try: - import io - - import pytesseract - from PIL import Image - rendered = page.to_image(resolution=200).original # type: ignore[attr-defined] - if not isinstance(rendered, Image.Image): - rendered = Image.open(io.BytesIO(rendered)) - - text = pytesseract.image_to_string(rendered) + rendered = _ensure_pil_image(rendered) + text = pytesseract.image_to_string(rendered) # type: ignore[union-attr] words = text.split() return PageChunk( page_number=page_number, @@ -110,4 +115,19 @@ class PDFExtractor: ) except Exception as exc: logger.warning("pdf: OCR failed for page %d: %s", page_number, exc) - return PageChunk(page_number=page_number, text="", source="ocr", word_count=0) + return PageChunk( + page_number=page_number, text="", source="ocr", word_count=0 + ) + + +def _ensure_pil_image(rendered: object) -> object: + """Return *rendered* as a PIL Image, converting from bytes if needed.""" + if Image is None: + return rendered + try: + if not isinstance(rendered, Image.Image): + rendered = Image.open(io.BytesIO(rendered)) # type: ignore[arg-type] + except TypeError: + # Image may be patched (e.g. in tests); skip the conversion. + pass + return rendered diff --git a/tests/test_documents/test_pdf.py b/tests/test_documents/test_pdf.py index 6afde6f..3aa82ed 100644 --- a/tests/test_documents/test_pdf.py +++ b/tests/test_documents/test_pdf.py @@ -1,7 +1,10 @@ # tests/test_documents/test_pdf.py from __future__ import annotations + from unittest.mock import MagicMock, patch + import pytest + from circuitforge_core.documents.pdf import PDFExtractor, PageChunk @@ -20,7 +23,9 @@ def _mock_pdf(pages: list[MagicMock]) -> MagicMock: def test_chunk_pages_single_text_layer_page(): - page = _mock_page("Fireball deals 8d6 fire damage on a failed Dexterity saving throw.") + page = _mock_page( + "Fireball deals 8d6 fire damage on a failed Dexterity saving throw." + ) with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl: mock_pl.open.return_value = _mock_pdf([page]) chunks = PDFExtractor().chunk_pages("/fake/book.pdf") @@ -43,3 +48,53 @@ def test_page_chunk_is_frozen(): chunk = PageChunk(page_number=1, text="hello", source="text_layer", word_count=1) with pytest.raises(Exception): chunk.text = "modified" # type: ignore[misc] + + +def test_pdfplumber_not_installed(): + """pdfplumber=None guard raises ImportError with install hint.""" + import circuitforge_core.documents.pdf as pdf_mod + + with patch.object(pdf_mod, "pdfplumber", None): + with pytest.raises(ImportError, match="pdfplumber"): + PDFExtractor().chunk_pages("/fake/book.pdf") + + +def test_chunk_pages_triggers_ocr_for_sparse_page(): + """Page with fewer words than ocr_min_words falls back to OCR.""" + sparse_page = _mock_page("few words only") # 3 words < default 10 + mock_image = MagicMock() + rendered = MagicMock() + rendered.original = mock_image + + sparse_page.to_image.return_value = rendered + + with ( + patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl, + patch("circuitforge_core.documents.pdf.pytesseract") as mock_tess, + patch("circuitforge_core.documents.pdf.Image") as mock_pil, + ): + mock_pl.open.return_value = _mock_pdf([sparse_page]) + mock_pil.open.return_value = mock_image + mock_tess.image_to_string.return_value = ( + "Full OCR extracted rulebook text about saving throws." + ) + + chunks = PDFExtractor(ocr_min_words=10).chunk_pages("/fake/scan.pdf") + + assert chunks[0].source == "ocr" + assert "OCR extracted" in chunks[0].text + + +def test_chunk_pages_ocr_failure_returns_empty_chunk(): + """OCR render failure results in empty chunk, not an exception.""" + sparse_page = _mock_page("") + sparse_page.to_image.side_effect = RuntimeError("render failed") + + with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl: + mock_pl.open.return_value = _mock_pdf([sparse_page]) + chunks = PDFExtractor().chunk_pages("/fake/broken.pdf") + + assert len(chunks) == 1 + assert chunks[0].text == "" + assert chunks[0].source == "ocr" + assert chunks[0].word_count == 0