# circuitforge_core/documents/pdf.py
"""
circuitforge_core.documents.pdf — PDF text extraction and page-level chunking.

Primary path: pdfplumber (selectable text layers).
Fallback: pytesseract OCR (scanned / image-only pages).

Usage::

    from circuitforge_core.documents.pdf import PDFExtractor

    chunks = PDFExtractor().chunk_pages("/path/to/book.pdf")
    for chunk in chunks:
        print(f"[p.{chunk.page_number}] ({chunk.source}) {chunk.text[:80]}")
"""
import logging
from dataclasses import dataclass
from pathlib import Path

logger = logging.getLogger(__name__)

# pdfplumber is an optional dependency: importing this module must not fail
# without it. The absence is reported when chunk_pages() is actually called.
try:
    import pdfplumber
except ImportError:  # pragma: no cover
    pdfplumber = None  # type: ignore[assignment]


@dataclass(frozen=True)
class PageChunk:
    """Text content extracted from a single PDF page."""

    page_number: int  # 1-indexed
    text: str  # extracted text with leading/trailing whitespace stripped
    source: str  # "text_layer" | "ocr"
    word_count: int  # number of whitespace-separated tokens in the text


class PDFExtractor:
    """
    Extract page-level text chunks from PDF files.

    Args:
        ocr_min_words: Pages with fewer words from the text layer trigger OCR.
    """

    def __init__(self, ocr_min_words: int = 10) -> None:
        self.ocr_min_words = ocr_min_words

    def chunk_pages(self, pdf_path: str | Path) -> list[PageChunk]:
        """
        Primary entry point. Returns one PageChunk per page.

        Uses text-layer extraction per page; falls back to OCR when text is
        sparse. If OCR fails or finds fewer words than the sparse text layer,
        the text-layer content is kept rather than discarded.
        Empty PDFs return an empty list.

        Args:
            pdf_path: Location of the PDF file to read.

        Returns:
            Chunks in page order, numbered from 1.

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if pdfplumber is None:
            raise ImportError(
                "pdfplumber is required for PDF extraction. "
                "Install it with: pip install pdfplumber"
            )

        with pdfplumber.open(Path(pdf_path)) as pdf:
            return [
                self._chunk_page(page, number)
                for number, page in enumerate(pdf.pages, start=1)
            ]

    def _chunk_page(self, page: object, page_number: int) -> PageChunk:
        """Build the chunk for one page, choosing between text layer and OCR."""
        # extract_text() may return None; treat that as an empty page.
        text = page.extract_text() or ""  # type: ignore[attr-defined]
        word_count = len(text.split())
        text_chunk = PageChunk(
            page_number=page_number,
            text=text.strip(),
            source="text_layer",
            word_count=word_count,
        )
        if word_count >= self.ocr_min_words:
            return text_chunk

        logger.debug(
            "pdf: page %d sparse (%d words), falling back to OCR",
            page_number,
            word_count,
        )
        ocr_chunk = self._ocr_page(page, page_number)
        if ocr_chunk.word_count >= word_count:
            return ocr_chunk

        # OCR failed or recovered less than the text layer already had
        # (e.g. a short title page, or pytesseract is not installed).
        # Returning the OCR chunk here would throw real text away.
        logger.debug(
            "pdf: page %d OCR yielded %d words, keeping %d-word text layer",
            page_number,
            ocr_chunk.word_count,
            word_count,
        )
        return text_chunk

    def _ocr_page(self, page: object, page_number: int) -> PageChunk:
        """
        Render page to image and extract text via tesseract.

        Best-effort: any failure (OCR dependencies not importable, rendering
        error, tesseract error) is logged as a warning and reported as an
        empty chunk with source "ocr" instead of being raised.
        """
        try:
            # Imported lazily so the OCR stack stays optional.
            import io

            import pytesseract
            from PIL import Image

            rendered = page.to_image(resolution=200).original  # type: ignore[attr-defined]
            if not isinstance(rendered, Image.Image):
                # Not already a PIL image: treat it as encoded image bytes.
                rendered = Image.open(io.BytesIO(rendered))

            text = pytesseract.image_to_string(rendered)
            words = text.split()
            return PageChunk(
                page_number=page_number,
                text=text.strip(),
                source="ocr",
                word_count=len(words),
            )
        except Exception as exc:
            logger.warning("pdf: OCR failed for page %d: %s", page_number, exc)
            return PageChunk(page_number=page_number, text="", source="ocr", word_count=0)
from unittest.mock import MagicMock, patch

import pytest

from circuitforge_core.documents.pdf import PDFExtractor, PageChunk


def _mock_page(text: str) -> MagicMock:
    """Return a stand-in page whose ``extract_text()`` yields *text*."""
    page = MagicMock()
    page.extract_text.return_value = text
    return page


def _mock_pdf(pages: list[MagicMock]) -> MagicMock:
    """Return a stand-in PDF usable as a context manager exposing *pages*."""
    pdf = MagicMock()
    pdf.__enter__ = MagicMock(return_value=pdf)
    pdf.__exit__ = MagicMock(return_value=False)  # do not swallow exceptions
    pdf.pages = pages
    return pdf


def test_chunk_pages_single_text_layer_page():
    """A page with enough text-layer words is returned without OCR."""
    page = _mock_page("Fireball deals 8d6 fire damage on a failed Dexterity saving throw.")
    with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
        mock_pl.open.return_value = _mock_pdf([page])
        chunks = PDFExtractor().chunk_pages("/fake/book.pdf")
    assert len(chunks) == 1
    assert chunks[0].page_number == 1
    assert chunks[0].source == "text_layer"
    assert "Fireball" in chunks[0].text
    # The sentence above splits into exactly 11 whitespace-separated words.
    assert chunks[0].word_count == 11


def test_chunk_pages_numbers_from_one():
    """Page numbers are 1-indexed and follow document order."""
    pages = [_mock_page(f"Rule text for page {i} " * 10) for i in range(1, 4)]
    with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
        mock_pl.open.return_value = _mock_pdf(pages)
        chunks = PDFExtractor().chunk_pages("/fake/book.pdf")
    assert [c.page_number for c in chunks] == [1, 2, 3]


def test_page_chunk_is_frozen():
    """Assigning to a field of a frozen PageChunk is rejected."""
    from dataclasses import FrozenInstanceError

    chunk = PageChunk(page_number=1, text="hello", source="text_layer", word_count=1)
    # Expect the specific frozen-dataclass error: a bare ``Exception`` would
    # let any unrelated failure make this test pass.
    with pytest.raises(FrozenInstanceError):
        chunk.text = "modified"  # type: ignore[misc]