feat(documents): add PDFExtractor text-layer extraction and PageChunk

Adds circuitforge_core/documents/pdf.py with:
- PageChunk frozen dataclass (page_number, text, source, word_count)
- PDFExtractor.chunk_pages() — pdfplumber text-layer per page, OCR fallback via pytesseract for sparse pages
- Module-level graceful ImportError guard on pdfplumber (patchable, follows cf-core optional-extra pattern)
- pdf and pdf-ocr optional extras declared in pyproject.toml

3 tests, all passing.
This commit is contained in:
pyr0ball 2026-05-04 08:33:10 -07:00
parent 3be21ce452
commit bbb146b361
3 changed files with 166 additions and 0 deletions

View file

@ -0,0 +1,113 @@
# circuitforge_core/documents/pdf.py
"""
circuitforge_core.documents.pdf — PDF text extraction and page-level chunking.
Primary path: pdfplumber (selectable text layers).
Fallback: pytesseract OCR (scanned / image-only pages).
Usage::

    from circuitforge_core.documents.pdf import PDFExtractor

    chunks = PDFExtractor().chunk_pages("/path/to/book.pdf")
    for chunk in chunks:
        print(f"[p.{chunk.page_number}] ({chunk.source}) {chunk.text[:80]}")
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
from pathlib import Path
logger = logging.getLogger(__name__)
try:
import pdfplumber
except ImportError: # pragma: no cover
pdfplumber = None # type: ignore[assignment]
@dataclass(frozen=True)
class PageChunk:
    """
    Immutable record of the text recovered from one page of a PDF.

    Attributes:
        page_number: Position of the page in the document, counted from 1.
        text: The text extracted for the page.
        source: How the text was obtained: "text_layer" or "ocr".
        word_count: Number of words counted in the extracted text.
    """

    page_number: int
    text: str
    source: str
    word_count: int
class PDFExtractor:
    """
    Extract page-level text chunks from PDF files.

    Each page is read through pdfplumber's text layer first. Pages whose text
    layer yields fewer than ``ocr_min_words`` words are additionally run
    through OCR, and whichever of the two results contains more words is kept
    (ties go to OCR).

    Args:
        ocr_min_words: Pages with fewer words from the text layer trigger OCR.
    """

    def __init__(self, ocr_min_words: int = 10) -> None:
        self.ocr_min_words = ocr_min_words

    def chunk_pages(self, pdf_path: str | Path) -> list[PageChunk]:
        """
        Primary entry point. Returns one PageChunk per page.

        Uses text-layer extraction per page; falls back to OCR when text is
        sparse. If OCR fails or recovers fewer words than the sparse text
        layer already provided, the text-layer content is kept instead of
        being discarded.
        Empty PDFs return an empty list.

        Args:
            pdf_path: Location of the PDF file to read.

        Returns:
            One PageChunk per page, in page order, numbered from 1.

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if pdfplumber is None:
            raise ImportError(
                "pdfplumber is required for PDF extraction. "
                "Install it with: pip install pdfplumber"
            )
        path = Path(pdf_path)
        chunks: list[PageChunk] = []
        with pdfplumber.open(path) as pdf:
            for i, page in enumerate(pdf.pages, start=1):
                # extract_text() may return None; normalise to "".
                text = page.extract_text() or ""
                words = text.split()
                text_chunk = PageChunk(
                    page_number=i,
                    text=text.strip(),
                    source="text_layer",
                    word_count=len(words),
                )
                if len(words) >= self.ocr_min_words:
                    chunks.append(text_chunk)
                    continue

                logger.debug(
                    "pdf: page %d sparse (%d words), falling back to OCR",
                    i,
                    len(words),
                )
                ocr_chunk = self._ocr_page(page, i)
                # _ocr_page returns an empty chunk on failure (e.g. the OCR
                # extra is not installed). Never let that throw away words the
                # text layer already gave us.
                if ocr_chunk.word_count < text_chunk.word_count:
                    logger.debug(
                        "pdf: page %d OCR yielded %d words, keeping text layer (%d words)",
                        i,
                        ocr_chunk.word_count,
                        text_chunk.word_count,
                    )
                    chunks.append(text_chunk)
                else:
                    chunks.append(ocr_chunk)
        return chunks

    def _ocr_page(self, page: object, page_number: int) -> PageChunk:
        """
        Render page to image and extract text via tesseract.

        Best-effort: any failure (missing pytesseract/Pillow, rendering or
        OCR errors) is logged as a warning and reported as an empty chunk
        with source "ocr" rather than raised.

        Args:
            page: Page object exposing ``to_image(resolution=...)``.
            page_number: 1-indexed page number recorded on the result.

        Returns:
            A PageChunk with source "ocr"; empty text and zero words on failure.
        """
        try:
            # Imported lazily so the OCR dependencies stay optional.
            import io

            import pytesseract
            from PIL import Image

            rendered = page.to_image(resolution=200).original  # type: ignore[attr-defined]
            # Anything that is not already a PIL image is treated as encoded
            # image bytes and decoded before OCR.
            if not isinstance(rendered, Image.Image):
                rendered = Image.open(io.BytesIO(rendered))
            text = pytesseract.image_to_string(rendered)
            words = text.split()
            return PageChunk(
                page_number=page_number,
                text=text.strip(),
                source="ocr",
                word_count=len(words),
            )
        except Exception as exc:
            # Deliberately broad: one unreadable page must not abort the document.
            logger.warning("pdf: OCR failed for page %d: %s", page_number, exc)
            return PageChunk(page_number=page_number, text="", source="ocr", word_count=0)

View file

@ -107,6 +107,14 @@ gestures-mediapipe = [
"opencv-python>=4.8",
"numpy>=1.24",
]
pdf = [
"pdfplumber>=0.11",
]
pdf-ocr = [
"circuitforge-core[pdf]",
"pytesseract>=0.3",
"Pillow>=10.0",
]
dev = [
"circuitforge-core[manage]",
"pytest>=8.0",

View file

@ -0,0 +1,45 @@
# tests/test_documents/test_pdf.py
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from circuitforge_core.documents.pdf import PDFExtractor, PageChunk
def _mock_page(text: str) -> MagicMock:
    """Build a stand-in page whose ``extract_text()`` returns *text*."""
    return MagicMock(**{"extract_text.return_value": text})
def _mock_pdf(pages: list[MagicMock]) -> MagicMock:
    """Build a stand-in PDF handle usable in a ``with`` block, exposing *pages*."""
    handle = MagicMock()
    handle.pages = pages
    # Entering the context hands back the handle itself; exiting never
    # suppresses exceptions.
    handle.__enter__.return_value = handle
    handle.__exit__.return_value = False
    return handle
def test_chunk_pages_single_text_layer_page():
    """A page with enough selectable text yields a single text-layer chunk."""
    sentence = "Fireball deals 8d6 fire damage on a failed Dexterity saving throw."
    fake_pdf = _mock_pdf([_mock_page(sentence)])
    with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
        mock_pl.open.return_value = fake_pdf
        chunks = PDFExtractor().chunk_pages("/fake/book.pdf")

    assert len(chunks) == 1
    first = chunks[0]
    assert first.page_number == 1
    assert first.source == "text_layer"
    assert "Fireball" in first.text
    assert first.word_count >= 10
def test_chunk_pages_numbers_from_one():
    """Chunks are numbered 1, 2, 3 in page order."""
    pages = []
    for n in range(1, 4):
        pages.append(_mock_page(f"Rule text for page {n} " * 10))

    with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
        mock_pl.open.return_value = _mock_pdf(pages)
        chunks = PDFExtractor().chunk_pages("/fake/book.pdf")

    numbers = [c.page_number for c in chunks]
    assert numbers == [1, 2, 3]
def test_page_chunk_is_frozen():
    """Assigning to a field of a PageChunk must fail and leave it unchanged."""
    # Local import keeps this test self-contained.
    from dataclasses import FrozenInstanceError

    chunk = PageChunk(page_number=1, text="hello", source="text_layer", word_count=1)
    # Expect the specific frozen-dataclass error; a bare Exception would also
    # pass on unrelated failures.
    with pytest.raises(FrozenInstanceError):
        chunk.text = "modified"  # type: ignore[misc]
    assert chunk.text == "hello"