feat(documents): add PDFExtractor text-layer extraction and PageChunk
Adds circuitforge_core/documents/pdf.py with:
- PageChunk frozen dataclass (page_number, text, source, word_count)
- PDFExtractor.chunk_pages() — pdfplumber text-layer extraction per page, with an OCR fallback via pytesseract for sparse pages
- Module-level graceful ImportError guard on pdfplumber (patchable; follows the cf-core optional-extra pattern)

Also declares the pdf and pdf-ocr optional extras in pyproject.toml. 3 tests, all passing.
This commit is contained in:
parent
3be21ce452
commit
bbb146b361
3 changed files with 166 additions and 0 deletions
113
circuitforge_core/documents/pdf.py
Normal file
113
circuitforge_core/documents/pdf.py
Normal file
|
|
@ -0,0 +1,113 @@
|
|||
# circuitforge_core/documents/pdf.py
|
||||
"""
|
||||
circuitforge_core.documents.pdf — PDF text extraction and page-level chunking.
|
||||
|
||||
Primary path: pdfplumber (selectable text layers).
|
||||
Fallback: pytesseract OCR (scanned / image-only pages).
|
||||
|
||||
Usage::
|
||||
|
||||
from circuitforge_core.documents.pdf import PDFExtractor
|
||||
|
||||
chunks = PDFExtractor().chunk_pages("/path/to/book.pdf")
|
||||
for chunk in chunks:
|
||||
print(f"[p.{chunk.page_number}] ({chunk.source}) {chunk.text[:80]}")
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
import pdfplumber
|
||||
except ImportError: # pragma: no cover
|
||||
pdfplumber = None # type: ignore[assignment]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PageChunk:
    """Immutable record of the text recovered from one page of a PDF."""

    # Position of the page within the document, counted from 1.
    page_number: int
    # Text recovered for the page.
    text: str
    # Which extraction route produced the text: "text_layer" | "ocr".
    source: str
    # Number of words in the recovered text.
    word_count: int
|
||||
|
||||
|
||||
class PDFExtractor:
    """
    Extract page-level text chunks from PDF files.

    Each page is read through pdfplumber's text layer first. A page whose
    text layer yields fewer than ``ocr_min_words`` words is rendered to an
    image and run through pytesseract. OCR is best-effort: when it fails, or
    recovers fewer words than the text layer already provided, the
    text-layer content is kept so short pages are not silently emptied.

    Args:
        ocr_min_words: Pages with fewer words from the text layer trigger OCR.
    """

    def __init__(self, ocr_min_words: int = 10) -> None:
        self.ocr_min_words = ocr_min_words

    def chunk_pages(self, pdf_path: str | Path) -> list[PageChunk]:
        """
        Primary entry point. Returns one PageChunk per page.

        Uses text-layer extraction per page; falls back to OCR when text is
        sparse. Empty PDFs return an empty list.

        Args:
            pdf_path: Location of the PDF to read.

        Returns:
            Chunks in page order, with ``page_number`` starting at 1.

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if pdfplumber is None:
            raise ImportError(
                "pdfplumber is required for PDF extraction. "
                "Install it with: pip install pdfplumber"
            )

        path = Path(pdf_path)
        chunks: list[PageChunk] = []

        with pdfplumber.open(path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                # extract_text() may return None; normalise to "".
                text = page.extract_text() or ""
                word_count = len(text.split())
                text_chunk = PageChunk(
                    page_number=page_number,
                    text=text.strip(),
                    source="text_layer",
                    word_count=word_count,
                )

                if word_count >= self.ocr_min_words:
                    chunks.append(text_chunk)
                    continue

                logger.debug(
                    "pdf: page %d sparse (%d words), falling back to OCR",
                    page_number,
                    word_count,
                )
                ocr_chunk = self._ocr_page(page, page_number)
                # Only let OCR replace the text layer when it recovered at
                # least as many words. A failed OCR run (word_count == 0)
                # must not erase real text-layer content from a short page.
                # Ties, including fully blank pages, still yield the OCR
                # chunk.
                if ocr_chunk.word_count >= word_count:
                    chunks.append(ocr_chunk)
                else:
                    chunks.append(text_chunk)

        return chunks

    def _ocr_page(self, page: object, page_number: int) -> PageChunk:
        """
        Render page to image and extract text via tesseract.

        Args:
            page: Page object providing ``to_image(resolution=...)``.
            page_number: 1-indexed page number recorded on the chunk.

        Returns:
            A chunk with ``source="ocr"``. Any failure (including a missing
            pytesseract / Pillow install) is logged as a warning and yields
            an empty chunk rather than raising.
        """
        try:
            # Imported inside the try so a missing OCR dependency degrades
            # to an empty chunk instead of propagating ImportError.
            import io

            import pytesseract
            from PIL import Image

            rendered = page.to_image(resolution=200).original  # type: ignore[attr-defined]
            if not isinstance(rendered, Image.Image):
                # Anything that is not already a PIL image is treated as
                # encoded image bytes.
                rendered = Image.open(io.BytesIO(rendered))

            text = pytesseract.image_to_string(rendered)
            words = text.split()
            return PageChunk(
                page_number=page_number,
                text=text.strip(),
                source="ocr",
                word_count=len(words),
            )
        except Exception as exc:  # deliberate best-effort boundary
            logger.warning("pdf: OCR failed for page %d: %s", page_number, exc)
            return PageChunk(page_number=page_number, text="", source="ocr", word_count=0)
|
||||
|
|
@ -107,6 +107,14 @@ gestures-mediapipe = [
|
|||
"opencv-python>=4.8",
|
||||
"numpy>=1.24",
|
||||
]
|
||||
pdf = [
|
||||
"pdfplumber>=0.11",
|
||||
]
|
||||
pdf-ocr = [
|
||||
"circuitforge-core[pdf]",
|
||||
"pytesseract>=0.3",
|
||||
"Pillow>=10.0",
|
||||
]
|
||||
dev = [
|
||||
"circuitforge-core[manage]",
|
||||
"pytest>=8.0",
|
||||
|
|
|
|||
45
tests/test_documents/test_pdf.py
Normal file
45
tests/test_documents/test_pdf.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
# tests/test_documents/test_pdf.py
|
||||
from __future__ import annotations
|
||||
from unittest.mock import MagicMock, patch
|
||||
import pytest
|
||||
from circuitforge_core.documents.pdf import PDFExtractor, PageChunk
|
||||
|
||||
|
||||
def _mock_page(text: str) -> MagicMock:
|
||||
page = MagicMock()
|
||||
page.extract_text.return_value = text
|
||||
return page
|
||||
|
||||
|
||||
def _mock_pdf(pages: list[MagicMock]) -> MagicMock:
|
||||
pdf = MagicMock()
|
||||
pdf.__enter__ = MagicMock(return_value=pdf)
|
||||
pdf.__exit__ = MagicMock(return_value=False)
|
||||
pdf.pages = pages
|
||||
return pdf
|
||||
|
||||
|
||||
def test_chunk_pages_single_text_layer_page():
    """A single page with enough text-layer words yields one text_layer chunk."""
    sentence = "Fireball deals 8d6 fire damage on a failed Dexterity saving throw."
    with patch("circuitforge_core.documents.pdf.pdfplumber") as fake_plumber:
        fake_plumber.open.return_value = _mock_pdf([_mock_page(sentence)])
        result = PDFExtractor().chunk_pages("/fake/book.pdf")

    assert len(result) == 1
    only_chunk = result[0]
    assert only_chunk.page_number == 1
    assert only_chunk.source == "text_layer"
    assert "Fireball" in only_chunk.text
    assert only_chunk.word_count >= 10
|
||||
|
||||
|
||||
def test_chunk_pages_numbers_from_one():
    """Chunks for a three-page document are numbered 1, 2, 3 in order."""
    fake_pages = []
    for i in range(1, 4):
        fake_pages.append(_mock_page(f"Rule text for page {i} " * 10))

    with patch("circuitforge_core.documents.pdf.pdfplumber") as fake_plumber:
        fake_plumber.open.return_value = _mock_pdf(fake_pages)
        result = PDFExtractor().chunk_pages("/fake/book.pdf")

    numbers = [chunk.page_number for chunk in result]
    assert numbers == [1, 2, 3]
|
||||
|
||||
|
||||
def test_page_chunk_is_frozen():
    """Assigning to a PageChunk field raises FrozenInstanceError."""
    # Imported locally so this test does not depend on the module's import
    # block.
    from dataclasses import FrozenInstanceError

    chunk = PageChunk(page_number=1, text="hello", source="text_layer", word_count=1)
    # Pin the exact error: a bare ``Exception`` would also accept unrelated
    # failures and let a regression in immutability go unnoticed.
    with pytest.raises(FrozenInstanceError):
        chunk.text = "modified"  # type: ignore[misc]
|
||||
Loading…
Reference in a new issue