feat(documents): add PDFExtractor text-layer extraction and PageChunk
Adds circuitforge_core/documents/pdf.py with:
- PageChunk frozen dataclass (page_number, text, source, word_count).
- PDFExtractor.chunk_pages() — pdfplumber text-layer extraction per page, with an OCR fallback via pytesseract for sparse pages.
- Module-level graceful ImportError guard on pdfplumber (patchable; follows the cf-core optional-extra pattern).
Also declares the pdf and pdf-ocr optional extras in pyproject.toml. 3 tests, all passing.
This commit is contained in:
parent
3be21ce452
commit
bbb146b361
3 changed files with 166 additions and 0 deletions
113
circuitforge_core/documents/pdf.py
Normal file
113
circuitforge_core/documents/pdf.py
Normal file
|
|
@ -0,0 +1,113 @@
|
||||||
|
# circuitforge_core/documents/pdf.py
|
||||||
|
"""
|
||||||
|
circuitforge_core.documents.pdf — PDF text extraction and page-level chunking.
|
||||||
|
|
||||||
|
Primary path: pdfplumber (selectable text layers).
|
||||||
|
Fallback: pytesseract OCR (scanned / image-only pages).
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
|
from circuitforge_core.documents.pdf import PDFExtractor
|
||||||
|
|
||||||
|
chunks = PDFExtractor().chunk_pages("/path/to/book.pdf")
|
||||||
|
for chunk in chunks:
|
||||||
|
print(f"[p.{chunk.page_number}] ({chunk.source}) {chunk.text[:80]}")
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pdfplumber
|
||||||
|
except ImportError: # pragma: no cover
|
||||||
|
pdfplumber = None # type: ignore[assignment]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class PageChunk:
    """
    Immutable record of the text recovered from one page of a PDF.

    Attributes:
        page_number: Position of the page within the document, counted from 1.
        text: The text extracted for the page.
        source: Which extraction path produced ``text`` — "text_layer" or "ocr".
        word_count: Number of words counted in the extracted text.
    """

    page_number: int
    text: str
    source: str
    word_count: int
|
||||||
|
|
||||||
|
|
||||||
|
class PDFExtractor:
    """
    Extract page-level text chunks from PDF files.

    Every page is first read through its text layer. A page whose text layer
    yields fewer than ``ocr_min_words`` words is additionally run through OCR.
    The OCR result is used unless it recovers fewer words than the text layer
    did, in which case the sparse text-layer text is kept so that text which
    was already extracted is never thrown away (e.g. when OCR is unavailable).

    Args:
        ocr_min_words: Pages with fewer words from the text layer trigger OCR.
    """

    def __init__(self, ocr_min_words: int = 10) -> None:
        self.ocr_min_words = ocr_min_words

    def chunk_pages(self, pdf_path: str | Path) -> list[PageChunk]:
        """
        Primary entry point. Returns one PageChunk per page.

        Uses text-layer extraction per page; falls back to OCR when text is
        sparse. If OCR recovers fewer words than the sparse text layer, the
        text-layer chunk is returned for that page instead.
        Empty PDFs return an empty list.

        Args:
            pdf_path: Location of the PDF file to read.

        Returns:
            Chunks in page order, with ``page_number`` starting at 1.

        Raises:
            ImportError: If pdfplumber is not installed.
        """
        if pdfplumber is None:
            raise ImportError(
                "pdfplumber is required for PDF extraction. "
                "Install it with: pip install pdfplumber"
            )

        path = Path(pdf_path)
        chunks: list[PageChunk] = []

        with pdfplumber.open(path) as pdf:
            for i, page in enumerate(pdf.pages, start=1):
                # extract_text() may return None; normalise to "".
                text = page.extract_text() or ""
                words = text.split()
                text_chunk = PageChunk(
                    page_number=i,
                    text=text.strip(),
                    source="text_layer",
                    word_count=len(words),
                )

                if len(words) >= self.ocr_min_words:
                    chunks.append(text_chunk)
                    continue

                logger.debug(
                    "pdf: page %d sparse (%d words), falling back to OCR",
                    i,
                    len(words),
                )
                ocr_chunk = self._ocr_page(page, i)

                # _ocr_page returns an empty chunk on failure. Only accept the
                # OCR result when it is at least as rich as the text layer, so
                # a short but legitimate page (title, caption) keeps its text.
                if ocr_chunk.word_count >= len(words):
                    chunks.append(ocr_chunk)
                else:
                    logger.debug(
                        "pdf: page %d OCR recovered %d words, keeping text layer (%d words)",
                        i,
                        ocr_chunk.word_count,
                        len(words),
                    )
                    chunks.append(text_chunk)

        return chunks

    def _ocr_page(self, page: object, page_number: int) -> PageChunk:
        """
        Render page to image and extract text via tesseract.

        Best-effort: any failure (including pytesseract or Pillow not being
        installed) is logged as a warning and an empty "ocr" chunk is returned
        rather than raising.

        Args:
            page: The page object to render; must provide ``to_image()``.
            page_number: 1-indexed page number recorded on the returned chunk.

        Returns:
            A PageChunk with ``source="ocr"``; empty text and zero words on
            failure.
        """
        try:
            import io

            import pytesseract
            from PIL import Image

            rendered = page.to_image(resolution=200).original  # type: ignore[attr-defined]
            # Accept either a ready PIL image or raw encoded image bytes.
            if not isinstance(rendered, Image.Image):
                rendered = Image.open(io.BytesIO(rendered))

            text = pytesseract.image_to_string(rendered)
            words = text.split()
            return PageChunk(
                page_number=page_number,
                text=text.strip(),
                source="ocr",
                word_count=len(words),
            )
        except Exception as exc:
            # Deliberately broad: OCR is optional and must not abort extraction.
            logger.warning("pdf: OCR failed for page %d: %s", page_number, exc)
            return PageChunk(page_number=page_number, text="", source="ocr", word_count=0)
|
||||||
|
|
@ -107,6 +107,14 @@ gestures-mediapipe = [
|
||||||
"opencv-python>=4.8",
|
"opencv-python>=4.8",
|
||||||
"numpy>=1.24",
|
"numpy>=1.24",
|
||||||
]
|
]
|
||||||
|
pdf = [
|
||||||
|
"pdfplumber>=0.11",
|
||||||
|
]
|
||||||
|
pdf-ocr = [
|
||||||
|
"circuitforge-core[pdf]",
|
||||||
|
"pytesseract>=0.3",
|
||||||
|
"Pillow>=10.0",
|
||||||
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"circuitforge-core[manage]",
|
"circuitforge-core[manage]",
|
||||||
"pytest>=8.0",
|
"pytest>=8.0",
|
||||||
|
|
|
||||||
45
tests/test_documents/test_pdf.py
Normal file
45
tests/test_documents/test_pdf.py
Normal file
|
|
@ -0,0 +1,45 @@
|
||||||
|
# tests/test_documents/test_pdf.py
|
||||||
|
from __future__ import annotations
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
import pytest
|
||||||
|
from circuitforge_core.documents.pdf import PDFExtractor, PageChunk
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_page(text: str) -> MagicMock:
|
||||||
|
page = MagicMock()
|
||||||
|
page.extract_text.return_value = text
|
||||||
|
return page
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_pdf(pages: list[MagicMock]) -> MagicMock:
|
||||||
|
pdf = MagicMock()
|
||||||
|
pdf.__enter__ = MagicMock(return_value=pdf)
|
||||||
|
pdf.__exit__ = MagicMock(return_value=False)
|
||||||
|
pdf.pages = pages
|
||||||
|
return pdf
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_pages_single_text_layer_page():
    """A single page with enough text-layer words yields one "text_layer" chunk."""
    sentence = "Fireball deals 8d6 fire damage on a failed Dexterity saving throw."
    fake_pdf = _mock_pdf([_mock_page(sentence)])

    with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
        mock_pl.open.return_value = fake_pdf
        chunks = PDFExtractor().chunk_pages("/fake/book.pdf")

    assert len(chunks) == 1
    first = chunks[0]
    assert first.page_number == 1
    assert first.source == "text_layer"
    assert "Fireball" in first.text
    assert first.word_count >= 10
|
||||||
|
|
||||||
|
|
||||||
|
def test_chunk_pages_numbers_from_one():
    """Chunks from a three-page PDF are numbered 1, 2, 3 in order."""
    pages = []
    for n in (1, 2, 3):
        # Repeat the phrase so every page clears the default OCR threshold.
        pages.append(_mock_page(f"Rule text for page {n} " * 10))

    with patch("circuitforge_core.documents.pdf.pdfplumber") as mock_pl:
        mock_pl.open.return_value = _mock_pdf(pages)
        chunks = PDFExtractor().chunk_pages("/fake/book.pdf")

    numbers = [chunk.page_number for chunk in chunks]
    assert numbers == [1, 2, 3]
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_chunk_is_frozen():
    """Assigning to a PageChunk field must raise FrozenInstanceError."""
    # Local import keeps this test self-contained within the file.
    from dataclasses import FrozenInstanceError

    chunk = PageChunk(page_number=1, text="hello", source="text_layer", word_count=1)
    # Expect the specific frozen-dataclass error, not just any exception,
    # so an unrelated failure cannot make this test pass.
    with pytest.raises(FrozenInstanceError):
        chunk.text = "modified"  # type: ignore[misc]
    # The rejected assignment must leave the original value intact.
    assert chunk.text == "hello"
|
||||||
Loading…
Reference in a new issue