pagepiper/scripts/text_clean.py

# scripts/text_clean.py
"""
Shared text-cleaning utilities for ingest pipelines.

Removes boilerplate lines injected by ebook converters, piracy watermarks,
and other non-content artifacts before chunks are stored or embedded.
"""
from __future__ import annotations

import re

# Lines that match any of these patterns are dropped entirely.
# Each pattern is searched against the stripped line, so an unanchored
# pattern matches anywhere in the line. Every pattern that contains letters
# is compiled case-insensitively.
_LINE_DROP_PATTERNS: list[re.Pattern] = [
    # ABC Amber converter family
    re.compile(r'generated by abc amber', re.IGNORECASE),
    re.compile(r'processtext\.com', re.IGNORECASE),
    # Calibre / sigil metadata lines
    re.compile(r'calibre \d+\.\d+', re.IGNORECASE),
    # Standalone URLs (line is just a URL, no surrounding prose).
    # IGNORECASE so an upper-case scheme ("HTTP://...") is caught as well.
    re.compile(r'^https?://\S+$', re.IGNORECASE),
    # Common piracy / file-sharing watermarks
    re.compile(r'www\.\w+\.(com|net|org)/\S*book', re.IGNORECASE),
    re.compile(r'downloaded from', re.IGNORECASE),
    re.compile(r'scanned by', re.IGNORECASE),
    re.compile(r'provided by', re.IGNORECASE),
    # Page-number-only lines from PDF extraction (e.g. "- 42 -" or "42")
    re.compile(r'^\s*-?\s*\d{1,4}\s*-?\s*$'),
]

# Substrings removed from inside a line (the rest of the line is kept).
# All of them are compiled case-insensitively.
_INLINE_STRIP_PATTERNS: list[re.Pattern] = [
    re.compile(source, re.IGNORECASE)
    for source in (
        # ABC Amber converter banner followed by its URL
        r'generated by abc amber \w+ converter,?\s*https?://\S*',
        # Bare processtext.com links
        r'https?://www\.processtext\.com/\S*',
    )
]


def is_artifact_line(line: str) -> bool:
    """Report whether *line* matches any known conversion-artifact pattern.

    Surrounding whitespace is removed before matching; a pattern hit
    anywhere in the remaining text marks the line as an artifact.
    """
    candidate = line.strip()
    for pattern in _LINE_DROP_PATTERNS:
        if pattern.search(candidate):
            return True
    return False


def clean_line(line: str) -> str:
    """Remove every inline converter artifact from *line*.

    Each inline pattern is applied in turn, and the result is returned
    with leading and trailing whitespace stripped.
    """
    result = line
    for pattern in _INLINE_STRIP_PATTERNS:
        result = pattern.sub("", result)
    return result.strip()


def clean_paragraph(text: str) -> str:
    """Clean a multi-line paragraph: strip inline artifacts, drop artifact lines.

    Args:
        text: Paragraph text; it is split into lines with ``str.splitlines()``.

    Returns:
        The surviving cleaned lines joined by newlines, or an empty string
        when no line survives.
    """
    kept = []
    for line in text.splitlines():
        # Strip inline watermarks *before* the whole-line check. The inline
        # patterns are also line-drop patterns, so checking first would
        # discard real prose that merely has a watermark embedded in it.
        cleaned = clean_line(line)
        if cleaned and not is_artifact_line(cleaned):
            kept.append(cleaned)
    return "\n".join(kept)


def filter_paragraphs(paragraphs: list[str], *, min_words: int = 4) -> list[str]:
    """Clean a list of paragraph strings, dropping artifacts and short entries.

    Each paragraph is handled as one unit (it is not split into lines).

    Args:
        paragraphs: Paragraph strings to filter.
        min_words: Minimum number of whitespace-separated words a cleaned
            paragraph must contain to be kept. Defaults to 4.

    Returns:
        The cleaned paragraphs that survive, in their original order.
    """
    result = []
    for para in paragraphs:
        # Strip inline watermarks first so a paragraph of real prose with an
        # embedded watermark is kept (minus the watermark) rather than
        # discarded wholesale by the artifact check.
        cleaned = clean_line(para)
        if not cleaned or is_artifact_line(cleaned):
            continue
        if len(cleaned.split()) >= min_words:
            result.append(cleaned)
    return result