# scripts/text_clean.py
"""
Shared text-cleaning utilities for ingest pipelines.

Removes boilerplate lines injected by ebook converters, piracy watermarks,
and other non-content artifacts before chunks are stored or embedded.
"""

from __future__ import annotations

import re

# Lines that match any of these patterns are dropped entirely.
# Each pattern is matched against the stripped line (case-insensitive).
# NOTE: the phrase patterns are unanchored substring matches, so any line
# containing the phrase is dropped, including ordinary prose.
_LINE_DROP_PATTERNS: list[re.Pattern[str]] = [
    # ABC Amber converter family
    re.compile(r'generated by abc amber', re.IGNORECASE),
    re.compile(r'processtext\.com', re.IGNORECASE),
    # Calibre / sigil metadata lines
    re.compile(r'calibre \d+\.\d+', re.IGNORECASE),
    # Standalone URLs (line is just a URL, no surrounding prose).
    # IGNORECASE so an upper-case scheme ("HTTP://...") is caught as well,
    # matching the case-insensitive contract stated above.
    re.compile(r'^https?://\S+$', re.IGNORECASE),
    # Common piracy / file-sharing watermarks
    re.compile(r'www\.\w+\.(com|net|org)/\S*book', re.IGNORECASE),
    re.compile(r'downloaded from', re.IGNORECASE),
    re.compile(r'scanned by', re.IGNORECASE),
    re.compile(r'provided by', re.IGNORECASE),
    # Page-number-only lines from PDF extraction (e.g. "- 42 -" or "42")
    re.compile(r'^\s*-?\s*\d{1,4}\s*-?\s*$'),
]

# Inline substrings to strip from within a line before further processing.
_INLINE_STRIP_PATTERNS: list[re.Pattern[str]] = [
    re.compile(
        r'generated by abc amber \w+ converter,?\s*https?://\S*',
        re.IGNORECASE,
    ),
    re.compile(r'https?://www\.processtext\.com/\S*', re.IGNORECASE),
]


def is_artifact_line(line: str) -> bool:
    """Return True if the line is a known conversion artifact and should be dropped.

    The line is stripped of surrounding whitespace and then searched against
    every pattern in ``_LINE_DROP_PATTERNS``; a single match is enough.
    """
    stripped = line.strip()
    return any(p.search(stripped) for p in _LINE_DROP_PATTERNS)


def clean_line(line: str) -> str:
    """Strip inline converter artifacts from a line, returning the cleaned version.

    Every match of every pattern in ``_INLINE_STRIP_PATTERNS`` is removed, and
    the result is stripped of leading/trailing whitespace. Interior whitespace
    left behind by a removal is not collapsed.
    """
    for pattern in _INLINE_STRIP_PATTERNS:
        line = pattern.sub("", line)
    return line.strip()


def _clean_or_drop(text: str) -> str:
    """Return *text* with inline artifacts removed, or "" if it should be dropped.

    Inline stripping must run BEFORE the drop check: every inline pattern
    contains text that a drop pattern also matches, so checking first would
    discard real prose that merely carries a trailing watermark and the inline
    patterns would never take effect.
    """
    cleaned = clean_line(text)
    if not cleaned or is_artifact_line(cleaned):
        return ""
    return cleaned


def clean_paragraph(text: str) -> str:
    """Clean a multi-line paragraph: drop artifact lines, strip inline artifacts.

    Each line (as split by ``str.splitlines``) has inline artifacts removed
    first; the line is then discarded if nothing is left or if the remainder
    still matches a drop pattern. Surviving lines are joined with "\\n".

    Returns an empty string when no line survives.
    """
    survivors = (_clean_or_drop(line) for line in text.splitlines())
    return "\n".join(line for line in survivors if line)


def filter_paragraphs(paragraphs: list[str], *, min_words: int = 4) -> list[str]:
    """Remove artifact paragraphs from a list of paragraph strings.

    Each paragraph is treated as a single unit (it is not split into lines):
    inline artifacts are stripped, then the paragraph is discarded if the
    remainder is empty, still matches a drop pattern, or has fewer than
    *min_words* whitespace-separated words.

    Args:
        paragraphs: Paragraph strings to filter.
        min_words: Minimum word count a cleaned paragraph needs to be kept.
            Defaults to 4.

    Returns:
        The cleaned paragraphs that survived, in their original order.
    """
    result = []
    for para in paragraphs:
        cleaned = _clean_or_drop(para)
        if cleaned and len(cleaned.split()) >= min_words:
            result.append(cleaned)
    return result