Retrieval:
- Add _fetch_adjacent() to retriever: fetches page ± 1 chunks from DB
after ranking so mid-sentence EPUB chunk boundaries don't lose context
- Fix vec DB doc-filter: oversample to top_k*20 before Python filter
instead of post-filtering an already-small global pool (fixes wrong-book
results when searching within a single document)
- top_k default 5 → 10; context per chunk 500 → 1500 chars; citation
snippet 200 → 400 chars
Artifact cleaning:
- Add scripts/text_clean.py: strips ABC Amber LIT Converter watermarks,
processtext.com URLs, bare page numbers, piracy stamps from extracted text
- Wire clean_paragraph() into ingest_pdf.py and new ingest_epub.py
Startup validation:
- _check_vec_schema() at boot: detects embedding dimension mismatch,
deletes stale vec DB, and queues sequential re-embed in background thread
- Sequential _reembed_docs() prevents SQLite lock races on startup re-embed
cf-orch integration:
- Wire CF_ORCH_URL / CF_LICENSE_KEY into LLMRouter backend config so
allocate() fires and keeps the Ollama model warm between requests
Ingestion progress UI:
- GET /api/library/{doc_id}/status now returns vec_count from page_vecs_meta
- DocumentCard.vue polls status every 3 s while processing and shows
two-phase progress: indeterminate animation during extraction,
determinate "Embedding N/M pages" bar once vectors start landing
Other:
- Chat feedback endpoint + thumbs up/down UI (FeedbackButton.vue)
- EPUB ingest script (ingest_epub.py) with heading-based chunking
- migration 002: chat_feedback table
- README.md with setup and feature overview
72 lines
2.5 KiB
Python
72 lines
2.5 KiB
Python
# scripts/text_clean.py
|
|
"""
|
|
Shared text-cleaning utilities for ingest pipelines.
|
|
|
|
Removes boilerplate lines injected by ebook converters, piracy watermarks,
|
|
and other non-content artifacts before chunks are stored or embedded.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# Lines that match any of these patterns are dropped entirely.
# Each pattern is searched for in the stripped line, case-insensitively.
# Patterns without ^...$ anchors match anywhere in the line, so the whole
# line is dropped even when the phrase is surrounded by other text.
_LINE_DROP_PATTERNS: list[re.Pattern] = [
    # ABC Amber converter family
    re.compile(r'generated by abc amber', re.IGNORECASE),
    re.compile(r'processtext\.com', re.IGNORECASE),
    # Calibre / sigil metadata lines
    re.compile(r'calibre \d+\.\d+', re.IGNORECASE),
    # Standalone URLs (line is just a URL, no surrounding prose).
    # IGNORECASE so an upper-case scheme such as "HTTP://" is caught too.
    re.compile(r'^https?://\S+$', re.IGNORECASE),
    # Common piracy / file-sharing watermarks
    re.compile(r'www\.\w+\.(com|net|org)/\S*book', re.IGNORECASE),
    # NOTE(review): the next three are unanchored phrases, so ordinary prose
    # containing them (e.g. "... was provided by ...") is dropped as well —
    # confirm that trade-off is acceptable for the ingested corpus.
    re.compile(r'downloaded from', re.IGNORECASE),
    re.compile(r'scanned by', re.IGNORECASE),
    re.compile(r'provided by', re.IGNORECASE),
    # Page-number-only lines from PDF extraction (e.g. "- 42 -" or "42")
    re.compile(r'^\s*-?\s*\d{1,4}\s*-?\s*$'),
]
|
|
|
|
# Converter watermark fragments that can sit inside an otherwise useful line.
# Every pattern is compiled case-insensitively; matches are meant to be
# removed from the line rather than causing the whole line to be dropped.
_INLINE_STRIP_PATTERNS: list[re.Pattern] = [
    re.compile(source, re.IGNORECASE)
    for source in (
        r'generated by abc amber \w+ converter,?\s*https?://\S*',
        r'https?://www\.processtext\.com/\S*',
    )
]
|
|
|
|
|
|
def is_artifact_line(line: str) -> bool:
    """Report whether *line* matches any of the line-drop patterns.

    Leading and trailing whitespace is removed before matching, and each
    pattern is applied with ``search`` so unanchored patterns match anywhere
    in the line.
    """
    candidate = line.strip()
    for pattern in _LINE_DROP_PATTERNS:
        if pattern.search(candidate) is not None:
            return True
    return False
|
|
|
|
|
|
def clean_line(line: str) -> str:
    """Remove inline converter artifacts from *line*.

    Every inline-strip pattern is applied in list order, each match being
    replaced with the empty string. The result is returned with surrounding
    whitespace trimmed.
    """
    remaining = line
    for pattern in _INLINE_STRIP_PATTERNS:
        remaining = pattern.sub("", remaining)
    return remaining.strip()
|
|
|
|
|
|
def clean_paragraph(text: str) -> str:
    """Clean a multi-line paragraph: strip inline artifacts, drop artifact lines.

    The text is split with ``str.splitlines()``. For each line, inline
    converter artifacts are stripped first and the drop check is then run on
    what remains. Lines that end up empty or that still match a drop pattern
    are discarded; the survivors are joined with ``"\\n"``.

    Returns the cleaned text, or an empty string when no line survives.
    """
    kept: list[str] = []
    for raw_line in text.splitlines():
        # Strip inline watermarks BEFORE the drop check: every inline pattern
        # also matches a drop pattern, so checking first would throw away
        # genuine prose that merely shares a line with a watermark.
        cleaned = clean_line(raw_line)
        if not cleaned or is_artifact_line(cleaned):
            continue
        kept.append(cleaned)
    return "\n".join(kept)
|
|
|
|
|
|
def filter_paragraphs(paragraphs: list[str], *, min_words: int = 4) -> list[str]:
    """Return the paragraphs that survive artifact cleaning.

    Each paragraph has inline converter artifacts stripped first. It is then
    discarded if the result is empty, still matches a line-drop pattern, or
    has fewer than *min_words* whitespace-separated words.

    Note that the drop check is applied to the paragraph string as a whole,
    so an unanchored drop phrase anywhere in it removes the entire paragraph.

    Args:
        paragraphs: Paragraph strings to filter.
        min_words: Minimum word count a cleaned paragraph needs to be kept.
            Defaults to 4.

    Returns:
        The cleaned paragraphs, in their original order.
    """
    kept: list[str] = []
    for para in paragraphs:
        # Strip inline watermarks BEFORE the drop check: every inline pattern
        # also matches a drop pattern, so checking first would discard real
        # content that merely carries an embedded watermark.
        cleaned = clean_line(para)
        if not cleaned or is_artifact_line(cleaned):
            continue
        if len(cleaned.split()) >= min_words:
            kept.append(cleaned)
    return kept
|