Retrieval:
- Add _fetch_adjacent() to retriever: fetches page ± 1 chunks from DB
after ranking so mid-sentence EPUB chunk boundaries don't lose context
- Fix vec DB doc-filter: oversample to top_k*20 before Python filter
instead of post-filtering an already-small global pool (fixes wrong-book
results when searching within a single document)
- top_k default 5 → 10; context per chunk 500 → 1500 chars; citation
snippet 200 → 400 chars
Artifact cleaning:
- Add scripts/text_clean.py: strips ABC Amber LIT Converter watermarks,
processtext.com URLs, bare page numbers, piracy stamps from extracted text
- Wire clean_paragraph() into ingest_pdf.py and new ingest_epub.py
Startup validation:
- _check_vec_schema() at boot: detects embedding dimension mismatch,
deletes stale vec DB, and queues sequential re-embed in background thread
- Sequential _reembed_docs() prevents SQLite lock races on startup re-embed
cf-orch integration:
- Wire CF_ORCH_URL / CF_LICENSE_KEY into LLMRouter backend config so
allocate() fires and keeps the Ollama model warm between requests
Ingestion progress UI:
- GET /api/library/{doc_id}/status now returns vec_count from page_vecs_meta
- DocumentCard.vue polls status every 3 s while processing and shows
two-phase progress: indeterminate animation during extraction,
determinate "Embedding N/M pages" bar once vectors start landing
Other:
- Chat feedback endpoint + thumbs up/down UI (FeedbackButton.vue)
- EPUB ingest script (ingest_epub.py) with heading-based chunking
- migration 002: chat_feedback table
- README.md with setup and feature overview
108 lines
3.8 KiB
Python
108 lines
3.8 KiB
Python
# tests/test_text_clean.py
|
|
"""Tests for ebook artifact filtering in scripts/text_clean.py."""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from scripts.text_clean import (
|
|
clean_line,
|
|
clean_paragraph,
|
|
filter_paragraphs,
|
|
is_artifact_line,
|
|
)
|
|
|
|
|
|
class TestIsArtifactLine:
    """Checks which single lines ``is_artifact_line`` flags as artifacts."""

    def test_abc_amber_lit(self):
        """A LIT-converter watermark followed by its URL is flagged."""
        watermark = "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html"
        assert is_artifact_line(watermark)

    def test_abc_amber_rtf(self):
        """The RTF-converter variant of the watermark is flagged too."""
        watermark = "Generated by ABC Amber RTF Converter"
        assert is_artifact_line(watermark)

    def test_processtext_url_only(self):
        """A line holding only the processtext.com URL is flagged."""
        url_line = "http://www.processtext.com/abclit.html"
        assert is_artifact_line(url_line)

    def test_standalone_url(self):
        """A line that consists of nothing but a URL is flagged."""
        url_line = "https://www.example.com/book"
        assert is_artifact_line(url_line)

    def test_page_number_only(self):
        """Bare, dash-decorated, and space-padded page numbers are flagged."""
        # Checked in the same order as before; the first failure stops the test.
        for candidate in ("42", "- 42 -", " 7 "):
            assert is_artifact_line(candidate)

    def test_downloaded_from(self):
        """A "Downloaded from <site>" stamp is flagged."""
        stamp = "Downloaded from www.fictionsite.net"
        assert is_artifact_line(stamp)

    def test_scanned_by(self):
        """A "Scanned by <user>" stamp is flagged."""
        stamp = "Scanned by SomeUser"
        assert is_artifact_line(stamp)

    def test_normal_prose_not_artifact(self):
        """An ordinary line of quoted dialogue is not flagged."""
        dialogue = '"And what if food isn\'t the only reason Jagang is going to Anderith?"'
        flagged = is_artifact_line(dialogue)
        assert not flagged

    def test_url_embedded_in_prose_not_dropped(self):
        """A sentence that merely contains a URL is not flagged."""
        # Only a line that is *just* a URL counts as a standalone-URL artifact.
        sentence = "You can read more about this at https://example.com and continue."
        flagged = is_artifact_line(sentence)
        assert not flagged

    def test_short_page_header_not_dropped(self):
        """A short heading that includes a digit is not flagged."""
        # "Chapter 1" carries a word alongside the digit, so it is not a
        # bare page number and must survive.
        heading = "Chapter 1"
        flagged = is_artifact_line(heading)
        assert not flagged
|
|
|
|
|
|
class TestCleanLine:
    """Checks how ``clean_line`` treats a single line of text."""

    def test_strips_inline_abc_amber(self):
        """A watermark appended to real prose is removed; the prose stays."""
        raw = "Some prose. Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html"
        cleaned = clean_line(raw)
        # Neither the converter name nor its URL may survive ...
        for marker in ("ABC Amber", "processtext"):
            assert marker not in cleaned
        # ... while the genuine text in front of them must.
        assert "Some prose." in cleaned

    def test_passes_clean_line_unchanged(self):
        """A line with no artifacts comes back exactly as given."""
        sentence = "He cocked an eyebrow and smiled."
        cleaned = clean_line(sentence)
        assert cleaned == sentence
|
|
|
|
|
|
class TestCleanParagraph:
    """Checks how ``clean_paragraph`` treats newline-separated text."""

    def test_drops_artifact_lines_from_paragraph(self):
        """The watermark line disappears; both prose lines remain."""
        paragraph = (
            "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html\n"
            '"And what if food isn\'t the only reason Jagang is going to Anderith?"\n'
            "He cocked an eyebrow."
        )
        cleaned = clean_paragraph(paragraph)
        assert "ABC Amber" not in cleaned
        # One distinctive word from each surviving prose line.
        for keyword in ("Jagang", "eyebrow"):
            assert keyword in cleaned

    def test_all_artifact_paragraph_returns_empty(self):
        """A paragraph made only of artifact lines becomes the empty string."""
        paragraph = "Generated by ABC Amber LIT Converter\nhttp://www.processtext.com/abclit.html"
        cleaned = clean_paragraph(paragraph)
        assert cleaned == ""

    def test_clean_paragraph_unchanged(self):
        """A two-line paragraph with no artifacts is returned unmodified."""
        paragraph = "Richard raised his sword.\nThe magic surged through him."
        cleaned = clean_paragraph(paragraph)
        assert cleaned == paragraph
|
|
|
|
|
|
class TestFilterParagraphs:
    """Checks which paragraphs ``filter_paragraphs`` keeps from a list."""

    def test_drops_artifact_paragraphs(self):
        """Of three paragraphs, only the watermark one is removed."""
        candidates = [
            "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html",
            '"And what if food isn\'t the only reason Jagang is going to Anderith?"',
            "He cocked an eyebrow at the question.",
        ]
        kept = filter_paragraphs(candidates)
        assert len(kept) == 2
        assert not any("ABC Amber" in paragraph for paragraph in kept)

    def test_drops_short_lines_under_4_words(self):
        """One- and two-word paragraphs are dropped; the full sentence stays."""
        candidates = ["Hi", "OK sure", "Valid sentence with enough words here."]
        kept = filter_paragraphs(candidates)
        expected = ["Valid sentence with enough words here."]
        assert kept == expected

    def test_empty_input(self):
        """An empty list in gives an empty list out."""
        kept = filter_paragraphs([])
        assert kept == []
|