Retrieval:
- Add _fetch_adjacent() to retriever: after ranking, fetches the chunks for
page ± 1 from the DB so EPUB chunks that break mid-sentence keep their context
- Fix vec DB doc-filter: oversample to top_k*20 before Python filter
instead of post-filtering an already-small global pool (fixes wrong-book
results when searching within a single document)
- top_k default 5 → 10; context per chunk 500 → 1500 chars; citation
snippet 200 → 400 chars
Artifact cleaning:
- Add scripts/text_clean.py: strips ABC Amber LIT Converter watermarks,
processtext.com URLs, bare page numbers, piracy stamps from extracted text
- Wire clean_paragraph() into ingest_pdf.py and new ingest_epub.py
Startup validation:
- _check_vec_schema() at boot: detects embedding dimension mismatch,
deletes stale vec DB, and queues sequential re-embed in background thread
- Sequential _reembed_docs() prevents SQLite lock races on startup re-embed
cf-orch integration:
- Wire CF_ORCH_URL / CF_LICENSE_KEY into LLMRouter backend config so
allocate() fires and keeps the Ollama model warm between requests
Ingestion progress UI:
- GET /api/library/{doc_id}/status now returns vec_count from page_vecs_meta
- DocumentCard.vue polls status every 3 s while processing and shows
two-phase progress: indeterminate animation during extraction,
determinate "Embedding N/M pages" bar once vectors start landing
Other:
- Chat feedback endpoint + thumbs up/down UI (FeedbackButton.vue)
- EPUB ingest script (ingest_epub.py) with heading-based chunking
- migration 002: chat_feedback table
- README.md with setup and feature overview
170 lines
6.3 KiB
Python
170 lines
6.3 KiB
Python
# tests/test_startup.py
|
|
"""Tests for startup vec DB schema validation (_check_vec_schema)."""
|
|
from __future__ import annotations
|
|
|
|
import os
|
|
import sqlite3
|
|
import threading
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
from app.main import _check_vec_schema, _reembed_docs
|
|
|
|
|
|
def _make_vec_db(path: str, dims: int) -> None:
    """Create a WAL-mode SQLite file that stores a vec0 DDL string as data.

    sqlite_master is read-only and sqlite-vec is not loaded in tests, so the
    ``CREATE VIRTUAL TABLE page_vecs_vecs ... float[dims]`` statement is
    written as a single row of a plain ``_schema_hint`` table instead.

    The resulting file contains NO ``page_vecs_vecs`` table, so the dimension
    does not appear in sqlite_master. Use ``_make_real_vec_db`` when the DDL
    must be visible there.

    Args:
        path: Filesystem path of the SQLite database to create.
        dims: Embedding dimension to embed in the stored DDL string.

    Raises:
        sqlite3.OperationalError: If ``_schema_hint`` already exists in *path*.
    """
    hint_ddl = f"CREATE VIRTUAL TABLE page_vecs_vecs USING vec0(embedding float[{dims}])"

    conn = sqlite3.connect(path)
    try:
        # Must run before any transaction is open for the mode switch to apply.
        conn.execute("PRAGMA journal_mode=WAL")
        # Guarantee the postcondition that no real page_vecs_vecs table exists.
        conn.execute("DROP TABLE IF EXISTS page_vecs_vecs")
        conn.execute("CREATE TABLE _schema_hint (sql TEXT)")
        conn.execute("INSERT INTO _schema_hint VALUES (?)", [hint_ddl])
        conn.commit()
    finally:
        # Close even when a statement fails so the temp file is not left locked.
        conn.close()
|
|
|
|
|
|
def _make_real_vec_db(path: str, dims: int) -> None:
    """Create a vec DB whose sqlite_master contains the ``float[dims]`` DDL.

    sqlite-vec cannot be loaded in tests, so ``page_vecs_vecs`` is created as
    an ordinary table. SQLite stores the CREATE statement text verbatim in
    sqlite_master, which is where the dimension marker ends up.

    Args:
        path: Filesystem path of the SQLite database to create.
        dims: Embedding dimension to declare on the ``embedding`` column.

    Raises:
        sqlite3.OperationalError: If ``page_vecs_vecs`` already exists.
    """
    conn = sqlite3.connect(path)
    try:
        conn.execute(f"CREATE TABLE page_vecs_vecs (embedding float[{dims}])")
        conn.commit()
    finally:
        # Always release the handle, even if the CREATE fails.
        conn.close()
|
|
|
|
|
|
class TestCheckVecSchema:
    """Behavioural tests for ``_check_vec_schema`` run against temp SQLite files.

    Every call passes ``(vec_db_path, 1024, main_db_path)``; the middle
    argument is presumably the expected embedding dimension (confirm against
    ``app.main``).
    """

    # Schema of the ``documents`` table created in the main DB by these tests.
    _DOCUMENTS_DDL = (
        "CREATE TABLE documents ("
        "id TEXT PRIMARY KEY, title TEXT, file_path TEXT, "
        "status TEXT, task_id TEXT, page_count INTEGER, "
        "error_msg TEXT, created_at TEXT, updated_at TEXT)"
    )

    # A single document row in the 'ready' state.
    _READY_DOC = (
        "abc123", "Book", "/tmp/book.pdf", "ready", None, 10, None,
        "2026-01-01", "2026-01-01",
    )

    @staticmethod
    def _write_vec_db(tmp_path, dims: int) -> str:
        """Create ``vecs.db`` with a ``page_vecs_vecs`` table of *dims*; return its path."""
        vec_path = str(tmp_path / "vecs.db")
        conn = sqlite3.connect(vec_path)
        try:
            conn.execute(f"CREATE TABLE page_vecs_vecs (embedding float[{dims}])")
            conn.commit()
        finally:
            conn.close()
        return vec_path

    @classmethod
    def _write_main_db(cls, tmp_path, rows=()) -> str:
        """Create ``main.db`` with a ``documents`` table holding *rows*; return its path."""
        db_path = str(tmp_path / "main.db")
        conn = sqlite3.connect(db_path)
        try:
            conn.execute(cls._DOCUMENTS_DDL)
            conn.executemany(
                "INSERT INTO documents VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)", rows
            )
            conn.commit()
        finally:
            conn.close()
        return db_path

    def test_no_file_is_noop(self, tmp_path):
        """Missing vec DB should not raise."""
        _check_vec_schema(str(tmp_path / "missing.db"), 1024, str(tmp_path / "main.db"))

    def test_matching_dims_keeps_file(self, tmp_path):
        """Correct dimensions: vec DB must not be deleted."""
        vec_path = self._write_vec_db(tmp_path, 1024)

        _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))

        assert os.path.exists(vec_path), "Vec DB should not be deleted when dims match"

    def test_mismatched_dims_deletes_file(self, tmp_path):
        """Dimension mismatch: vec DB must be deleted."""
        vec_path = self._write_vec_db(tmp_path, 768)
        # The main DB file is intentionally never created in this test.
        db_path = str(tmp_path / "main.db")

        _check_vec_schema(vec_path, 1024, db_path)

        assert not os.path.exists(vec_path), "Vec DB should be deleted on dimension mismatch"

    def test_mismatched_dims_queues_reembed(self, tmp_path):
        """Dimension mismatch: re-embed thread must be started for ready docs."""
        vec_path = self._write_vec_db(tmp_path, 768)
        db_path = self._write_main_db(tmp_path, rows=[self._READY_DOC])

        started = []
        real_thread_start = threading.Thread.start

        def _capture_start(self):
            started.append(self)
            # Don't actually run the re-embed to keep tests fast
            self.run = lambda: None
            real_thread_start(self)

        with patch.object(threading.Thread, "start", _capture_start):
            _check_vec_schema(vec_path, 1024, db_path)

        # The captured threads run a no-op; join so none outlives the test.
        for thread in started:
            thread.join(timeout=5)

        assert len(started) == 1, "Exactly one re-embed thread should be started"
        assert started[0].name == "pagepiper-reembed"

    def test_no_ready_docs_skips_thread(self, tmp_path):
        """Mismatch with no ready docs: no thread should be started."""
        vec_path = self._write_vec_db(tmp_path, 768)
        db_path = self._write_main_db(tmp_path)

        started = []
        # Record start() calls without ever launching a real thread.
        with patch.object(threading.Thread, "start", lambda self: started.append(self)):
            _check_vec_schema(vec_path, 1024, db_path)

        assert not started

    def test_empty_db_no_table_is_noop(self, tmp_path):
        """Vec DB exists but has no page_vecs_vecs table yet: no deletion."""
        vec_path = str(tmp_path / "vecs.db")
        sqlite3.connect(vec_path).close()  # create empty file

        _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))

        assert os.path.exists(vec_path)

    def test_corrupt_db_does_not_raise(self, tmp_path):
        """Corrupt or unreadable vec DB must not propagate exceptions."""
        vec_path = str(tmp_path / "vecs.db")
        with open(vec_path, "w", encoding="utf-8") as f:
            f.write("not a sqlite database")

        _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))
        # No assertion needed — just must not raise
|