pagepiper/tests/test_startup.py

# tests/test_startup.py
"""Tests for startup vec DB schema validation (_check_vec_schema)."""
from __future__ import annotations

import os
import sqlite3
import threading
from unittest.mock import MagicMock, patch

import pytest

from app.main import _check_vec_schema, _reembed_docs


def _make_vec_db(path: str, dims: int) -> None:
    """Create a minimal sqlite-vec-style DB with the given dimension."""
    conn = sqlite3.connect(path)
    conn.execute("PRAGMA journal_mode=WAL")
    # Replicate the virtual table name used by LocalSQLiteVecStore
    conn.execute(f"CREATE TABLE page_vecs_vecs (embedding float[{dims}])")
    conn.execute(
        "INSERT INTO sqlite_master(type, name, tbl_name, sql) VALUES (?,?,?,?)"
        if False else ""
    )
    # Write a real sqlite_master entry via a virtual table workaround:
    # Easiest is to put the dimension marker directly in a metadata table.
    # But _check_vec_schema reads sqlite_master, so we need the real DDL there.
    conn.close()
    # sqlite_master is read-only — recreate using the real CREATE VIRTUAL TABLE path
    # by faking it via a regular table with the matching name pattern.
    conn2 = sqlite3.connect(path)
    conn2.execute("DROP TABLE IF EXISTS page_vecs_vecs")
    # Write a row that _check_vec_schema will parse via its regex
    conn2.execute(
        "CREATE TABLE _schema_hint (sql TEXT)"
    )
    conn2.execute(
        "INSERT INTO _schema_hint VALUES (?)",
        [f"CREATE VIRTUAL TABLE page_vecs_vecs USING vec0(embedding float[{dims}])"],
    )
    conn2.commit()
    conn2.close()


def _make_real_vec_db(path: str, dims: int) -> None:
    """Create a vec DB whose sqlite_master actually contains the dimension DDL."""
    import sqlite3 as _sq
    # We can't load sqlite-vec in tests, so simulate by writing sqlite_master directly
    # via a shadow table that _check_vec_schema reads.
    conn = _sq.connect(path)
    conn.execute(
        f"""CREATE TABLE page_vecs_vecs (
            embedding float[{dims}]
        )"""
    )
    conn.commit()
    conn.close()


class TestCheckVecSchema:
    def test_no_file_is_noop(self, tmp_path):
        """Missing vec DB should not raise."""
        _check_vec_schema(str(tmp_path / "missing.db"), 1024, str(tmp_path / "main.db"))

    def test_matching_dims_keeps_file(self, tmp_path):
        """Correct dimensions: vec DB must not be deleted."""
        vec_path = str(tmp_path / "vecs.db")
        conn = sqlite3.connect(vec_path)
        conn.execute("CREATE TABLE page_vecs_vecs (embedding float[1024])")
        conn.commit()
        conn.close()

        _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))

        assert os.path.exists(vec_path), "Vec DB should not be deleted when dims match"

    def test_mismatched_dims_deletes_file(self, tmp_path):
        """Dimension mismatch: vec DB must be deleted."""
        vec_path = str(tmp_path / "vecs.db")
        conn = sqlite3.connect(vec_path)
        conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])")
        conn.commit()
        conn.close()

        db_path = str(tmp_path / "main.db")
        _check_vec_schema(vec_path, 1024, db_path)

        assert not os.path.exists(vec_path), "Vec DB should be deleted on dimension mismatch"

    def test_mismatched_dims_queues_reembed(self, tmp_path):
        """Dimension mismatch: re-embed thread must be started for ready docs."""
        vec_path = str(tmp_path / "vecs.db")
        conn = sqlite3.connect(vec_path)
        conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])")
        conn.commit()
        conn.close()

        db_path = str(tmp_path / "main.db")
        schema = (
            "CREATE TABLE documents ("
            "id TEXT PRIMARY KEY, title TEXT, file_path TEXT, "
            "status TEXT, task_id TEXT, page_count INTEGER, "
            "error_msg TEXT, created_at TEXT, updated_at TEXT)"
        )
        main_conn = sqlite3.connect(db_path)
        main_conn.execute(schema)
        main_conn.execute(
            "INSERT INTO documents VALUES ('abc123', 'Book', '/tmp/book.pdf', 'ready', NULL, 10, NULL, '2026-01-01', '2026-01-01')"
        )
        main_conn.commit()
        main_conn.close()

        started = []
        real_thread_start = threading.Thread.start

        def _capture_start(self):
            started.append(self)
            # Don't actually run the re-embed to keep tests fast
            self.run = lambda: None
            real_thread_start(self)

        with patch.object(threading.Thread, "start", _capture_start):
            _check_vec_schema(vec_path, 1024, db_path)

        assert len(started) == 1, "Exactly one re-embed thread should be started"
        assert started[0].name == "pagepiper-reembed"

    def test_no_ready_docs_skips_thread(self, tmp_path):
        """Mismatch with no ready docs: no thread should be started."""
        vec_path = str(tmp_path / "vecs.db")
        conn = sqlite3.connect(vec_path)
        conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])")
        conn.commit()
        conn.close()

        db_path = str(tmp_path / "main.db")
        schema = (
            "CREATE TABLE documents ("
            "id TEXT PRIMARY KEY, title TEXT, file_path TEXT, "
            "status TEXT, task_id TEXT, page_count INTEGER, "
            "error_msg TEXT, created_at TEXT, updated_at TEXT)"
        )
        main_conn = sqlite3.connect(db_path)
        main_conn.execute(schema)
        main_conn.commit()
        main_conn.close()

        started = []
        with patch.object(threading.Thread, "start", lambda self: started.append(self)):
            _check_vec_schema(vec_path, 1024, db_path)

        assert len(started) == 0

    def test_empty_db_no_table_is_noop(self, tmp_path):
        """Vec DB exists but has no page_vecs_vecs table yet: no deletion."""
        vec_path = str(tmp_path / "vecs.db")
        sqlite3.connect(vec_path).close()  # create empty file

        _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))

        assert os.path.exists(vec_path)

    def test_corrupt_db_does_not_raise(self, tmp_path):
        """Corrupt or unreadable vec DB must not propagate exceptions."""
        vec_path = str(tmp_path / "vecs.db")
        with open(vec_path, "w") as f:
            f.write("not a sqlite database")

        _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))
        # No assertion needed — just must not raise