pagepiper/tests/test_search_api.py
pyr0ball 8eef52a054 feat: per-user database isolation for cloud instances (closes #4)
Implements Option A from the issue design: each cloud user gets their own
data directory (DATA_DIR/users/{user_id}/) with separate pagepiper.db,
pagepiper_vecs.db, uploads/, and books/. Local mode is unchanged.

Key changes:
- app/startup.py: extract apply_migrations, reembed_docs,
  check_and_rebuild_vec_schema out of main.py (no circular imports)
- app/config.py: add LOCAL_USER_ID constant and user_data_dir() helper
- app/cloud_session.py: extract resolve_authenticated_user(); require_paid_tier
  now returns user_id (str) instead of None
- app/deps.py: add UserCtx dataclass (db_path, vec_db_path, data_dir,
  watch_dir, bm25) + get_user_ctx dependency; per-user startup guard runs
  migrations + vec schema check once per process per user
- app/main.py: _bm25 singleton -> _bm25_map dict keyed by user_id;
  add _get_bm25_for(); lifespan only runs startup checks in local mode
- app/api/library.py, search.py, chat.py: thread UserCtx through all
  endpoints; remove module-level _mark_bm25_dirty injection pattern
- tests/conftest.py: override get_user_ctx in addition to get_db so all
  endpoints get a consistent test UserCtx
2026-05-13 16:31:51 -07:00

65 lines
2.6 KiB
Python

# tests/test_search_api.py
"""Tests for POST /api/search — BM25 keyword search (MIT, no tier gate)."""
from __future__ import annotations
import sqlite3
def _add_chunks(db_path: str, doc_id: str, chunks: list[dict]) -> None:
conn = sqlite3.connect(db_path)
conn.execute(
"INSERT OR IGNORE INTO documents(id, title, file_path, status) VALUES (?,'Book','p.pdf','ready')",
[doc_id],
)
for c in chunks:
conn.execute(
"INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES (?,?,?,?,?)",
[doc_id, c["page_number"], c["text"], "text_layer", len(c["text"].split())],
)
conn.commit()
conn.close()
def test_search_returns_results(client, test_db):
    """A keyword query returns the matching page first with a positive score."""
    # BM25Okapi IDF is 0 when df == N/2 (e.g. 2 docs, 1 match -> log(1.0) = 0),
    # so the corpus carries a third, unrelated chunk to keep the relevant
    # terms scoring above zero.
    page_texts = (
        "Fireball deals 8d6 fire damage on a failed saving throw.",
        "Cure Wounds restores hit points to a living creature.",
        "Shield grants plus five to armor class until next turn.",
    )
    seeded = [
        {"page_number": number, "text": body}
        for number, body in enumerate(page_texts, start=1)
    ]
    _add_chunks(test_db, "book-a", seeded)

    response = client.post("/api/search", json={"query": "fireball fire damage"})
    assert response.status_code == 200

    hits = response.json()
    assert len(hits) >= 1
    best = hits[0]
    assert best["page_number"] == 1
    assert best["bm25_score"] > 0
    assert "text_snippet" in best
def test_search_empty_index_returns_empty(client):
    """With no chunks seeded by this test, a search succeeds with an empty list."""
    response = client.post("/api/search", json={"query": "anything"})
    assert response.status_code == 200
    payload = response.json()
    assert payload == []
def test_search_filters_by_doc_ids(client, test_db):
    """Passing ``doc_ids`` restricts hits to the listed documents only."""
    # Three chunks in total across two books; the original note explains this
    # keeps BM25Okapi IDF non-zero for terms appearing in one doc.
    corpus = {
        "book-a": [
            {"page_number": 1, "text": "Grapple rules for melee attacks."},
            {"page_number": 2, "text": "Shield spell protects from incoming blows."},
        ],
        "book-b": [
            {"page_number": 3, "text": "Grapple also applies to ranged attacks."},
        ],
    }
    for book_id, pages in corpus.items():
        _add_chunks(test_db, book_id, pages)

    response = client.post(
        "/api/search",
        json={"query": "grapple", "doc_ids": ["book-a"]},
    )
    assert response.status_code == 200

    hits = response.json()
    assert len(hits) >= 1, "expected at least one grapple result from book-a"
    # No hit may come from a document outside the requested filter.
    assert not any(hit["doc_id"] != "book-a" for hit in hits)
def test_search_has_no_tier_gate(client):
    """The search endpoint answers 200 rather than a 402 payment-required."""
    # Intended precondition: PAGEPIPER_OLLAMA_URL is not set.
    # NOTE(review): this test does not clear the variable itself; presumably
    # the fixtures/environment guarantee it -- confirm.
    response = client.post("/api/search", json={"query": "anything"})
    status = response.status_code
    assert status == 200  # Not 402