# pagepiper/tests/test_search_api.py

# tests/test_search_api.py
"""Tests for POST /api/search — BM25 keyword search (MIT, no tier gate)."""
from __future__ import annotations

import sqlite3


def _add_chunks(db_path: str, doc_id: str, chunks: list[dict]) -> None:
    """Seed the SQLite database at *db_path* with a document and its page chunks.

    A ``documents`` row for *doc_id* is inserted with placeholder metadata
    (title ``'Book'``, file path ``'p.pdf'``, status ``'ready'``) using
    ``INSERT OR IGNORE``, so the helper can be called more than once for the
    same document. One ``page_chunks`` row is then inserted per entry in
    *chunks*.

    Args:
        db_path: Path of the SQLite database file to write to.
        doc_id: Identifier stored in ``documents.id`` and ``page_chunks.doc_id``.
        chunks: Dicts providing ``"page_number"`` and ``"text"`` keys. The
            ``source`` column is always ``"text_layer"`` and ``word_count`` is
            the number of whitespace-separated tokens in the text.

    Raises:
        KeyError: If a chunk lacks ``"page_number"`` or ``"text"``.
        sqlite3.Error: If the database rejects a statement.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "INSERT OR IGNORE INTO documents(id, title, file_path, status) VALUES (?,'Book','p.pdf','ready')",
            [doc_id],
        )
        conn.executemany(
            "INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES (?,?,?,?,?)",
            [
                (doc_id, c["page_number"], c["text"], "text_layer", len(c["text"].split()))
                for c in chunks
            ],
        )
        conn.commit()
    finally:
        # Always release the file handle; closing without a commit discards
        # any partially applied inserts, so a failed call leaves no rows behind.
        conn.close()


def test_search_returns_results(client, test_db):
    """A keyword query puts the matching page first with a positive BM25 score."""
    # BM25Okapi gives a term an IDF of log(1.0) = 0 when it occurs in exactly
    # half of the corpus (e.g. 1 match out of 2 chunks). The third, unrelated
    # chunk keeps the query terms out of that case so they score above zero.
    page_texts = [
        "Fireball deals 8d6 fire damage on a failed saving throw.",
        "Cure Wounds restores hit points to a living creature.",
        "Shield grants plus five to armor class until next turn.",
    ]
    seeded = [
        {"page_number": number, "text": text}
        for number, text in enumerate(page_texts, start=1)
    ]
    _add_chunks(test_db, "book-a", seeded)

    response = client.post("/api/search", json={"query": "fireball fire damage"})
    assert response.status_code == 200

    hits = response.json()
    assert len(hits) >= 1

    best = hits[0]
    assert best["page_number"] == 1
    assert best["bm25_score"] > 0
    assert "text_snippet" in best


def test_search_empty_index_returns_empty(client):
    """A search issued without seeding any chunks returns 200 and an empty list."""
    response = client.post("/api/search", json={"query": "anything"})
    assert response.status_code == 200

    hits = response.json()
    assert hits == []


def test_search_filters_by_doc_ids(client, test_db):
    """Passing ``doc_ids`` restricts the results to the listed documents."""
    # Three chunks are seeded in total (two in book-a, one in book-b) so the
    # corpus is not the 2-chunk / 1-match case where BM25Okapi IDF is zero.
    # "Grapple" occurs in both books, so only the doc_ids filter can keep
    # book-b out of the response.
    book_a_chunks = [
        {"page_number": 1, "text": "Grapple rules for melee attacks."},
        {"page_number": 2, "text": "Shield spell protects from incoming blows."},
    ]
    book_b_chunks = [
        {"page_number": 3, "text": "Grapple also applies to ranged attacks."},
    ]
    _add_chunks(test_db, "book-a", book_a_chunks)
    _add_chunks(test_db, "book-b", book_b_chunks)

    request_body = {"query": "grapple", "doc_ids": ["book-a"]}
    response = client.post("/api/search", json=request_body)
    assert response.status_code == 200

    hits = response.json()
    assert len(hits) >= 1, "expected at least one grapple result from book-a"
    for hit in hits:
        assert hit["doc_id"] == "book-a"


def test_search_has_no_tier_gate(client):
    """The search endpoint answers 200 rather than 402 (no tier gate)."""
    # The endpoint must succeed even when PAGEPIPER_OLLAMA_URL is not set.
    # NOTE(review): this test does not unset the variable itself; presumably
    # the environment behind the `client` fixture leaves it unset. Confirm.
    response = client.post("/api/search", json={"query": "anything"})
    status = response.status_code
    assert status == 200  # Not 402