pagepiper/tests/test_ingest.py

140 lines
5.1 KiB
Python

# tests/test_ingest.py
"""Unit tests for scripts/ingest_pdf.py."""
from __future__ import annotations
import sqlite3
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
@pytest.fixture
def ingest_db(tmp_path) -> tuple[str, str]:
    """Create a seeded SQLite database for the ingest tests.

    The schema is loaded from ``migrations/001_initial_schema.sql`` (resolved
    relative to the current working directory) and a single ``documents`` row
    with id ``'d1'`` and status ``'pending'`` is inserted.

    Returns:
        A ``(db_path, vec_db_path)`` pair of string paths inside ``tmp_path``.
        Only the first file is created here; ``vec_db_path`` is merely a
        location handed to the code under test.
    """
    db_path = str(tmp_path / "test.db")
    vec_db_path = str(tmp_path / "vecs.db")
    # Decode explicitly so the schema is read the same way on every platform.
    schema = Path("migrations/001_initial_schema.sql").read_text(encoding="utf-8")

    conn = sqlite3.connect(db_path)
    try:
        conn.executescript(schema)
        conn.execute(
            "INSERT INTO documents(id, title, file_path, status) VALUES ('d1','Test','test.pdf','pending')"
        )
        conn.commit()
    finally:
        # Always release the handle, even when the schema or seed insert fails.
        conn.close()
    return db_path, vec_db_path
def _make_mock_chunk(page_number: int = 1, text: str = "Some page text about rules.") -> MagicMock:
    """Build a ``MagicMock`` standing in for one extracted page chunk.

    The mock carries ``page_number`` and ``text`` as given, a fixed ``source``
    of ``"text_layer"``, and a ``word_count`` equal to the number of
    whitespace-separated tokens in *text*.
    """
    return MagicMock(
        page_number=page_number,
        text=text,
        source="text_layer",
        word_count=len(text.split()),
    )
def test_ingest_sets_status_ready_on_success(ingest_db):
    """A successful single-page ingest leaves the document 'ready' with page_count 1."""
    main_db, vector_db = ingest_db
    # Configure the stub extractor to yield exactly one chunk.
    extractor = MagicMock(**{"chunk_pages.return_value": [_make_mock_chunk()]})

    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        from scripts.ingest_pdf import run

        run(doc_id="d1", file_path="test.pdf", db_path=main_db, vec_db_path=vector_db)

    db = sqlite3.connect(main_db)
    document = db.execute("SELECT status, page_count FROM documents WHERE id='d1'").fetchone()
    db.close()

    status, page_count = document
    assert status == "ready"
    assert page_count == 1
def test_ingest_stores_page_chunks(ingest_db):
    """Three extracted chunks end up as three ``page_chunks`` rows for the document."""
    main_db, vector_db = ingest_db
    pages = [
        _make_mock_chunk(page_number=page_no, text=f"Page {page_no} text content.")
        for page_no in range(1, 4)
    ]
    extractor = MagicMock()
    extractor.chunk_pages.return_value = pages

    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        from scripts.ingest_pdf import run

        run(doc_id="d1", file_path="test.pdf", db_path=main_db, vec_db_path=vector_db)

    db = sqlite3.connect(main_db)
    query = "SELECT page_number, text FROM page_chunks WHERE doc_id='d1' ORDER BY page_number"
    stored = db.execute(query).fetchall()
    db.close()

    assert len(stored) == 3
    # Rows are ordered by page number, so the first row is page 1.
    first_page_number, first_text = stored[0]
    assert first_page_number == 1
    assert "Page 1" in first_text
def test_ingest_sets_error_status_on_failure(ingest_db):
    """When constructing the extractor raises, ``run`` re-raises and records the error."""
    main_db, vector_db = ingest_db
    # Make the patched extractor class blow up as soon as it is called.
    broken_extractor = patch(
        "circuitforge_core.documents.pdf.PDFExtractor",
        side_effect=RuntimeError("PDF corrupt"),
    )

    with broken_extractor:
        from scripts.ingest_pdf import run

        with pytest.raises(RuntimeError):
            run(doc_id="d1", file_path="bad.pdf", db_path=main_db, vec_db_path=vector_db)

    db = sqlite3.connect(main_db)
    document = db.execute("SELECT status, error_msg FROM documents WHERE id='d1'").fetchone()
    db.close()

    status, error_msg = document
    assert status == "error"
    assert "PDF corrupt" in error_msg
def test_ingest_skips_embeddings_without_ollama_url(ingest_db, monkeypatch):
    """With PAGEPIPER_OLLAMA_URL unset, ingest succeeds and creates no vec DB file."""
    main_db, vector_db = ingest_db
    monkeypatch.delenv("PAGEPIPER_OLLAMA_URL", raising=False)

    extractor = MagicMock(**{"chunk_pages.return_value": [_make_mock_chunk()]})
    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        from scripts.ingest_pdf import run

        run(doc_id="d1", file_path="test.pdf", db_path=main_db, vec_db_path=vector_db)

    # The vector store path must remain untouched when embeddings are skipped.
    vector_file = Path(vector_db)
    assert not vector_file.exists(), "vec DB should not be created without OLLAMA_URL"

    # The text side of the ingest must still have completed normally.
    db = sqlite3.connect(main_db)
    (status,) = db.execute("SELECT status FROM documents WHERE id='d1'").fetchone()
    (chunk_count,) = db.execute("SELECT COUNT(*) FROM page_chunks WHERE doc_id='d1'").fetchone()
    db.close()

    assert status == "ready"
    assert chunk_count == 1
def test_ingest_replaces_existing_chunks_on_reingest(ingest_db):
    """Ingesting the same doc_id twice keeps only the chunks from the second run."""
    main_db, vector_db = ingest_db
    extractor = MagicMock()

    def ingest_once() -> None:
        # Run the ingest with the extractor class swapped for the shared stub.
        with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
            from scripts.ingest_pdf import run

            run(doc_id="d1", file_path="test.pdf", db_path=main_db, vec_db_path=vector_db)

    # Initial ingest produces three pages.
    extractor.chunk_pages.return_value = [
        _make_mock_chunk(page_number=page_no, text=f"Original page {page_no}.")
        for page_no in range(1, 4)
    ]
    ingest_once()

    # The follow-up ingest (as after a file change) produces a single page.
    extractor.chunk_pages.return_value = [_make_mock_chunk(text="Updated single page.")]
    ingest_once()

    db = sqlite3.connect(main_db)
    stored = db.execute("SELECT text FROM page_chunks WHERE doc_id='d1'").fetchall()
    db.close()

    assert len(stored) == 1
    (only_text,) = stored[0]
    assert "Updated" in only_text