# tests/test_ingest.py """Unit tests for scripts/ingest_pdf.py.""" from __future__ import annotations import sqlite3 from pathlib import Path from unittest.mock import MagicMock, patch import pytest from scripts.ingest_pdf import run @pytest.fixture def ingest_db(tmp_path) -> tuple[str, str]: db_path = str(tmp_path / "test.db") schema = Path("migrations/001_initial_schema.sql").read_text() conn = sqlite3.connect(db_path) conn.executescript(schema) conn.execute( "INSERT INTO documents(id, title, file_path, status) VALUES ('d1','Test','test.pdf','pending')" ) conn.commit() conn.close() vec_db_path = str(tmp_path / "vecs.db") return db_path, vec_db_path def _make_mock_chunk(page_number: int = 1, text: str = "Some page text about rules.") -> MagicMock: chunk = MagicMock() chunk.page_number = page_number chunk.text = text chunk.source = "text_layer" chunk.word_count = len(text.split()) return chunk def test_ingest_sets_status_ready_on_success(ingest_db): db_path, vec_db_path = ingest_db mock_extractor = MagicMock() mock_extractor.chunk_pages.return_value = [_make_mock_chunk()] with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=mock_extractor): run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path) conn = sqlite3.connect(db_path) row = conn.execute("SELECT status, page_count FROM documents WHERE id='d1'").fetchone() conn.close() assert row[0] == "ready" assert row[1] == 1 def test_ingest_stores_page_chunks(ingest_db): db_path, vec_db_path = ingest_db mock_extractor = MagicMock() chunks = [_make_mock_chunk(page_number=i + 1, text=f"Page {i+1} text content.") for i in range(3)] mock_extractor.chunk_pages.return_value = chunks with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=mock_extractor): run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path) conn = sqlite3.connect(db_path) rows = conn.execute( "SELECT page_number, text FROM page_chunks WHERE doc_id='d1' ORDER BY page_number" ).fetchall() conn.close() assert len(rows) == 3 assert rows[0][0] == 1 assert "Page 1" in rows[0][1] def test_ingest_sets_error_status_on_failure(ingest_db): db_path, vec_db_path = ingest_db with patch("circuitforge_core.documents.pdf.PDFExtractor", side_effect=RuntimeError("PDF corrupt")): from scripts.ingest_pdf import run with pytest.raises(RuntimeError): run(doc_id="d1", file_path="bad.pdf", db_path=db_path, vec_db_path=vec_db_path) conn = sqlite3.connect(db_path) row = conn.execute("SELECT status, error_msg FROM documents WHERE id='d1'").fetchone() conn.close() assert row[0] == "error" assert "PDF corrupt" in row[1] def test_ingest_skips_embeddings_without_ollama_url(ingest_db, monkeypatch): """When PAGEPIPER_OLLAMA_URL is unset, no vec DB file should be created.""" db_path, vec_db_path = ingest_db monkeypatch.delenv("PAGEPIPER_OLLAMA_URL", raising=False) mock_extractor = MagicMock() mock_extractor.chunk_pages.return_value = [_make_mock_chunk()] with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=mock_extractor): run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path) # No embeddings were requested, so the vec DB should not have been created assert not Path(vec_db_path).exists(), "vec DB should not be created without OLLAMA_URL" # Document should still be ready with chunks stored conn = sqlite3.connect(db_path) status = conn.execute("SELECT status FROM documents WHERE id='d1'").fetchone()[0] chunk_count = conn.execute( "SELECT COUNT(*) FROM page_chunks WHERE doc_id='d1'" ).fetchone()[0] conn.close() assert status == "ready" assert chunk_count == 1 def test_ingest_replaces_existing_chunks_on_reingest(ingest_db): """Re-running ingest for the same doc_id replaces old page_chunks.""" db_path, vec_db_path = ingest_db mock_extractor = MagicMock() # First ingest: 3 pages mock_extractor.chunk_pages.return_value = [ _make_mock_chunk(page_number=i + 1, text=f"Original page {i+1}.") for i in range(3) ] with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=mock_extractor): run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path) # Second ingest: 1 page (simulating a re-ingest after file change) mock_extractor.chunk_pages.return_value = [_make_mock_chunk(text="Updated single page.")] with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=mock_extractor): run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path) conn = sqlite3.connect(db_path) rows = conn.execute( "SELECT text FROM page_chunks WHERE doc_id='d1'" ).fetchall() conn.close() assert len(rows) == 1 assert "Updated" in rows[0][0]