# tests/test_ingest.py
|
|
"""Unit tests for scripts/ingest_pdf.py."""
|
|
from __future__ import annotations
|
|
|
|
import sqlite3
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
|
|
@pytest.fixture
def ingest_db(tmp_path: Path) -> tuple[str, str]:
    """Create a throwaway SQLite database seeded with one pending document.

    The schema is read from ``migrations/001_initial_schema.sql`` (a relative
    path, so it is resolved against the current working directory) and a
    single ``documents`` row with id ``d1`` and status ``pending`` is inserted.

    Returns:
        ``(db_path, vec_db_path)`` as strings. Only the main database file is
        created here; ``vec_db_path`` just names a location under ``tmp_path``.
    """
    db_path = str(tmp_path / "test.db")
    schema = Path("migrations/001_initial_schema.sql").read_text()

    conn = sqlite3.connect(db_path)
    try:
        conn.executescript(schema)
        conn.execute(
            "INSERT INTO documents(id, title, file_path, status) VALUES ('d1','Test','test.pdf','pending')"
        )
        conn.commit()
    finally:
        # Close even when the schema or the seed insert fails, so a broken
        # migration does not leave an open handle on the temp database.
        conn.close()

    vec_db_path = str(tmp_path / "vecs.db")
    return db_path, vec_db_path
|
|
|
|
|
|
def _make_mock_chunk(page_number: int = 1, text: str = "Some page text about rules.") -> MagicMock:
    """Build a ``MagicMock`` standing in for one extracted page chunk.

    The mock carries ``page_number`` and ``text`` as given, a fixed ``source``
    of ``"text_layer"``, and a ``word_count`` equal to the number of
    whitespace-separated tokens in ``text``.
    """
    # Keyword arguments to the Mock constructor are set as attributes on the
    # new mock, which is equivalent to assigning them one by one afterwards.
    return MagicMock(
        page_number=page_number,
        text=text,
        source="text_layer",
        word_count=len(text.split()),
    )
|
|
|
|
|
|
def test_ingest_sets_status_ready_on_success(ingest_db):
    """With one extracted chunk, ``run`` leaves ``d1`` at status ``ready`` and page_count 1."""
    db_path, vec_db_path = ingest_db
    run_kwargs = {
        "doc_id": "d1",
        "file_path": "test.pdf",
        "db_path": db_path,
        "vec_db_path": vec_db_path,
    }

    extractor = MagicMock()
    extractor.chunk_pages.return_value = [_make_mock_chunk()]

    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        # The import happens while the patch is active.
        from scripts.ingest_pdf import run

        run(**run_kwargs)

    conn = sqlite3.connect(db_path)
    cursor = conn.execute("SELECT status, page_count FROM documents WHERE id='d1'")
    row = cursor.fetchone()
    conn.close()

    assert row[0] == "ready"
    assert row[1] == 1
|
|
|
|
|
|
def test_ingest_stores_page_chunks(ingest_db):
    """Three extracted chunks produce three ``page_chunks`` rows for ``d1``."""
    db_path, vec_db_path = ingest_db

    # Pages are numbered from 1; the text embeds the same number.
    pages = []
    for page_no in range(1, 4):
        pages.append(_make_mock_chunk(page_number=page_no, text=f"Page {page_no} text content."))

    extractor = MagicMock()
    extractor.chunk_pages.return_value = pages

    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        from scripts.ingest_pdf import run

        run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path)

    query = "SELECT page_number, text FROM page_chunks WHERE doc_id='d1' ORDER BY page_number"
    conn = sqlite3.connect(db_path)
    stored = conn.execute(query).fetchall()
    conn.close()

    assert len(stored) == 3
    first_page_number, first_text = stored[0][0], stored[0][1]
    assert first_page_number == 1
    assert "Page 1" in first_text
|
|
|
|
|
|
def test_ingest_sets_error_status_on_failure(ingest_db):
    """When the extractor raises, ``run`` re-raises and records ``error`` plus the message."""
    db_path, vec_db_path = ingest_db

    # Constructing the extractor raises, simulating an unreadable PDF.
    failing_extractor = patch(
        "circuitforge_core.documents.pdf.PDFExtractor",
        side_effect=RuntimeError("PDF corrupt"),
    )
    with failing_extractor:
        from scripts.ingest_pdf import run

        with pytest.raises(RuntimeError):
            run(doc_id="d1", file_path="bad.pdf", db_path=db_path, vec_db_path=vec_db_path)

    conn = sqlite3.connect(db_path)
    row = conn.execute("SELECT status, error_msg FROM documents WHERE id='d1'").fetchone()
    conn.close()

    status, error_msg = row[0], row[1]
    assert status == "error"
    assert "PDF corrupt" in error_msg
|
|
|
|
|
|
def test_ingest_skips_embeddings_without_ollama_url(ingest_db, monkeypatch):
    """With PAGEPIPER_OLLAMA_URL removed from the environment, no vec DB file appears.

    The document must still end up ``ready`` with its single chunk stored.
    """
    db_path, vec_db_path = ingest_db
    # raising=False: do not fail when the variable was never set.
    monkeypatch.delenv("PAGEPIPER_OLLAMA_URL", raising=False)

    extractor = MagicMock()
    extractor.chunk_pages.return_value = [_make_mock_chunk()]

    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        from scripts.ingest_pdf import run

        run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path)

    # The vec DB file must not exist after the run.
    vec_db_file = Path(vec_db_path)
    assert not vec_db_file.exists(), "vec DB should not be created without OLLAMA_URL"

    # The main database must still show a completed ingest.
    status_sql = "SELECT status FROM documents WHERE id='d1'"
    count_sql = "SELECT COUNT(*) FROM page_chunks WHERE doc_id='d1'"
    conn = sqlite3.connect(db_path)
    status = conn.execute(status_sql).fetchone()[0]
    chunk_count = conn.execute(count_sql).fetchone()[0]
    conn.close()

    assert status == "ready"
    assert chunk_count == 1
|
|
|
|
|
|
def test_ingest_replaces_existing_chunks_on_reingest(ingest_db):
    """A second ``run`` for the same doc_id leaves only the second run's page_chunks."""
    db_path, vec_db_path = ingest_db
    run_kwargs = {
        "doc_id": "d1",
        "file_path": "test.pdf",
        "db_path": db_path,
        "vec_db_path": vec_db_path,
    }

    original_pages = [
        _make_mock_chunk(page_number=page_no, text=f"Original page {page_no}.")
        for page_no in range(1, 4)
    ]
    updated_pages = [_make_mock_chunk(text="Updated single page.")]

    extractor = MagicMock()

    # First pass: three pages.
    extractor.chunk_pages.return_value = original_pages
    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        from scripts.ingest_pdf import run

        run(**run_kwargs)

    # Second pass: one page, simulating a re-ingest after the file changed.
    extractor.chunk_pages.return_value = updated_pages
    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        run(**run_kwargs)

    conn = sqlite3.connect(db_path)
    cursor = conn.execute("SELECT text FROM page_chunks WHERE doc_id='d1'")
    remaining = cursor.fetchall()
    conn.close()

    assert len(remaining) == 1
    assert "Updated" in remaining[0][0]
|