# pagepiper/scripts/ingest_epub.py

# scripts/ingest_epub.py
"""
cf-orch task: pagepiper/ingest_epub

Extracts text from an EPUB file, stores chapter chunks in SQLite, and (if Ollama is
configured) generates embeddings and stores them in the sqlite-vec store.

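Each EPUB document item is split into chunks: one per heading-delimited section
when the item contains at least two h1-h4 headings, otherwise ~500-word groups
of paragraphs. Each chunk is stored like a PDF page (numbered via page_number).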

Entry point:
    python scripts/ingest_epub.py --doc-id X --file-path Y --db-path Z --vec-db-path W
"""
from __future__ import annotations

import logging
import os
import sqlite3
from dataclasses import dataclass
from pathlib import Path

# Module logger; the name is a fixed string rather than derived from __name__.
logger = logging.getLogger("pagepiper.ingest_epub")

# Maximum number of chunk texts passed to a single router.embed() call in run().
EMBED_BATCH_SIZE = 64
_WORDS_PER_CHUNK = 500  # target chunk size for word-count fallback


@dataclass
class _Chunk:
    """One unit of extracted EPUB text; run() stores each as a page_chunks row."""

    # 1-based position of the chunk within the document (page_number column).
    page_number: int
    # Chunk body: the contributing lines/paragraphs joined with newlines.
    text: str
    # Origin label written to the source column; this module always passes "text".
    source: str
    # Count of whitespace-separated words in the chunk.
    word_count: int


def _paragraphs_from_soup(soup) -> list[str]:
    """Return the cleaned text lines of a parsed HTML document.

    The document's text is flattened with one line per text node (whitespace
    stripped) and handed to ``filter_paragraphs``, which decides which lines
    are kept.
    """
    from scripts.text_clean import filter_paragraphs

    lines = soup.get_text(separator="\n", strip=True).splitlines()
    return filter_paragraphs(lines)


def _chunks_from_paragraphs(paragraphs: list[str], start_num: int) -> list[_Chunk]:
    """Pack consecutive paragraphs into chunks of roughly _WORDS_PER_CHUNK words.

    Args:
        paragraphs: Paragraph strings in reading order.
        start_num: Page number assigned to the first chunk; later chunks count
            up from it.

    Returns:
        Chunks whose text is the newline-joined paragraphs. A chunk is closed
        just before the paragraph that would push it past the word target, so a
        single oversized paragraph still ends up whole in its own chunk.
    """
    result: list[_Chunk] = []
    pending: list[str] = []
    pending_words = 0

    for paragraph in paragraphs:
        n_words = len(paragraph.split())
        would_overflow = pending_words + n_words > _WORDS_PER_CHUNK
        if pending and would_overflow:
            # Chunk numbers are consecutive, so the next one is start_num + emitted count.
            result.append(
                _Chunk(start_num + len(result), "\n".join(pending), "text", pending_words)
            )
            pending = []
            pending_words = 0
        pending.append(paragraph)
        pending_words += n_words

    # Flush whatever is left after the last paragraph.
    if pending:
        result.append(
            _Chunk(start_num + len(result), "\n".join(pending), "text", pending_words)
        )

    return result


def _extract_chunks(file_path: str) -> list[_Chunk]:
    """Read an EPUB and split its document items into sequentially numbered chunks.

    Each document item is handled in one of two ways:

    * At least two h1-h4 headings: one chunk per section. A section starts at a
      heading and collects the cleaned text of the following p/li/blockquote
      elements until the next heading. Text found before the first heading
      forms its own chunk.
    * Otherwise: the item's paragraphs are grouped into ~_WORDS_PER_CHUNK-word
      chunks by ``_chunks_from_paragraphs``.

    Args:
        file_path: Path of the EPUB file to read.

    Returns:
        All chunks in reading order, with ``page_number`` running 1..N across
        the whole book.
    """
    import ebooklib
    from ebooklib import epub
    from bs4 import BeautifulSoup
    from scripts.text_clean import clean_line, is_artifact_line

    heading_tags = ["h1", "h2", "h3", "h4"]
    body_tags = ["p", "li", "blockquote"]

    book = epub.read_epub(file_path, options={"ignore_ncx": True})
    all_chunks: list[_Chunk] = []

    def flush(parts: list[str]) -> None:
        """Append the accumulated section as a chunk unless it has no text."""
        text = "\n".join(parts).strip()
        if text:
            all_chunks.append(
                _Chunk(len(all_chunks) + 1, text, "text", len(text.split()))
            )

    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        soup = BeautifulSoup(item.get_content(), "html.parser")

        if len(soup.find_all(heading_tags)) < 2:
            # Word-count fallback: accumulate paragraphs into ~500-word chunks
            paragraphs = _paragraphs_from_soup(soup)
            if paragraphs:
                all_chunks.extend(
                    _chunks_from_paragraphs(paragraphs, len(all_chunks) + 1)
                )
            continue

        # Heading-based split: one chunk per section
        section: list[str] = []
        for elem in soup.find_all(heading_tags + body_tags):
            if elem.name in heading_tags:
                flush(section)
                section = [elem.get_text(" ", strip=True)]
                continue
            # find_all also yields nested matches (e.g. the <p> inside a
            # <blockquote> or <li>). The outer element's get_text() already
            # contains that text, so taking both would duplicate it.
            if elem.find_parent(body_tags) is not None:
                continue
            line = clean_line(elem.get_text(" ", strip=True))
            if line and not is_artifact_line(line):
                section.append(line)
        flush(section)

    return all_chunks


def _update_status(
    conn: sqlite3.Connection,
    doc_id: str,
    status: str,
    page_count: int | None = None,
    error_msg: str | None = None,
) -> None:
    """Set a document's status (and optional details) and commit.

    Args:
        conn: Open connection to the database holding the ``documents`` table.
        doc_id: Value matched against ``documents.id``.
        status: New value for the ``status`` column.
        page_count: When not None, also written to ``page_count``.
        error_msg: When not None, also written to ``error_msg``.

    ``updated_at`` is always refreshed. Both optional values are applied when
    both are supplied; neither column is touched when its argument is None.
    The call commits the connection, which also commits any other work pending
    on it.
    """
    # Column names are fixed literals; only values go through placeholders.
    assignments = ["status=?"]
    params: list[object] = [status]
    if page_count is not None:
        assignments.append("page_count=?")
        params.append(page_count)
    if error_msg is not None:
        assignments.append("error_msg=?")
        params.append(error_msg)
    assignments.append("updated_at=datetime('now')")
    params.append(doc_id)

    conn.execute(
        f"UPDATE documents SET {', '.join(assignments)} WHERE id=?",
        params,
    )
    conn.commit()


def _embed_chunks(
    doc_id: str,
    chunk_rows: list[tuple[str, int, str]],
    ollama_url: str,
    vec_db_path: str,
) -> int:
    """Embed every chunk's text via Ollama and replace the document's stored vectors.

    Args:
        doc_id: Document whose vectors are replaced.
        chunk_rows: ``(chunk_id, page_number, text)`` tuples of the stored chunks.
        ollama_url: Ollama base URL; ``/v1`` is appended unless already present.
        vec_db_path: Database path handed to ``LocalSQLiteVecStore``.

    Returns:
        The number of vectors written.

    Raises:
        ValueError: If the embedder returns a different number of vectors than
            texts; pairing them anyway would attach vectors to the wrong chunks.
    """
    from circuitforge_core.llm import LLMRouter
    from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore

    trimmed = ollama_url.rstrip("/")
    base_url = trimmed if trimmed.endswith("/v1") else trimmed + "/v1"
    router = LLMRouter({
        "fallback_order": ["ollama"],
        "backends": {
            "ollama": {
                "type": "openai_compat",
                "base_url": base_url,
                "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"),
                "embedding_model": os.environ.get(
                    "PAGEPIPER_EMBED_MODEL", "nomic-embed-text"
                ),
                "supports_images": False,
            }
        },
    })
    # NOTE(review): nomic-embed-text is published as a 768-dimensional model
    # while the default here is 1024 — confirm deployments set matching values.
    embed_dims = int(os.environ.get("PAGEPIPER_EMBED_DIMS", "1024"))
    vec_store = LocalSQLiteVecStore(
        db_path=vec_db_path, table="page_vecs", dimensions=embed_dims
    )
    # Vectors from an earlier ingest of this document are dropped first.
    vec_store.delete_where({"doc_id": doc_id})

    texts = [text for _, _, text in chunk_rows]
    vectors: list[list[float]] = []
    for start in range(0, len(texts), EMBED_BATCH_SIZE):
        vectors.extend(router.embed(texts[start : start + EMBED_BATCH_SIZE]))

    # zip() would silently truncate or shift the pairing on a count mismatch.
    if len(vectors) != len(chunk_rows):
        raise ValueError(
            f"embedder returned {len(vectors)} vectors for {len(chunk_rows)} chunks"
        )

    for (chunk_id, page_number, _), vector in zip(chunk_rows, vectors):
        vec_store.upsert(
            entry_id=chunk_id,
            vector=vector,
            metadata={"doc_id": doc_id, "page_number": page_number},
        )
    return len(vectors)


def run(doc_id: str, file_path: str, db_path: str, vec_db_path: str) -> None:
    """Run the full ingest pipeline for one EPUB. Called by cf-orch or BackgroundTasks.

    Steps: mark the document "processing", extract chunks, replace the
    document's ``page_chunks`` rows, optionally embed the chunks (only when
    ``PAGEPIPER_OLLAMA_URL`` is set and there is at least one chunk), then mark
    the document "ready" with its chunk count.

    Args:
        doc_id: Identifier of the row in ``documents`` being ingested.
        file_path: Path of the EPUB file.
        db_path: Path of the SQLite database with ``documents``/``page_chunks``.
        vec_db_path: Path of the vector store database.

    Raises:
        Exception: Any failure outside the embedding step is logged, recorded
            as status "error" (best effort) and re-raised. Embedding failures
            are logged as warnings and do not fail the ingest.
    """
    conn: sqlite3.Connection | None = None
    try:
        conn = sqlite3.connect(db_path, timeout=30)
        conn.execute("PRAGMA journal_mode = WAL")
        conn.execute("PRAGMA foreign_keys = ON")
        _update_status(conn, doc_id, "processing")

        logger.info("Extracting chapters from %s", file_path)
        chunks = _extract_chunks(file_path)
        logger.info("Extracted %d chapters", len(chunks))

        # The DELETE and the INSERTs run in one implicit transaction that is
        # committed once, after every chunk has been written.
        conn.execute("DELETE FROM page_chunks WHERE doc_id=?", [doc_id])
        chunk_rows: list[tuple[str, int, str]] = []
        for chunk in chunks:
            row = conn.execute(
                """INSERT INTO page_chunks(doc_id, page_number, text, source, word_count)
                   VALUES (?,?,?,?,?) RETURNING id""",
                [doc_id, chunk.page_number, chunk.text, chunk.source, chunk.word_count],
            ).fetchone()
            chunk_rows.append((row[0], chunk.page_number, chunk.text))
        conn.commit()

        # Embedding failure is non-fatal: document remains BM25-searchable.
        ollama_url = os.environ.get("PAGEPIPER_OLLAMA_URL", "").strip()
        if ollama_url and chunks:
            try:
                logger.info("Embedding %d chapters via Ollama at %s", len(chunks), ollama_url)
                stored = _embed_chunks(doc_id, chunk_rows, ollama_url, vec_db_path)
                logger.info("Stored %d embeddings", stored)
            except Exception as embed_exc:
                logger.warning(
                    "Embedding skipped for doc %s — BM25 only (reason: %s)",
                    doc_id, embed_exc,
                )

        _update_status(conn, doc_id, "ready", page_count=len(chunks))
        logger.info("Ingest complete for doc %s (%d chapters)", doc_id, len(chunks))

    except Exception as exc:
        logger.error("Ingest failed for doc %s: %s", doc_id, exc, exc_info=True)
        if conn is not None:
            try:
                # Discard a half-applied chunk replacement first: _update_status
                # commits, which would otherwise persist the DELETE together
                # with only some of the INSERTs.
                conn.rollback()
                _update_status(conn, doc_id, "error", error_msg=str(exc))
            except Exception:
                logger.warning("Could not write error status for doc %s", doc_id)
        raise
    finally:
        if conn is not None:
            conn.close()


if __name__ == "__main__":
    # Command-line entry point: every option is mandatory and maps 1:1 onto a
    # keyword argument of run().
    import argparse

    logging.basicConfig(level=logging.INFO)

    cli = argparse.ArgumentParser(description="Ingest an EPUB (cf-orch task entry point)")
    for flag in ("--doc-id", "--file-path", "--db-path", "--vec-db-path"):
        cli.add_argument(flag, required=True)
    opts = cli.parse_args()

    run(
        doc_id=opts.doc_id,
        file_path=opts.file_path,
        db_path=opts.db_path,
        vec_db_path=opts.vec_db_path,
    )