# turnstone/app/context/embedder.py

"""Context chunk embedding — BSL licensed.

Thin wrapper around app.services.embeddings that handles the DB I/O for
context_chunks.  All backend configuration (model, device, backend type) is
delegated to the service layer via TURNSTONE_EMBED_* env vars.

Re-exports EMBEDDING_AVAILABLE so callers that imported it from here continue
to work without changes.
"""
from __future__ import annotations

import logging
import sqlite3
from pathlib import Path

from app.services.embeddings import (
    EMBEDDING_AVAILABLE,  # re-export for backward compat
    get_embedder,
    pack_vector,
)

__all__ = ["EMBEDDING_AVAILABLE", "embed_chunks"]

logger = logging.getLogger(__name__)


def embed_chunks(
    db_path: Path,
    document_id: str,
    # Legacy params kept for backward compat — this function never reads them.
    llm_url: str = "",
    model: str = "",
    timeout: float = 60.0,
) -> int:
    """Embed all un-embedded chunks for *document_id*.

    Selects every ``context_chunks`` row of the document whose ``embedding``
    is NULL, embeds the texts in one batch with the embedder returned by
    ``get_embedder()``, and stores each packed vector back on its row.  All
    updates are committed together: either every chunk of the batch is
    persisted or none is.

    Args:
        db_path: Path of the SQLite database holding ``context_chunks``.
        document_id: Document whose chunks should be embedded.
        llm_url: Legacy parameter, accepted for backward compatibility and
            not used here — configure the backend via env vars instead.
        model: Legacy parameter, accepted and not used (see ``llm_url``).
        timeout: Legacy parameter, accepted and not used.  The SQLite busy
            timeout is fixed at 30 seconds.

    Returns:
        The number of chunks whose embeddings were committed.  ``0`` when no
        embedder is available, when the document has no pending chunks, or
        when embedding/storing the batch failed (the failure is logged as a
        warning and nothing is persisted).

    Raises:
        sqlite3.Error: If opening the database or selecting the pending
            chunks fails.  Failures while embedding or writing are logged
            and swallowed so that embedding stays best-effort.
    """
    embedder = get_embedder()
    if embedder is None:
        return 0

    conn = sqlite3.connect(str(db_path), timeout=30.0)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.row_factory = sqlite3.Row

        rows = conn.execute(
            "SELECT id, text FROM context_chunks WHERE document_id = ? AND embedding IS NULL",
            (document_id,),
        ).fetchall()
        if not rows:
            return 0

        texts = [row["text"] for row in rows]
        chunk_ids = [row["id"] for row in rows]

        try:
            vectors = embedder.embed_batch(texts)
            # Pack everything before touching the table so a bad vector
            # fails the batch without opening a write transaction.
            updates = [
                (pack_vector(vec), chunk_id)
                for chunk_id, vec in zip(chunk_ids, vectors)
            ]
            conn.executemany(
                "UPDATE context_chunks SET embedding = ? WHERE id = ?",
                updates,
            )
            conn.commit()
        except Exception as exc:
            # Best-effort: report and give up on this batch.  Closing the
            # connection without a commit discards any partial updates, so
            # the honest count of persisted embeddings is zero.
            logger.warning("Batch embedding failed for document %s: %s", document_id, exc)
            return 0

        count = len(updates)
    finally:
        # Always release the connection, including when the PRAGMA or the
        # SELECT raised.
        conn.close()

    logger.debug("Embedded %d chunk(s) for document %s", count, document_id)
    return count