# pagepiper/app/services/synthesizer.py

# app/services/synthesizer.py
"""
LLM answer synthesis over retrieved chunks.

BSL 1.1 — requires LLMRouter (Ollama BYOK or cloud tier).
"""
from __future__ import annotations

from dataclasses import dataclass

from app.services.retriever import RetrievedChunk

_SYSTEM_PROMPT = (
    "You are a helpful document assistant. "
    "Answer the user's question using ONLY the provided document excerpts. "
    "For each claim, cite the source page as [p.N]. "
    "If the excerpts are insufficient, say so. Do not invent information."
)


@dataclass(frozen=True)
class Citation:
    """Immutable reference to one retrieved chunk handed to the LLM.

    The synthesizer in this module builds one instance per chunk by copying
    the chunk's identifying fields and a truncated preview of its text.
    """

    # Identifier of the source document, copied from the chunk's ``doc_id``.
    doc_id: str
    # Page number carried by the chunk; the same value tags the excerpt as
    # ``[p.N]`` in the prompt.
    page_number: int
    # Leading portion of the chunk's text (truncated by the synthesizer).
    snippet: str
    # Copied from the chunk's ``bm25_score``; presumably the BM25 relevance
    # score assigned by the retriever — confirm against RetrievedChunk.
    bm25_score: float


@dataclass(frozen=True)
class SynthesisResult:
    """Immutable outcome of a single synthesis call: answer plus sources."""

    # Value returned by the LLM's ``complete`` call for the assembled prompt.
    answer: str
    # One Citation per chunk that was supplied to the synthesizer, in the
    # same order as the input chunks.
    citations: tuple[Citation, ...]


class Synthesizer:
    """Answer a question by prompting an LLM with retrieved document chunks.

    Each chunk is rendered as a page-tagged excerpt, the injected LLM is asked
    to answer using only those excerpts, and the answer is returned together
    with one citation per supplied chunk.
    """

    # 1500 chars (~300 words) per chunk: enough to capture definitions that
    # appear mid-paragraph without blowing past a 32k-context model's limit.
    DEFAULT_EXCERPT_CHARS = 1500
    # Length of the chunk preview stored on each citation.
    DEFAULT_SNIPPET_CHARS = 400

    def __init__(
        self,
        llm,  # LLMRouter
        *,
        excerpt_chars: int = DEFAULT_EXCERPT_CHARS,
        snippet_chars: int = DEFAULT_SNIPPET_CHARS,
    ) -> None:
        """Store the LLM client and the truncation limits.

        Args:
            llm: Object exposing ``complete(prompt, system=...)``.
            excerpt_chars: Maximum characters of each chunk's text placed in
                the prompt. Lower it for models with a smaller context window.
            snippet_chars: Maximum characters of each chunk's text kept on
                the resulting citation.

        Raises:
            ValueError: If either limit is negative.
        """
        # A negative bound would make the slice trim characters from the END
        # of the text instead of capping its length, so reject it up front.
        if excerpt_chars < 0 or snippet_chars < 0:
            raise ValueError(
                "excerpt_chars and snippet_chars must be non-negative, got "
                f"excerpt_chars={excerpt_chars}, snippet_chars={snippet_chars}"
            )
        self._llm = llm
        self._excerpt_chars = excerpt_chars
        self._snippet_chars = snippet_chars

    def synthesize(
        self,
        message: str,
        history: list[dict],
        chunks: list[RetrievedChunk],
    ) -> SynthesisResult:
        """Generate an answer to ``message`` grounded in ``chunks``.

        Args:
            message: The user's question; appended to the prompt verbatim.
            history: Prior conversation turns. Accepted as part of the
                interface but not currently used when building the prompt.
            chunks: Retrieved chunks to use as context. Every chunk is
                included in the prompt and produces one citation.

        Returns:
            A SynthesisResult holding the LLM's answer and one Citation per
            input chunk, in input order.
        """
        excerpt_chars = self._excerpt_chars
        snippet_chars = self._snippet_chars

        # Tag each excerpt with its page so the model can cite it as [p.N].
        context_parts = [
            f"[p.{c.page_number}]\n{c.text[:excerpt_chars]}" for c in chunks
        ]
        context = "\n\n---\n\n".join(context_parts)
        prompt = f"Document excerpts:\n\n{context}\n\nQuestion: {message}"

        answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)

        # Citations cover every chunk sent to the model, not only the pages
        # the model actually referenced in its answer.
        citations = tuple(
            Citation(
                doc_id=c.doc_id,
                page_number=c.page_number,
                snippet=c.text[:snippet_chars],
                bm25_score=c.bm25_score,
            )
            for c in chunks
        )
        return SynthesisResult(answer=answer, citations=citations)