# app/services/synthesizer.py """ LLM answer synthesis over retrieved chunks. BSL 1.1 — requires LLMRouter (Ollama BYOK or cloud tier). """ from __future__ import annotations from dataclasses import dataclass from app.services.retriever import RetrievedChunk _SYSTEM_PROMPT = ( "You are a helpful document assistant. " "Answer the user's question using ONLY the provided document excerpts. " "For each claim, cite the source page as [p.N]. " "If the excerpts are insufficient, say so. Do not invent information." ) @dataclass(frozen=True) class Citation: doc_id: str page_number: int snippet: str bm25_score: float @dataclass(frozen=True) class SynthesisResult: answer: str citations: tuple[Citation, ...] class Synthesizer: def __init__(self, llm) -> None: # LLMRouter self._llm = llm def synthesize( self, message: str, history: list[dict], chunks: list[RetrievedChunk], ) -> SynthesisResult: # 1500 chars (~300 words) per chunk: enough to capture definitions that # appear mid-paragraph without blowing past a 32k-context model's limit. context_parts = [f"[p.{c.page_number}]\n{c.text[:1500]}" for c in chunks] context = "\n\n---\n\n".join(context_parts) prompt = f"Document excerpts:\n\n{context}\n\nQuestion: {message}" answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT) citations = tuple( Citation( doc_id=c.doc_id, page_number=c.page_number, snippet=c.text[:400], bm25_score=c.bm25_score, ) for c in chunks ) return SynthesisResult(answer=answer, citations=citations)