# app/services/synthesizer.py
"""
LLM answer synthesis over retrieved chunks.

BSL 1.1 — requires LLMRouter (Ollama BYOK or cloud tier).
"""
from __future__ import annotations

from dataclasses import dataclass

from app.services.retriever import RetrievedChunk

_SYSTEM_PROMPT = (
    "You are a document assistant. "
    "Answer questions using ONLY the document excerpts provided. "
    "Cite every claim with the source page as [p.N]. "
    "If the excerpts do not contain the answer, respond with exactly: "
    "'I could not find an answer to that question in the indexed documents.' "
    "Do NOT use knowledge from outside the provided excerpts. "
    "Do NOT speculate, infer, or guess beyond what is explicitly stated."
)

# Canned reply returned without calling the LLM when retrieval found nothing.
_NO_RESULTS_ANSWER = (
    "I could not find any relevant passages in the indexed documents for that question. "
    "Try rephrasing, or check that the relevant document has been ingested."
)

# 1500 chars (~300 words) per chunk: enough to capture definitions that
# appear mid-paragraph without blowing past a 32k-context model's limit.
_MAX_CHUNK_CHARS = 1500

# Number of leading characters of each chunk echoed back in its citation.
_SNIPPET_CHARS = 400


@dataclass(frozen=True)
class Citation:
    """Immutable pointer from an answer back to one retrieved chunk."""

    doc_id: str  # copied from the chunk's ``doc_id``
    page_number: int  # copied from the chunk's ``page_number``
    snippet: str  # leading characters of the chunk's ``text``
    bm25_score: float  # copied from the chunk's ``bm25_score``


@dataclass(frozen=True)
class SynthesisResult:
    """Immutable pairing of the answer text with the citations behind it."""

    answer: str
    citations: tuple[Citation, ...]


class Synthesizer:
    """Build a prompt from retrieved chunks and ask the LLM for a cited answer."""

    def __init__(
        self,
        llm,  # LLMRouter
        *,
        max_chunk_chars: int = _MAX_CHUNK_CHARS,
        snippet_chars: int = _SNIPPET_CHARS,
    ) -> None:
        """Store the LLM client and the truncation limits.

        Args:
            llm: Object exposing ``complete(prompt, system=...)``.
            max_chunk_chars: Maximum characters of each chunk's text that are
                placed in the prompt.
            snippet_chars: Maximum characters of each chunk's text that are
                kept in the returned citation.

        Raises:
            ValueError: If either limit is negative. A negative slice bound
                would silently trim from the end of the text instead of
                capping its length.
        """
        if max_chunk_chars < 0 or snippet_chars < 0:
            raise ValueError(
                "max_chunk_chars and snippet_chars must be non-negative"
            )
        self._llm = llm
        self._max_chunk_chars = max_chunk_chars
        self._snippet_chars = snippet_chars

    def synthesize(
        self,
        message: str,
        history: list[dict],
        chunks: list[RetrievedChunk],
    ) -> SynthesisResult:
        """Answer *message* using only the supplied *chunks*.

        Args:
            message: The user's question, appended verbatim to the prompt.
            history: Accepted for interface compatibility.
                NOTE(review): currently unused — prior turns are not sent to
                the LLM; confirm whether that is intentional.
            chunks: Retrieved chunks; each must expose ``doc_id``,
                ``page_number``, ``text`` and ``bm25_score``.

        Returns:
            A ``SynthesisResult``. When *chunks* is empty the LLM is not
            called and a fixed "no results" answer with no citations is
            returned. Otherwise the answer is whatever ``llm.complete``
            returns (presumably a ``str`` — confirm against LLMRouter), with
            one citation per chunk in input order.
        """
        if not chunks:
            return SynthesisResult(answer=_NO_RESULTS_ANSWER, citations=())

        # Each excerpt is tagged with its page so the model can cite as [p.N].
        chunk_limit = self._max_chunk_chars
        context = "\n\n---\n\n".join(
            f"[p.{c.page_number}]\n{c.text[:chunk_limit]}" for c in chunks
        )
        prompt = f"Document excerpts:\n\n{context}\n\nQuestion: {message}"
        answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)

        # Every chunk sent to the model is cited, whether or not the answer
        # actually references its page.
        snippet_limit = self._snippet_chars
        citations = tuple(
            Citation(
                doc_id=c.doc_id,
                page_number=c.page_number,
                snippet=c.text[:snippet_limit],
                bm25_score=c.bm25_score,
            )
            for c in chunks
        )
        return SynthesisResult(answer=answer, citations=citations)