# app/services/synthesizer.py
"""
LLM answer synthesis over retrieved chunks.

BSL 1.1 — requires LLMRouter (Ollama BYOK or cloud tier).
"""
from __future__ import annotations

from dataclasses import dataclass

from app.services.retriever import RetrievedChunk

# System prompt sent with every completion request: restricts the model to the
# supplied excerpts and tells it to stop rather than guess.
_SYSTEM_PROMPT = (
    "You are a strict document retrieval assistant. "
    "Your sole job is to extract and present information from the document excerpts given to you. "
    "You have no memory of books, stories, or authors. "
    "If the excerpts do not contain the answer, say so and stop. Never guess."
)

# Answer returned when retrieval produced no chunks at all.
_NO_RESULTS_ANSWER = (
    "I could not find any relevant passages in the indexed documents for that question. "
    "Try rephrasing, or check that the relevant document has been ingested."
)

# Lower-case marker phrases that show up when the model leaves the provided
# context and answers from its training data. `_strip_escape` discards any
# response containing one of them and substitutes its own fixed not-found
# message (a different text from `_NO_RESULTS_ANSWER`).
_ESCAPE_PHRASES = [
    "in the series",
    "in the novel",
    "in the book",
    "in the context of the series",
    "it can be assumed",
    "based on my knowledge",
    "based on the broader",
    "the broader story",
    "by terry goodkind",
    "sword of truth",
    "legend of the seeker",
    "throughout the series",
    "throughout the novel",
    "throughout the book",
]


def _strip_escape(response: str) -> str:
    """Return *response* unless it shows signs of answering from outside the excerpts.

    The check is a case-insensitive substring search for every entry of
    ``_ESCAPE_PHRASES``. A single hit discards the whole response and returns
    a fixed not-found message instead. This targets the "helpful override"
    pattern, where the model admits the excerpts lack the answer and then
    supplements from training data anyway.
    """
    haystack = response.lower()
    for marker in _ESCAPE_PHRASES:
        if marker in haystack:
            return (
                "I could not find an answer to that question in the indexed documents. "
                "The answer may be in a document that has not been ingested yet."
            )
    return response


@dataclass(frozen=True)
class Citation:
    """Immutable reference to one retrieved chunk that was offered to the model."""

    # Identifier of the document the chunk came from.
    doc_id: str
    # Page the chunk was taken from.
    page_number: int
    # Excerpt of the chunk text kept for display with the citation.
    snippet: str
    # Retrieval score carried over from the chunk.
    bm25_score: float


@dataclass(frozen=True)
class SynthesisResult:
    """Immutable pairing of the final answer text with its supporting citations."""

    # Text to show the user (model output or one of the canned messages).
    answer: str
    # Citations for the chunks involved; an empty tuple when there were none.
    citations: tuple[Citation, ...]
class Synthesizer:
    """Answer a question from retrieved chunks using an LLM, with citations.

    The LLM object is only required to expose
    ``complete(prompt, system=...)``, whose return value is treated as the
    answer text.
    """

    # 1500 chars (~300 words) per chunk: enough to capture definitions that
    # appear mid-paragraph without blowing past a 32k-context model's limit.
    DEFAULT_MAX_CHUNK_CHARS = 1500
    # Length of the chunk excerpt stored on each Citation.
    DEFAULT_SNIPPET_CHARS = 400

    def __init__(
        self,
        llm,
        *,
        max_chunk_chars: int = DEFAULT_MAX_CHUNK_CHARS,
        snippet_chars: int = DEFAULT_SNIPPET_CHARS,
    ) -> None:
        """Store the LLM handle and the truncation limits.

        Args:
            llm: LLMRouter-like object providing ``complete(prompt, system=...)``.
            max_chunk_chars: Maximum characters of each chunk's text placed in
                the prompt. Lower it for models with a smaller context window.
            snippet_chars: Maximum characters of each chunk's text kept as the
                citation snippet.

        Raises:
            ValueError: If either limit is negative (a negative slice bound
                would silently drop characters from the end of the text
                instead of capping its length).
        """
        if max_chunk_chars < 0 or snippet_chars < 0:
            raise ValueError(
                "max_chunk_chars and snippet_chars must be non-negative"
            )
        self._llm = llm  # LLMRouter
        self._max_chunk_chars = max_chunk_chars
        self._snippet_chars = snippet_chars

    def synthesize(
        self,
        message: str,
        history: list[dict],
        chunks: list[RetrievedChunk],
    ) -> SynthesisResult:
        """Produce an answer to *message* grounded in *chunks*.

        Args:
            message: The user's question; inserted verbatim into the prompt.
            history: Prior conversation turns. Currently unused — the prompt
                is built from *message* and *chunks* only.
            chunks: Retrieved chunks; each must provide ``doc_id``,
                ``page_number``, ``text`` and ``bm25_score``.

        Returns:
            A SynthesisResult. With no chunks, the LLM is not called and the
            canned no-results answer is returned with no citations.
            Otherwise the answer is the LLM output after escape filtering,
            with one citation per chunk in input order (whether or not the
            answer actually cites that chunk).
        """
        if not chunks:
            return SynthesisResult(answer=_NO_RESULTS_ANSWER, citations=())

        chunk_limit = self._max_chunk_chars
        context_parts = [
            f"[p.{c.page_number}]\n{c.text[:chunk_limit]}" for c in chunks
        ]
        context = "\n\n---\n\n".join(context_parts)

        # Quote-first structure: the model must commit to a grounding passage
        # before generating an answer. Forces an explicit "NOT FOUND" admission
        # when the excerpt doesn't contain the answer, rather than the "the
        # excerpt doesn't say... however, in the series..." escape pattern.
        prompt = (
            f"Excerpts from the indexed documents:\n\n{context}\n\n"
            f"---\n\n"
            f"Question: {message}\n\n"
            f"Step 1 — Find the relevant passage: Quote the exact sentence(s) from "
            f"the excerpts above that answer the question, or write NOT FOUND.\n\n"
            f"Step 2 — Answer: Based solely on what you quoted in Step 1, answer "
            f"the question with page citations [p.N]. If Step 1 is NOT FOUND, "
            f"write: \"I could not find an answer to that question in the indexed documents.\""
        )

        answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)
        # Discard responses that drifted outside the provided excerpts.
        answer = _strip_escape(answer)

        snippet_limit = self._snippet_chars
        citations = tuple(
            Citation(
                doc_id=c.doc_id,
                page_number=c.page_number,
                snippet=c.text[:snippet_limit],
                bm25_score=c.bm25_score,
            )
            for c in chunks
        )
        return SynthesisResult(answer=answer, citations=citations)