pagepiper/app/services/synthesizer.py
pyr0ball 32cb21e2cd fix: reinforce no-hallucination constraint in user-turn prompt; cap per-doc retrieval
synthesizer: repeat the no-outside-knowledge rule inside the user message turn —
small models (7B) follow user-turn instructions more reliably than system-prompt
alone when parametric memory competes with the retrieved context

retriever: cap each document to max(2, top_k//3) slots in the ranked list so
one book cannot flood all result slots on character-name BM25 matches — forces
coverage across more documents when the answer may be in any of them
2026-05-06 10:26:51 -07:00

82 lines
2.7 KiB
Python

# app/services/synthesizer.py
"""
LLM answer synthesis over retrieved chunks.
BSL 1.1 — requires LLMRouter (Ollama BYOK or cloud tier).
"""
from __future__ import annotations
from dataclasses import dataclass
from app.services.retriever import RetrievedChunk
# System-role instructions sent with every synthesis request. The sentences
# are kept as separate fragments so each rule can be read (and diffed) on its
# own; they are joined with single spaces into one paragraph.
_SYSTEM_PROMPT = " ".join(
    (
        "You are a document assistant.",
        "Answer questions using ONLY the document excerpts provided.",
        "Cite every claim with the source page as [p.N].",
        "If the excerpts do not contain the answer, respond with exactly:",
        "'I could not find an answer to that question in the indexed documents.'",
        "Do NOT use knowledge from outside the provided excerpts.",
        "Do NOT speculate, infer, or guess beyond what is explicitly stated.",
    )
)
# Canned reply returned when retrieval produced no chunks at all; in that case
# the LLM is never called.
_NO_RESULTS_ANSWER = (
    "I could not find any relevant passages in the indexed documents "
    "for that question. "
    "Try rephrasing, or check that the relevant document has been ingested."
)
@dataclass(frozen=True)
class Citation:
    """Immutable pointer back to one retrieved chunk that backed an answer."""

    # Identifier of the document the chunk came from.
    doc_id: str
    # Page the chunk was taken from; excerpts are labelled "[p.N]" with it.
    page_number: int
    # Leading slice of the chunk text kept for display alongside the answer.
    snippet: str
    # Retrieval score carried over from the chunk's ``bm25_score``.
    bm25_score: float
@dataclass(frozen=True)
class SynthesisResult:
    """Immutable outcome of one synthesis call: the answer plus its sources."""

    # Answer text: either the LLM's reply or the canned no-results message.
    answer: str
    # Citations for the chunks given to the LLM; empty when there were none.
    citations: tuple[Citation, ...]
class Synthesizer:
    """Produce an LLM answer grounded in retrieved document chunks.

    The injected ``llm`` is only ever used through
    ``llm.complete(prompt, system=...)``; its return value becomes the answer.
    """

    # 1500 chars (~300 words) per chunk: enough to capture definitions that
    # appear mid-paragraph without blowing past a 32k-context model's limit.
    _CONTEXT_CHARS = 1500
    # Number of leading characters of each chunk kept as the citation snippet.
    _SNIPPET_CHARS = 400

    def __init__(self, llm) -> None:  # LLMRouter
        self._llm = llm

    def synthesize(
        self,
        message: str,
        history: list[dict],
        chunks: list[RetrievedChunk],
    ) -> SynthesisResult:
        """Answer ``message`` using only the text of ``chunks``.

        Args:
            message: The user's question; embedded verbatim in the prompt.
            history: Accepted for interface compatibility but not read by
                this method. NOTE(review): confirm whether prior turns are
                meant to be forwarded to the LLM.
            chunks: Retrieved chunks, in the order they should be shown to
                the model. Each must expose ``doc_id``, ``page_number``,
                ``text`` and ``bm25_score``.

        Returns:
            A ``SynthesisResult``. With no chunks, the canned no-results
            answer and an empty citation tuple are returned without calling
            the LLM. Otherwise the LLM's reply is returned together with one
            citation per input chunk, in input order, whether or not the
            reply actually cites that chunk.
        """
        if not chunks:
            return SynthesisResult(answer=_NO_RESULTS_ANSWER, citations=())

        # NOTE(review): excerpts are labelled by page number only, so chunks
        # from different documents that share a page number look identical to
        # the model. Left unchanged because the system prompt asks for [p.N].
        excerpts = [
            f"[p.{chunk.page_number}]\n{chunk.text[:self._CONTEXT_CHARS]}"
            for chunk in chunks
        ]
        context = "\n\n---\n\n".join(excerpts)

        # Repeat the no-outside-knowledge constraint inside the user turn.
        # Small models (7B) follow user-turn instructions more reliably than
        # system-prompt-only constraints when their training data conflicts.
        prompt = (
            f"Document excerpts:\n\n{context}\n\n"
            f"Question: {message}\n\n"
            "IMPORTANT: Answer using ONLY the excerpts above. "
            "If the answer is not present in the excerpts, respond with exactly: "
            '"I could not find an answer to that question in the indexed documents."'
        )
        answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)

        citations = tuple(
            Citation(
                doc_id=chunk.doc_id,
                page_number=chunk.page_number,
                snippet=chunk.text[:self._SNIPPET_CHARS],
                bm25_score=chunk.bm25_score,
            )
            for chunk in chunks
        )
        return SynthesisResult(answer=answer, citations=citations)