fix: quote-first prompt structure + escape phrase post-processing to kill hallucinations

Three-layer approach to stop the 7B model from supplementing retrieved context
with training-data knowledge:

1. system prompt redesigned: 'no memory of books/stories/authors' eliminates
   the model's self-permission to draw on parametric knowledge

2. quote-first prompt structure: model must commit to a specific quoted passage
   before generating an answer — explicit NOT FOUND required when excerpts lack
   the answer, preventing the 'excerpt doesn't say X... however in the series...'
   escape pattern

3. _strip_escape() post-processor: catches any residual leakage by scanning for
   known escape phrases ('in the series', 'by terry goodkind', 'it can be assumed',
   etc.) and replacing the response with the canned no-answer message
This commit is contained in:
pyr0ball 2026-05-06 10:30:11 -07:00
parent 32cb21e2cd
commit 3765fbc0f9

View file

@ -11,13 +11,10 @@ from dataclasses import dataclass
from app.services.retriever import RetrievedChunk from app.services.retriever import RetrievedChunk
_SYSTEM_PROMPT = ( _SYSTEM_PROMPT = (
"You are a document assistant. " "You are a strict document retrieval assistant. "
"Answer questions using ONLY the document excerpts provided. " "Your sole job is to extract and present information from the document excerpts given to you. "
"Cite every claim with the source page as [p.N]. " "You have no memory of books, stories, or authors. "
"If the excerpts do not contain the answer, respond with exactly: " "If the excerpts do not contain the answer, say so and stop. Never guess."
"'I could not find an answer to that question in the indexed documents.' "
"Do NOT use knowledge from outside the provided excerpts. "
"Do NOT speculate, infer, or guess beyond what is explicitly stated."
) )
_NO_RESULTS_ANSWER = ( _NO_RESULTS_ANSWER = (
@ -25,6 +22,41 @@ _NO_RESULTS_ANSWER = (
"Try rephrasing, or check that the relevant document has been ingested." "Try rephrasing, or check that the relevant document has been ingested."
) )
# Phrases the model uses when it escapes the provided context and pulls from
# training data. Any response containing one of these is replaced with the
# canned no-answer message.
_ESCAPE_PHRASES = [
"in the series",
"in the novel",
"in the book",
"in the context of the series",
"it can be assumed",
"based on my knowledge",
"based on the broader",
"the broader story",
"by terry goodkind",
"sword of truth",
"legend of the seeker",
"throughout the series",
"throughout the novel",
"throughout the book",
]
def _strip_escape(response: str) -> str:
"""Replace responses that leaked outside the provided context with the canned message.
Detects the 'helpful override' pattern where the model acknowledges the
excerpts lack the answer but supplements from training data anyway.
"""
lower = response.lower()
if any(phrase in lower for phrase in _ESCAPE_PHRASES):
return (
"I could not find an answer to that question in the indexed documents. "
"The answer may be in a document that has not been ingested yet."
)
return response
@dataclass(frozen=True) @dataclass(frozen=True)
class Citation: class Citation:
@ -57,18 +89,23 @@ class Synthesizer:
# appear mid-paragraph without blowing past a 32k-context model's limit. # appear mid-paragraph without blowing past a 32k-context model's limit.
context_parts = [f"[p.{c.page_number}]\n{c.text[:1500]}" for c in chunks] context_parts = [f"[p.{c.page_number}]\n{c.text[:1500]}" for c in chunks]
context = "\n\n---\n\n".join(context_parts) context = "\n\n---\n\n".join(context_parts)
# Repeat the no-outside-knowledge constraint inside the user turn. # Quote-first structure: the model must commit to a grounding passage
# Small models (7B) follow user-turn instructions more reliably than # before generating an answer. Forces an explicit "NOT FOUND" admission
# system-prompt-only constraints when their training data conflicts. # when the excerpt doesn't contain the answer, rather than the "the excerpt
# doesn't say... however, in the series..." escape pattern.
prompt = ( prompt = (
f"Document excerpts:\n\n{context}\n\n" f"Excerpts from the indexed documents:\n\n{context}\n\n"
f"---\n\n"
f"Question: {message}\n\n" f"Question: {message}\n\n"
f"IMPORTANT: Answer using ONLY the excerpts above. " f"Step 1 — Find the relevant passage: Quote the exact sentence(s) from "
f"If the answer is not present in the excerpts, respond with exactly: " f"the excerpts above that answer the question, or write NOT FOUND.\n\n"
f"\"I could not find an answer to that question in the indexed documents.\"" f"Step 2 — Answer: Based solely on what you quoted in Step 1, answer "
f"the question with page citations [p.N]. If Step 1 is NOT FOUND, "
f"write: \"I could not find an answer to that question in the indexed documents.\""
) )
answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT) answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)
answer = _strip_escape(answer)
citations = tuple( citations = tuple(
Citation( Citation(