fix: quote-first prompt structure + escape phrase post-processing to kill hallucinations

Three-layer approach to stop the 7B model from supplementing retrieved context with training-data knowledge:

1. System prompt redesigned: 'no memory of books/stories/authors' eliminates the model's self-permission to draw on parametric knowledge.
2. Quote-first prompt structure: the model must commit to a specific quoted passage before generating an answer. An explicit NOT FOUND is required when the excerpts lack the answer, preventing the 'excerpt doesn't say X... however in the series...' escape pattern.
3. _strip_escape() post-processor: catches any residual leakage by scanning for known escape phrases ('in the series', 'by terry goodkind', 'it can be assumed', etc.) and replacing the response with the canned no-answer message.
2026-05-06 10:30:11 -07:00 · 2026-05-06 10:30:11 -07:00 · 3765fbc0f9
commit 3765fbc0f9
parent 32cb21e2cd
1 changed files with 51 additions and 14 deletions
--- a/app/services/synthesizer.py
+++ b/app/services/synthesizer.py
@ -11,13 +11,10 @@ from dataclasses import dataclass
 from app.services.retriever import RetrievedChunk

 _SYSTEM_PROMPT = (
-    "You are a document assistant. "
-    "Answer questions using ONLY the document excerpts provided. "
-    "Cite every claim with the source page as [p.N]. "
-    "If the excerpts do not contain the answer, respond with exactly: "
-    "'I could not find an answer to that question in the indexed documents.' "
-    "Do NOT use knowledge from outside the provided excerpts. "
-    "Do NOT speculate, infer, or guess beyond what is explicitly stated."
+    "You are a strict document retrieval assistant. "
+    "Your sole job is to extract and present information from the document excerpts given to you. "
+    "You have no memory of books, stories, or authors. "
+    "If the excerpts do not contain the answer, say so and stop. Never guess."
 )

 _NO_RESULTS_ANSWER = (
@ -25,6 +22,41 @@ _NO_RESULTS_ANSWER = (
    "Try rephrasing, or check that the relevant document has been ingested."
 )

+# Phrases the model uses when it escapes the provided context and pulls from
+# training data; a response containing any of them is replaced with the canned
+# no-answer message. Keep entries lowercase: they are matched against response.lower().
+_ESCAPE_PHRASES = [
+    "in the series",
+    "in the novel",
+    "in the book",
+    "in the context of the series",
+    "it can be assumed",
+    "based on my knowledge",
+    "based on the broader",
+    "the broader story",
+    "by terry goodkind",
+    "sword of truth",
+    "legend of the seeker",
+    "throughout the series",
+    "throughout the novel",
+    "throughout the book",
+]
+
+
+def _strip_escape(response: str) -> str:
+    """Return *response*, or the canned no-answer message if it leaked outside the excerpts.
+
+    A leak is any phrase from _ESCAPE_PHRASES, matched case-insensitively as a substring.
+    Whitespace runs are collapsed first so a phrase wrapped across lines is still caught.
+    """
+    normalized = " ".join(response.lower().split())
+    if any(phrase in normalized for phrase in _ESCAPE_PHRASES):
+        return (
+            "I could not find an answer to that question in the indexed documents. "
+            "The answer may be in a document that has not been ingested yet."
+        )
+    return response
+

@dataclass(frozen=True)
 class Citation:
@ -57,18 +89,23 @@ class Synthesizer:
        # appear mid-paragraph without blowing past a 32k-context model's limit.
        context_parts = [f"[p.{c.page_number}]\n{c.text[:1500]}" for c in chunks]
        context = "\n\n---\n\n".join(context_parts)
-        # Repeat the no-outside-knowledge constraint inside the user turn.
-        # Small models (7B) follow user-turn instructions more reliably than
-        # system-prompt-only constraints when their training data conflicts.
+        # Quote-first structure: the model must commit to a grounding passage
+        # before generating an answer. Forces an explicit "NOT FOUND" admission
+        # when the excerpt doesn't contain the answer, rather than the "the excerpt
+        # doesn't say... however, in the series..." escape pattern.
        prompt = (
-            f"Document excerpts:\n\n{context}\n\n"
+            f"Excerpts from the indexed documents:\n\n{context}\n\n"
+            f"---\n\n"
            f"Question: {message}\n\n"
-            f"IMPORTANT: Answer using ONLY the excerpts above. "
-            f"If the answer is not present in the excerpts, respond with exactly: "
-            f"\"I could not find an answer to that question in the indexed documents.\""
+            f"Step 1 — Find the relevant passage: Quote the exact sentence(s) from "
+            f"the excerpts above that answer the question, or write NOT FOUND.\n\n"
+            f"Step 2 — Answer: Based solely on what you quoted in Step 1, answer "
+            f"the question with page citations [p.N]. If Step 1 is NOT FOUND, "
+            f"write: \"I could not find an answer to that question in the indexed documents.\""
        )

        answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)
+        answer = _strip_escape(answer)

        citations = tuple(
            Citation(