diff --git a/app/services/retriever.py b/app/services/retriever.py
index 20f0895..9616e3c 100644
--- a/app/services/retriever.py
+++ b/app/services/retriever.py
@@ -170,14 +170,30 @@ class Retriever:
             vec = (1.0 / (1.0 + r.vector_score)) if r.vector_score is not None else 0.0
             return bm25 * 0.5 + vec * 0.5
 
-        ranked = sorted(merged.values(), key=_combined, reverse=True)[:top_k]
+        all_ranked = sorted(merged.values(), key=_combined, reverse=True)
+
         # Discard results where the best match is pure noise (neither BM25 term
         # overlap nor vector similarity exceeded the minimum signal threshold).
         # This lets the caller's empty-result guard fire instead of sending
         # low-confidence chunks to the LLM where it fills gaps with training data.
         MIN_SIGNAL = 0.01
-        if ranked and _combined(ranked[0]) < MIN_SIGNAL:
+        if all_ranked and _combined(all_ranked[0]) < MIN_SIGNAL:
             return []
+
+        # Cap each document at max_per_doc of the top_k slots so one book cannot crowd
+        # out the others when the query matches it heavily by name alone (e.g. a
+        # character name that appears in every chapter). May yield fewer than top_k.
+        max_per_doc = max(2, top_k // 3)
+        ranked: list[RetrievedChunk] = []
+        doc_counts: dict[str, int] = {}
+        for r in all_ranked:
+            if len(ranked) >= top_k:
+                break
+            count = doc_counts.get(r.doc_id, 0)
+            if count < max_per_doc:
+                ranked.append(r)
+                doc_counts[r.doc_id] = count + 1
+
         adjacent = _fetch_adjacent(ranked, db_path)
         return ranked + adjacent
 
diff --git a/app/services/synthesizer.py b/app/services/synthesizer.py
index befba3e..3a93ec2 100644
--- a/app/services/synthesizer.py
+++ b/app/services/synthesizer.py
@@ -57,7 +57,16 @@ class Synthesizer:
         # appear mid-paragraph without blowing past a 32k-context model's limit.
         context_parts = [f"[p.{c.page_number}]\n{c.text[:1500]}" for c in chunks]
         context = "\n\n---\n\n".join(context_parts)
-        prompt = f"Document excerpts:\n\n{context}\n\nQuestion: {message}"
+        # Repeat the no-outside-knowledge constraint in the user turn. Small models (7B)
+        # follow user-turn instructions more reliably than system-prompt-only constraints
+        # when their training data conflicts with the excerpts.
+        prompt = (
+            f"Document excerpts:\n\n{context}\n\n"
+            f"Question: {message}\n\n"
+            f"IMPORTANT: Answer using ONLY the excerpts above. "
+            f"If the answer is not present in the excerpts, respond with exactly: "
+            f"\"I could not find an answer to that question in the indexed documents.\""
+        )
 
         answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)