- Strengthen the synthesizer system prompt: use a hard "respond with exactly" constraint instead of the soft "say so", removing any wiggle room for the model to supplement from training data.
- Add an early return in synthesize() when chunks is empty (belt-and-suspenders alongside the existing guard in chat.py).
- Add a MIN_SIGNAL threshold (0.01) in the retriever: if the top combined score is below the threshold, return empty so the caller's no-results path fires instead of sending noise chunks to the LLM.
73 lines
2.2 KiB
Python
73 lines
2.2 KiB
Python
# app/services/synthesizer.py
|
|
"""
|
|
LLM answer synthesis over retrieved chunks.
|
|
|
|
BSL 1.1 — requires LLMRouter (Ollama BYOK or cloud tier).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from app.services.retriever import RetrievedChunk
|
|
|
|
# System prompt passed with every synthesis request. The adjacent literals are
# joined by implicit concatenation into one string; each fragment ends with a
# space so the sentences stay separated.
_SYSTEM_PROMPT = (
    # Role and grounding: answer from the supplied excerpts only.
    "You are a document assistant. "
    "Answer questions using ONLY the document excerpts provided. "
    # Citation format; the prompt builder labels each excerpt as [p.<page>].
    "Cite every claim with the source page as [p.N]. "
    # Fixed refusal sentence for when the excerpts do not contain the answer.
    "If the excerpts do not contain the answer, respond with exactly: "
    "'I could not find an answer to that question in the indexed documents.' "
    # Explicit bans on outside knowledge and on speculation.
    "Do NOT use knowledge from outside the provided excerpts. "
    "Do NOT speculate, infer, or guess beyond what is explicitly stated."
)
|
|
|
|
# Canned answer returned when there are no retrieved chunks to synthesize
# from. It is returned directly, so no LLM call is made on that path.
_NO_RESULTS_ANSWER = (
    "I could not find any relevant passages in the indexed documents for that question. "
    "Try rephrasing, or check that the relevant document has been ingested."
)
|
|
|
|
|
|
@dataclass(frozen=True)
class Citation:
    """Immutable record describing one source passage attached to an answer.

    Field order is part of the interface: instances may be built positionally.
    """

    # Identifier of the document the passage belongs to.
    doc_id: str
    # Page the passage was taken from.
    page_number: int
    # Short excerpt of the passage text.
    snippet: str
    # Score value carried over from the retrieved chunk's ``bm25_score``.
    bm25_score: float
|
|
|
|
|
|
@dataclass(frozen=True)
class SynthesisResult:
    """Immutable outcome of one synthesis call: the answer plus its sources."""

    # Answer text to show for the question.
    answer: str
    # Source citations accompanying the answer; may be an empty tuple.
    citations: tuple[Citation, ...]
|
|
|
|
|
|
class Synthesizer:
    """Build an LLM answer, with citations, from retrieved document chunks.

    The two size limits below are class-level defaults, so they can be tuned
    per instance through the constructor or per subclass by overriding the
    attribute.
    """

    # 1500 chars (~300 words) per chunk: enough to capture definitions that
    # appear mid-paragraph without blowing past a 32k-context model's limit.
    max_chunk_chars: int = 1500

    # Number of leading characters of each chunk copied into its Citation.
    snippet_chars: int = 400

    def __init__(
        self,
        llm,  # LLMRouter
        *,
        max_chunk_chars: int | None = None,
        snippet_chars: int | None = None,
    ) -> None:
        """Store the LLM client and any per-instance size overrides.

        Args:
            llm: Client object; ``synthesize`` calls
                ``llm.complete(prompt, system=...)`` on it.
            max_chunk_chars: Maximum characters of each chunk placed in the
                prompt. ``None`` keeps the class default.
            snippet_chars: Maximum characters of each chunk copied into its
                citation snippet. ``None`` keeps the class default.

        Raises:
            ValueError: If either limit is negative. A negative slice bound
                would silently drop the *end* of the text instead of
                limiting its length.
        """
        if max_chunk_chars is not None and max_chunk_chars < 0:
            raise ValueError(
                f"max_chunk_chars must be non-negative, got {max_chunk_chars}"
            )
        if snippet_chars is not None and snippet_chars < 0:
            raise ValueError(
                f"snippet_chars must be non-negative, got {snippet_chars}"
            )

        self._llm = llm
        # Only shadow the class defaults when an override was supplied.
        if max_chunk_chars is not None:
            self.max_chunk_chars = max_chunk_chars
        if snippet_chars is not None:
            self.snippet_chars = snippet_chars

    def synthesize(
        self,
        message: str,
        history: list[dict],
        chunks: list[RetrievedChunk],
    ) -> SynthesisResult:
        """Answer ``message`` using only the supplied ``chunks``.

        Args:
            message: The user's question; appended to the prompt verbatim.
            history: Prior conversation turns. Currently unused: it is not
                included in the prompt sent to the LLM.
            chunks: Retrieved passages. Each must expose ``page_number``,
                ``text``, ``doc_id`` and ``bm25_score``.

        Returns:
            A ``SynthesisResult`` holding the LLM's answer and one citation
            per chunk, in chunk order. When ``chunks`` is empty the canned
            no-results answer is returned with no citations and the LLM is
            not called.
        """
        # Never ask the model to answer with no supporting excerpts.
        if not chunks:
            return SynthesisResult(answer=_NO_RESULTS_ANSWER, citations=())

        excerpt_limit = self.max_chunk_chars
        snippet_limit = self.snippet_chars

        # Each excerpt is labelled with its page so the model can cite [p.N].
        context = "\n\n---\n\n".join(
            f"[p.{c.page_number}]\n{c.text[:excerpt_limit]}" for c in chunks
        )
        prompt = f"Document excerpts:\n\n{context}\n\nQuestion: {message}"

        answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)

        # Every chunk shown to the model is reported as a citation.
        citations = tuple(
            Citation(
                doc_id=c.doc_id,
                page_number=c.page_number,
                snippet=c.text[:snippet_limit],
                bm25_score=c.bm25_score,
            )
            for c in chunks
        )
        return SynthesisResult(answer=answer, citations=citations)
|