Two-phase streaming architecture:
Phase 1 (sync thread): IngredientClassifier builds element profiles +
gap list from SQLite — thread-safe, no async context needed
Phase 2 (async): LLMRecipeGenerator.stream_generate() yields tokens via
cf-orch warm vllm (existing /stream-token path) or AsyncOpenAI against
Ollama if the coordinator is unavailable
Backend (app/services/recipe/llm_recipe.py):
- stream_generate() async generator; _try_alloc_for_stream() sync helper
- _stream_openai_compat() static method handles __auto__ model resolution
- LLMRecipeGenerator(None) is safe for streaming (store not used)
Endpoint (app/api/endpoints/recipes.py):
- ?stream=true on POST /recipes/suggest returns StreamingResponse
- X-Accel-Buffering: no prevents nginx buffering without nginx.conf edits
Frontend (api.ts, recipes.ts, RecipesView.vue):
- suggestRecipeStream() uses fetch + ReadableStream (POST; EventSource
only supports GET)
- streamSuggest() action in recipes store builds request internally
- RecipesView.streamRecipe() silently falls back to native SSE when
cf-orch token fetch fails rather than surfacing an error
478 lines
20 KiB
Python
478 lines
20 KiB
Python
"""LLM-driven recipe generator for Levels 3 and 4."""
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
import re
|
||
from contextlib import nullcontext
|
||
from typing import TYPE_CHECKING, AsyncGenerator
|
||
|
||
from openai import AsyncOpenAI, OpenAI
|
||
|
||
if TYPE_CHECKING:
|
||
from app.db.store import Store
|
||
|
||
from app.models.schemas.recipe import RecipeRequest, RecipeResult, RecipeSuggestion
|
||
from app.services.recipe.element_classifier import IngredientProfile
|
||
from app.services.recipe.style_adapter import StyleAdapter
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
|
||
def _filter_allergies(pantry_items: list[str], allergies: list[str]) -> list[str]:
|
||
"""Return pantry items with allergy matches removed (bidirectional substring)."""
|
||
if not allergies:
|
||
return list(pantry_items)
|
||
return [
|
||
item for item in pantry_items
|
||
if not any(
|
||
allergy.lower() in item.lower() or item.lower() in allergy.lower()
|
||
for allergy in allergies
|
||
)
|
||
]
|
||
|
||
|
||
class LLMRecipeGenerator:
|
||
def __init__(self, store: "Store") -> None:
|
||
self._store = store
|
||
self._style_adapter = StyleAdapter()
|
||
|
||
def build_level3_prompt(
|
||
self,
|
||
req: RecipeRequest,
|
||
profiles: list[IngredientProfile],
|
||
gaps: list[str],
|
||
) -> str:
|
||
"""Build a structured element-scaffold prompt for Level 3."""
|
||
allergy_list = req.allergies
|
||
safe_pantry = _filter_allergies(req.pantry_items, allergy_list)
|
||
|
||
covered_elements: list[str] = []
|
||
for profile in profiles:
|
||
for element in profile.elements:
|
||
if element not in covered_elements:
|
||
covered_elements.append(element)
|
||
|
||
lines: list[str] = [
|
||
"You are a creative chef. Generate a recipe using the ingredients below.",
|
||
"IMPORTANT: When you use a pantry item, list it in Ingredients using its exact name from the pantry list. Do not add adjectives, quantities, or cooking states (e.g. use 'butter', not 'unsalted butter' or '2 tbsp butter').",
|
||
"IMPORTANT: Only use pantry items that make culinary sense for the dish. Do NOT force flavoured/sweetened items (vanilla yoghurt, fruit yoghurt, jam, dessert sauces, flavoured syrups) into savoury dishes. Plain yoghurt, plain cream, and plain dairy are fine in savoury cooking.",
|
||
"IMPORTANT: Do not default to the same ingredient repeatedly across dishes. If a pantry item does not genuinely improve this specific dish, leave it out.",
|
||
"",
|
||
f"Pantry items: {', '.join(safe_pantry)}",
|
||
]
|
||
|
||
if req.constraints:
|
||
lines.append(f"Dietary constraints: {', '.join(req.constraints)}")
|
||
|
||
if allergy_list:
|
||
lines.append(f"IMPORTANT — must NOT contain: {', '.join(allergy_list)}")
|
||
|
||
if req.exclude_ingredients:
|
||
lines.append(f"IMPORTANT — user does not want these today: {', '.join(req.exclude_ingredients)}. Do not include them.")
|
||
|
||
lines.append("")
|
||
lines.append(f"Covered culinary elements: {', '.join(covered_elements) or 'none'}")
|
||
|
||
if gaps:
|
||
lines.append(
|
||
f"Missing elements to address: {', '.join(gaps)}. "
|
||
"Incorporate ingredients or techniques to fill these gaps."
|
||
)
|
||
|
||
if req.style_id:
|
||
template = self._style_adapter.get(req.style_id)
|
||
if template:
|
||
lines.append(f"Cuisine style: {template.name}")
|
||
if template.aromatics:
|
||
lines.append(f"Preferred aromatics: {', '.join(template.aromatics[:4])}")
|
||
|
||
unit_line = (
|
||
"Use metric units (grams, ml, Celsius) for all quantities and temperatures."
|
||
if req.unit_system == "metric"
|
||
else "Use imperial units (oz, cups, Fahrenheit) for all quantities and temperatures."
|
||
)
|
||
lines += [
|
||
unit_line,
|
||
"",
|
||
"Reply using EXACTLY this plain-text format — no markdown, no bold, no extra commentary:",
|
||
"Title: <name of the dish>",
|
||
"Ingredients: <comma-separated list>",
|
||
"Directions:",
|
||
"1. <first step>",
|
||
"2. <second step>",
|
||
"3. <continue for each step>",
|
||
"Notes: <optional tips>",
|
||
]
|
||
|
||
return "\n".join(lines)
|
||
|
||
def build_level4_prompt(
|
||
self,
|
||
req: RecipeRequest,
|
||
) -> str:
|
||
"""Build a minimal wildcard prompt for Level 4."""
|
||
allergy_list = req.allergies
|
||
safe_pantry = _filter_allergies(req.pantry_items, allergy_list)
|
||
|
||
lines: list[str] = [
|
||
"Surprise me with a creative, unexpected recipe.",
|
||
"Only use ingredients that make culinary sense together. Do not force flavoured/sweetened items (vanilla yoghurt, flavoured syrups, jam) into savoury dishes.",
|
||
f"Ingredients available: {', '.join(safe_pantry)}",
|
||
]
|
||
|
||
if req.constraints:
|
||
lines.append(f"Constraints: {', '.join(req.constraints)}")
|
||
|
||
if allergy_list:
|
||
lines.append(f"Must NOT contain: {', '.join(allergy_list)}")
|
||
|
||
if req.exclude_ingredients:
|
||
lines.append(f"Do not use today: {', '.join(req.exclude_ingredients)}")
|
||
|
||
unit_line = (
|
||
"Use metric units (grams, ml, Celsius) for all quantities and temperatures."
|
||
if req.unit_system == "metric"
|
||
else "Use imperial units (oz, cups, Fahrenheit) for all quantities and temperatures."
|
||
)
|
||
lines += [
|
||
"Treat any mystery ingredient as a wildcard — use your imagination.",
|
||
unit_line,
|
||
"Reply using EXACTLY this plain-text format — no markdown, no bold:",
|
||
"Title: <name of the dish>",
|
||
"Ingredients: <comma-separated list>",
|
||
"Directions:",
|
||
"1. <first step>",
|
||
"2. <second step>",
|
||
"Notes: <optional tips>",
|
||
]
|
||
|
||
return "\n".join(lines)
|
||
|
||
_SERVICE_TYPE = "cf-text"
|
||
_MODEL_CANDIDATES = ["granite-4.1-8b", "deepseek-r1-1.5b"]
|
||
_TTL_S = 300.0
|
||
_CALLER = "kiwi-recipe"
|
||
|
||
def _get_llm_context(self):
|
||
"""Return a sync context manager that yields an Allocation or None.
|
||
|
||
When CF_ORCH_URL is set, uses CFOrchClient to acquire a cf-text allocation
|
||
(which handles service lifecycle and VRAM). Falls back to nullcontext(None)
|
||
when the env var is absent or CFOrchClient raises on construction.
|
||
"""
|
||
cf_orch_url = os.environ.get("CF_ORCH_URL")
|
||
if cf_orch_url:
|
||
try:
|
||
from circuitforge_orch.client import CFOrchClient
|
||
client = CFOrchClient(cf_orch_url)
|
||
return client.allocate(
|
||
service=self._SERVICE_TYPE,
|
||
model_candidates=self._MODEL_CANDIDATES,
|
||
ttl_s=self._TTL_S,
|
||
caller=self._CALLER,
|
||
pipeline=os.environ.get("CF_APP_NAME") or None,
|
||
)
|
||
except Exception as exc:
|
||
logger.debug("CFOrchClient init failed, falling back to direct URL: %s", exc)
|
||
return nullcontext(None)
|
||
|
||
def _call_llm(self, prompt: str) -> str:
|
||
"""Call the LLM, using CFOrchClient allocation when CF_ORCH_URL is set.
|
||
|
||
With CF_ORCH_URL set: acquires a vLLM allocation via CFOrchClient and
|
||
calls the OpenAI-compatible API directly against the allocated service URL.
|
||
Falls back to LLMRouter when:
|
||
- Allocation succeeded but the service is cold (warm=False) — avoids
|
||
making the user wait for model load; LLMRouter uses Ollama which is
|
||
already running.
|
||
- Allocation succeeded but the connection to the service URL fails — the
|
||
agent may have registered the service but failed to start it.
|
||
Without CF_ORCH_URL: uses LLMRouter directly.
|
||
"""
|
||
ctx = self._get_llm_context()
|
||
alloc = None
|
||
try:
|
||
alloc = ctx.__enter__()
|
||
except Exception as exc:
|
||
msg = str(exc)
|
||
# 429 = coordinator at capacity (all nodes at max_concurrent limit).
|
||
# Don't fall back to LLMRouter — it's also overloaded and the slow
|
||
# fallback causes nginx 504s. Return "" fast so the caller degrades
|
||
# gracefully (empty recipe result) rather than timing out.
|
||
if "429" in msg or "max_concurrent" in msg.lower():
|
||
logger.info("cf-orch at capacity — returning empty result (graceful degradation)")
|
||
if ctx is not None:
|
||
try:
|
||
ctx.__exit__(None, None, None)
|
||
except Exception:
|
||
pass
|
||
return ""
|
||
logger.debug("cf-orch allocation failed, falling back to LLMRouter: %s", exc)
|
||
ctx = None # __enter__ raised — do not call __exit__
|
||
|
||
try:
|
||
if alloc is not None:
|
||
# Skip cold services — model not yet loaded means the user would
|
||
# wait 60–120 s for model load before any response. Use LLMRouter
|
||
# (Ollama) instead, which is already warm on the host.
|
||
if not alloc.warm:
|
||
logger.info(
|
||
"cf-orch vllm allocated but cold (warm=False) — releasing and falling back to LLMRouter"
|
||
)
|
||
raise RuntimeError("vllm cold")
|
||
|
||
base_url = alloc.url.rstrip("/") + "/v1"
|
||
client = OpenAI(base_url=base_url, api_key="any")
|
||
model = alloc.model or "__auto__"
|
||
if model == "__auto__":
|
||
model = client.models.list().data[0].id
|
||
resp = client.chat.completions.create(
|
||
model=model,
|
||
messages=[{"role": "user", "content": prompt}],
|
||
)
|
||
return resp.choices[0].message.content or ""
|
||
else:
|
||
from circuitforge_core.llm.router import LLMRouter
|
||
return LLMRouter().complete(prompt)
|
||
except Exception as exc:
|
||
logger.error("LLM call failed: %s", exc)
|
||
# When cf-orch gave us an allocation but the service is unreachable
|
||
# (cold skip, connection refused, or other error), fall back to
|
||
# LLMRouter rather than silently returning empty.
|
||
# Skip "vllm" in the fallback order — that backend also routes through
|
||
# cf-orch, which would trigger a second (wasted) cold allocation.
|
||
if alloc is not None:
|
||
logger.info("Falling back to LLMRouter after vllm failure")
|
||
try:
|
||
from circuitforge_core.llm.router import LLMRouter
|
||
router = LLMRouter()
|
||
_order = [b for b in (router.config.get("fallback_order") or []) if b != "vllm"]
|
||
return router.complete(prompt, fallback_order=_order or None)
|
||
except Exception as fallback_exc:
|
||
logger.error("LLMRouter fallback also failed: %s", fallback_exc)
|
||
return ""
|
||
finally:
|
||
if ctx is not None:
|
||
try:
|
||
ctx.__exit__(None, None, None)
|
||
except Exception:
|
||
pass
|
||
|
||
# Strips markdown bold/italic markers so "**Directions:**" parses like "Directions:"
|
||
_MD_BOLD = re.compile(r"\*{1,2}([^*]+)\*{1,2}")
|
||
|
||
def _strip_md(self, text: str) -> str:
|
||
return self._MD_BOLD.sub(r"\1", text).strip()
|
||
|
||
def _parse_response(self, response: str) -> dict[str, str | list[str]]:
|
||
"""Parse LLM response text into structured recipe fields.
|
||
|
||
Handles both plain-text and markdown-formatted responses. Directions are
|
||
preserved as newline-separated text so the caller can split on step numbers.
|
||
"""
|
||
result: dict[str, str | list[str]] = {
|
||
"title": "",
|
||
"ingredients": [],
|
||
"directions": "",
|
||
"notes": "",
|
||
}
|
||
|
||
current_key: str | None = None
|
||
buffer: list[str] = []
|
||
|
||
def _flush(key: str | None, buf: list[str]) -> None:
|
||
if key is None or not buf:
|
||
return
|
||
if key == "directions":
|
||
result["directions"] = "\n".join(buf)
|
||
elif key == "ingredients":
|
||
text = " ".join(buf)
|
||
result["ingredients"] = [i.strip() for i in text.split(",") if i.strip()]
|
||
else:
|
||
result[key] = " ".join(buf).strip()
|
||
|
||
for raw_line in response.splitlines():
|
||
line = self._strip_md(raw_line)
|
||
lower = line.lower()
|
||
if lower.startswith("title:"):
|
||
_flush(current_key, buffer)
|
||
current_key, buffer = "title", [line.split(":", 1)[1].strip()]
|
||
elif lower.startswith("ingredients:"):
|
||
_flush(current_key, buffer)
|
||
current_key, buffer = "ingredients", [line.split(":", 1)[1].strip()]
|
||
elif lower.startswith("directions:"):
|
||
_flush(current_key, buffer)
|
||
rest = line.split(":", 1)[1].strip()
|
||
current_key, buffer = "directions", ([rest] if rest else [])
|
||
elif lower.startswith("notes:"):
|
||
_flush(current_key, buffer)
|
||
current_key, buffer = "notes", [line.split(":", 1)[1].strip()]
|
||
elif current_key and line.strip():
|
||
buffer.append(line.strip())
|
||
elif current_key is None and line.strip() and ":" not in line:
|
||
# Before any section header: a 2-10 word colon-free line is the dish name
|
||
words = line.split()
|
||
if 2 <= len(words) <= 10 and not result["title"]:
|
||
result["title"] = line.strip()
|
||
|
||
_flush(current_key, buffer)
|
||
return result
|
||
|
||
def generate(
|
||
self,
|
||
req: RecipeRequest,
|
||
profiles: list[IngredientProfile],
|
||
gaps: list[str],
|
||
) -> RecipeResult:
|
||
"""Generate a recipe via LLM and return a RecipeResult."""
|
||
if req.level == 4:
|
||
prompt = self.build_level4_prompt(req)
|
||
else:
|
||
prompt = self.build_level3_prompt(req, profiles, gaps)
|
||
|
||
response = self._call_llm(prompt)
|
||
|
||
if not response:
|
||
return RecipeResult(suggestions=[], element_gaps=gaps)
|
||
|
||
parsed = self._parse_response(response)
|
||
|
||
raw_directions = parsed.get("directions", "")
|
||
if isinstance(raw_directions, str):
|
||
# Split on newlines; strip leading step numbers ("1.", "2.", "- ", "* ")
|
||
_step_prefix = re.compile(r"^\s*(?:\d+[.)]\s*|[-*]\s+)")
|
||
directions_list = [
|
||
_step_prefix.sub("", s).strip()
|
||
for s in raw_directions.splitlines()
|
||
if s.strip()
|
||
]
|
||
else:
|
||
directions_list = list(raw_directions)
|
||
raw_notes = parsed.get("notes", "")
|
||
notes_str: str = raw_notes if isinstance(raw_notes, str) else ""
|
||
|
||
all_ingredients: list[str] = list(parsed.get("ingredients", []))
|
||
pantry_set = {item.lower() for item in (req.pantry_items or [])}
|
||
|
||
# Strip leading quantities/units (e.g. "2 cups rice" → "rice") before
|
||
# checking against pantry, since LLMs return formatted ingredient strings.
|
||
_qty_re = re.compile(
|
||
r"^\s*[\d½¼¾⅓⅔]+[\s/\-]*" # leading digits or fractions
|
||
r"(?:cup|cups|tbsp|tsp|tablespoon|teaspoon|oz|lb|lbs|g|kg|"
|
||
r"can|cans|clove|cloves|bunch|package|pkg|slice|slices|"
|
||
r"piece|pieces|pinch|dash|handful|head|heads|large|small|medium"
|
||
r")s?\b[,\s]*",
|
||
re.IGNORECASE,
|
||
)
|
||
missing = []
|
||
for ing in all_ingredients:
|
||
bare = _qty_re.sub("", ing).strip().lower()
|
||
if bare not in pantry_set and ing.lower() not in pantry_set:
|
||
missing.append(bare or ing)
|
||
|
||
suggestion = RecipeSuggestion(
|
||
id=0,
|
||
title=parsed.get("title") or "LLM Recipe",
|
||
match_count=len(req.pantry_items),
|
||
element_coverage={},
|
||
missing_ingredients=missing,
|
||
directions=directions_list,
|
||
notes=notes_str,
|
||
level=req.level,
|
||
is_wildcard=(req.level == 4),
|
||
)
|
||
|
||
return RecipeResult(
|
||
suggestions=[suggestion],
|
||
element_gaps=gaps,
|
||
)
|
||
|
||
async def stream_generate(
|
||
self,
|
||
req: RecipeRequest,
|
||
profiles: list,
|
||
gaps: list[str],
|
||
) -> AsyncGenerator[str, None]:
|
||
"""Stream LLM tokens for L3/L4. Yields raw text chunks as they arrive.
|
||
|
||
Tries cf-orch warm vllm first; falls back to Ollama via AsyncOpenAI.
|
||
When neither is reachable, falls back to blocking _call_llm and yields
|
||
the complete response as a single chunk so the caller always gets output.
|
||
"""
|
||
if req.level == 4:
|
||
prompt = self.build_level4_prompt(req)
|
||
else:
|
||
prompt = self.build_level3_prompt(req, profiles, gaps)
|
||
|
||
# Phase 1: try cf-orch warm vllm (sync allocation, wrapped in thread)
|
||
alloc_info = await asyncio.to_thread(self._try_alloc_for_stream)
|
||
if alloc_info is not None:
|
||
alloc, ctx = alloc_info
|
||
try:
|
||
async for token in self._stream_openai_compat(
|
||
alloc.url.rstrip("/") + "/v1", "any", alloc.model or "__auto__", prompt
|
||
):
|
||
yield token
|
||
return
|
||
except Exception as exc:
|
||
logger.debug("cf-orch stream failed, falling back to Ollama: %s", exc)
|
||
finally:
|
||
await asyncio.to_thread(lambda: _safe_exit(ctx))
|
||
|
||
# Phase 2: Ollama streaming via OpenAI-compat API
|
||
from circuitforge_core.llm.router import LLMRouter
|
||
router = LLMRouter()
|
||
ollama = router.config.get("backends", {}).get("ollama")
|
||
if ollama and ollama.get("enabled", True):
|
||
base_url = ollama["base_url"]
|
||
model = ollama.get("model", "llama3")
|
||
try:
|
||
async for token in self._stream_openai_compat(base_url, "any", model, prompt):
|
||
yield token
|
||
return
|
||
except Exception as exc:
|
||
logger.warning("Ollama streaming failed, falling back to blocking: %s", exc)
|
||
|
||
# Phase 3: blocking fallback — yields full response at once
|
||
result = await asyncio.to_thread(self._call_llm, prompt)
|
||
if result:
|
||
yield result
|
||
|
||
def _try_alloc_for_stream(self):
|
||
"""Attempt cf-orch allocation synchronously; return (alloc, ctx) or None."""
|
||
ctx = self._get_llm_context()
|
||
try:
|
||
alloc = ctx.__enter__()
|
||
if alloc is not None and alloc.warm:
|
||
return alloc, ctx
|
||
# Not warm — release and signal fallback
|
||
_safe_exit(ctx)
|
||
except Exception as exc:
|
||
logger.debug("cf-orch alloc for stream failed: %s", exc)
|
||
return None
|
||
|
||
@staticmethod
|
||
async def _stream_openai_compat(
|
||
base_url: str, api_key: str, model: str, prompt: str
|
||
) -> AsyncGenerator[str, None]:
|
||
client = AsyncOpenAI(base_url=base_url, api_key=api_key)
|
||
if model == "__auto__":
|
||
models = await client.models.list()
|
||
model = models.data[0].id
|
||
stream = await client.chat.completions.create(
|
||
model=model,
|
||
messages=[{"role": "user", "content": prompt}],
|
||
stream=True,
|
||
)
|
||
async for chunk in stream:
|
||
if chunk.choices and chunk.choices[0].delta.content:
|
||
yield chunk.choices[0].delta.content
|
||
|
||
|
||
def _safe_exit(ctx) -> None:
|
||
try:
|
||
ctx.__exit__(None, None, None)
|
||
except Exception:
|
||
pass
|