feat(resume-matcher): tier-aware writing model routing via cf-orch

Premium/ultra users with a custom_writing_model in their session are routed to that model as the first cf-orch candidate, with the shared Qwen2.5-3B-Instruct base kept as the fallback candidate; all other tiers use the shared base model only. complete_json() is unchanged, since fine-tuned writing models aren't trained for structured output. Adds the _request_tier and _request_writing_model ContextVars and their set/get accessors. Resolution order for the writing model: the USER_WRITING_MODELS env var first (Monday path), then Heimdall license-key meta (future path via peregrine#110).
2026-04-26 09:18:55 -07:00 · 2026-04-26 09:18:55 -07:00 · b03add8663
commit b03add8663
parent 5e63faba0c
1 changed file with 33 additions and 1 deletion
--- a/resume_matcher/apps/backend/app/llm.py
+++ b/resume_matcher/apps/backend/app/llm.py
@@ -59,6 +59,12 @@ MAX_JSON_CONTENT_SIZE = 1024 * 1024  # 1MB
 # Request-scoped user_id — set once by session_middleware_dep, read inside _allocate_orch_async.
 # ContextVar is safe for concurrent async requests: each request task gets its own copy.
 _request_user_id: ContextVar[str | None] = ContextVar("request_user_id", default=None)
+_request_tier: ContextVar[str | None] = ContextVar("request_tier", default=None)
+# Custom writing model for premium/ultra users — resolved upstream (USER_WRITING_MODELS env var, then Heimdall license key meta).
+# None for all other tiers; complete() then requests only the shared base model.
+_request_writing_model: ContextVar[str | None] = ContextVar("request_writing_model", default=None)
+
+_PREMIUM_TIERS: frozenset[str] = frozenset({"premium", "ultra"})


 def set_request_user_id(user_id: str | None) -> None:
@@ -69,6 +75,22 @@ def get_request_user_id() -> str | None:
    return _request_user_id.get()


+def set_request_tier(tier: str | None) -> None:
+    _request_tier.set(tier)
+
+
+def get_request_tier() -> str | None:
+    return _request_tier.get()
+
+
+def set_request_writing_model(model: str | None) -> None:
+    _request_writing_model.set(model)
+
+
+def get_request_writing_model() -> str | None:
+    return _request_writing_model.get()
+
+
 class LLMConfig(BaseModel):
    """LLM configuration model."""

@@ -481,10 +503,20 @@ async def complete(
        cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip()
        if cf_orch_url:
            try:
+                # Premium/ultra users get their personal fine-tuned writing model as the
+                # first candidate; the base model is the fallback so cf-orch can
+                # degrade gracefully if the personal model isn't loaded yet.
+                tier = get_request_tier()
+                writing_model = get_request_writing_model()
+                model_candidates: list[str] = (
+                    [writing_model, "Qwen2.5-3B-Instruct"]
+                    if writing_model and tier in _PREMIUM_TIERS
+                    else ["Qwen2.5-3B-Instruct"]
+                )
                async with _allocate_orch_async(
                    cf_orch_url,
                    "vllm",
-                    model_candidates=["Qwen2.5-3B-Instruct"],
+                    model_candidates=model_candidates,
                    ttl_s=300.0,
                    caller="peregrine-resume-matcher",
                ) as alloc: