From b03add8663d766ba550b792914d6cbd200cdf2a7 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Sun, 26 Apr 2026 09:18:55 -0700
Subject: [PATCH] feat(resume-matcher): tier-aware writing model routing via
 cf-orch

Premium/ultra users with a custom_writing_model in their session are
routed to that model as the first cf-orch candidate, with the shared
Qwen2.5-3B-Instruct base kept as the fallback; all other tiers use
only the shared base. complete_json() is unchanged, since fine-tuned
writing models aren't trained for structured output.

Adds the _request_tier and _request_writing_model ContextVars and their
set_/get_ accessors. Resolution order for the custom writing model:
USER_WRITING_MODELS env var first (Monday path), then Heimdall meta
(future path via peregrine#110).
---
 resume_matcher/apps/backend/app/llm.py | 34 +++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/resume_matcher/apps/backend/app/llm.py b/resume_matcher/apps/backend/app/llm.py
index a9dcbd7..13b3cff 100644
--- a/resume_matcher/apps/backend/app/llm.py
+++ b/resume_matcher/apps/backend/app/llm.py
@@ -59,6 +59,12 @@ MAX_JSON_CONTENT_SIZE = 1024 * 1024  # 1MB
 # Request-scoped user_id — set once by session_middleware_dep, read inside _allocate_orch_async.
 # ContextVar is safe for concurrent async requests: each request task gets its own copy.
 _request_user_id: ContextVar[str | None] = ContextVar("request_user_id", default=None)
+_request_tier: ContextVar[str | None] = ContextVar("request_tier", default=None)
+# Custom writing model for premium/ultra users — resolved from the USER_WRITING_MODELS env var
+# or Heimdall license key meta. None for all other tiers; complete() then uses only the base model.
+_request_writing_model: ContextVar[str | None] = ContextVar("request_writing_model", default=None)
+
+_PREMIUM_TIERS: frozenset[str] = frozenset({"premium", "ultra"})
 
 
 def set_request_user_id(user_id: str | None) -> None:
@@ -69,6 +75,22 @@ def get_request_user_id() -> str | None:
     return _request_user_id.get()
 
 
+def set_request_tier(tier: str | None) -> None:
+    _request_tier.set(tier)
+
+
+def get_request_tier() -> str | None:
+    return _request_tier.get()
+
+
+def set_request_writing_model(model: str | None) -> None:
+    _request_writing_model.set(model)
+
+
+def get_request_writing_model() -> str | None:
+    return _request_writing_model.get()
+
+
 class LLMConfig(BaseModel):
     """LLM configuration model."""
 
@@ -481,10 +503,20 @@ async def complete(
         cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip()
         if cf_orch_url:
             try:
+                # Premium/ultra users get their personal fine-tuned writing model as the
+                # first candidate; the base model is the fallback so cf-orch can
+                # degrade gracefully if the personal model isn't loaded yet.
+                tier = get_request_tier()
+                writing_model = get_request_writing_model()
+                model_candidates: list[str] = (
+                    [writing_model, "Qwen2.5-3B-Instruct"]
+                    if writing_model and tier in _PREMIUM_TIERS
+                    else ["Qwen2.5-3B-Instruct"]
+                )
                 async with _allocate_orch_async(
                     cf_orch_url,
                     "vllm",
-                    model_candidates=["Qwen2.5-3B-Instruct"],
+                    model_candidates=model_candidates,
                     ttl_s=300.0,
                     caller="peregrine-resume-matcher",
                 ) as alloc: