From b03add8663d766ba550b792914d6cbd200cdf2a7 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Sun, 26 Apr 2026 09:18:55 -0700 Subject: [PATCH] feat(resume-matcher): tier-aware writing model routing via cf-orch Premium/ultra users with a custom_writing_model in their session are routed to that model as the first cf-orch candidate; all other tiers use the shared Qwen2.5-3B-Instruct base. complete_json() is unchanged since fine-tuned writing models aren't trained for structured output. Adds _request_tier and _request_writing_model ContextVars. Resolution order: USER_WRITING_MODELS env var (Monday path) then Heimdall meta (future path via peregrine#110). --- resume_matcher/apps/backend/app/llm.py | 34 +++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/resume_matcher/apps/backend/app/llm.py b/resume_matcher/apps/backend/app/llm.py index a9dcbd7..13b3cff 100644 --- a/resume_matcher/apps/backend/app/llm.py +++ b/resume_matcher/apps/backend/app/llm.py @@ -59,6 +59,12 @@ MAX_JSON_CONTENT_SIZE = 1024 * 1024 # 1MB # Request-scoped user_id — set once by session_middleware_dep, read inside _allocate_orch_async. # ContextVar is safe for concurrent async requests: each request task gets its own copy. _request_user_id: ContextVar[str | None] = ContextVar("request_user_id", default=None) +_request_tier: ContextVar[str | None] = ContextVar("request_tier", default=None) +# Custom writing model for premium/ultra users — resolved from the USER_WRITING_MODELS env var first, then from Heimdall license key meta. +# Set to None for all other tiers; complete() falls back to the shared base model. 
+_request_writing_model: ContextVar[str | None] = ContextVar("request_writing_model", default=None) + +_PREMIUM_TIERS: frozenset[str] = frozenset({"premium", "ultra"}) def set_request_user_id(user_id: str | None) -> None: @@ -69,6 +75,22 @@ def get_request_user_id() -> str | None: return _request_user_id.get() +def set_request_tier(tier: str | None) -> None: + _request_tier.set(tier) + + +def get_request_tier() -> str | None: + return _request_tier.get() + + +def set_request_writing_model(model: str | None) -> None: + _request_writing_model.set(model) + + +def get_request_writing_model() -> str | None: + return _request_writing_model.get() + + class LLMConfig(BaseModel): """LLM configuration model.""" @@ -481,10 +503,20 @@ async def complete( cf_orch_url = os.environ.get("CF_ORCH_URL", "").strip() if cf_orch_url: try: + # Premium/ultra users get their personal fine-tuned writing model as the + # first candidate; the base model is the fallback so cf-orch can + # degrade gracefully if the personal model isn't loaded yet. + tier = get_request_tier() + writing_model = get_request_writing_model() + model_candidates: list[str] = ( + [writing_model, "Qwen2.5-3B-Instruct"] + if writing_model and tier in _PREMIUM_TIERS + else ["Qwen2.5-3B-Instruct"] + ) async with _allocate_orch_async( cf_orch_url, "vllm", - model_candidates=["Qwen2.5-3B-Instruct"], + model_candidates=model_candidates, ttl_s=300.0, caller="peregrine-resume-matcher", ) as alloc: