From 2d095f0090f52b424bee4bb5180a31b73245bf33 Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Thu, 2 Apr 2026 16:36:07 -0700
Subject: [PATCH] fix(llm-server): handle transformers 5.x BatchEncoding; use
 dtype kwarg

- apply_chat_template() returns BatchEncoding in transformers 5.x (not bare
  tensor); extract .input_ids explicitly with fallback for 4.x compat
- Switch from deprecated torch_dtype= to dtype= in from_pretrained()
---
 circuitforge_core/resources/inference/llm_server.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/circuitforge_core/resources/inference/llm_server.py b/circuitforge_core/resources/inference/llm_server.py
index 932e860..a049e0f 100644
--- a/circuitforge_core/resources/inference/llm_server.py
+++ b/circuitforge_core/resources/inference/llm_server.py
@@ -62,11 +62,13 @@ def chat_completions(req: ChatRequest) -> dict[str, Any]:
     conversation = [{"role": m.role, "content": m.content} for m in req.messages]
 
     try:
-        input_ids = _tokenizer.apply_chat_template(
+        encoded = _tokenizer.apply_chat_template(
             conversation,
             return_tensors="pt",
             add_generation_prompt=True,
-        ).to(_device)
+        )
+        # transformers 5.x returns BatchEncoding; 4.x returned a bare tensor
+        input_ids = (encoded.input_ids if hasattr(encoded, "input_ids") else encoded).to(_device)
     except Exception as exc:
         raise HTTPException(500, detail=f"Tokenisation failed: {exc}")
 
@@ -113,7 +115,7 @@ def _load_model(model_path: str, gpu_id: int) -> None:
     _tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
     _model = AutoModelForCausalLM.from_pretrained(
         model_path,
-        torch_dtype=torch.float16 if "cuda" in _device else torch.float32,
+        dtype=torch.float16 if "cuda" in _device else torch.float32,
         device_map={"": _device},
         trust_remote_code=True,
     )