fix(llm-server): handle transformers 5.x BatchEncoding; use dtype kwarg
- apply_chat_template() returns a BatchEncoding in transformers 5.x (not a bare tensor); extract .input_ids explicitly, with a fallback for 4.x compatibility
- Switch from the deprecated torch_dtype= keyword to dtype= in from_pretrained()
This commit is contained in:
parent
c78341fc6f
commit
2d095f0090
1 changed file with 5 additions and 3 deletions
|
|
@ -62,11 +62,13 @@ def chat_completions(req: ChatRequest) -> dict[str, Any]:
|
||||||
|
|
||||||
conversation = [{"role": m.role, "content": m.content} for m in req.messages]
|
conversation = [{"role": m.role, "content": m.content} for m in req.messages]
|
||||||
try:
|
try:
|
||||||
input_ids = _tokenizer.apply_chat_template(
|
encoded = _tokenizer.apply_chat_template(
|
||||||
conversation,
|
conversation,
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
add_generation_prompt=True,
|
add_generation_prompt=True,
|
||||||
).to(_device)
|
)
|
||||||
|
# transformers 5.x returns BatchEncoding; 4.x returned a bare tensor
|
||||||
|
input_ids = (encoded.input_ids if hasattr(encoded, "input_ids") else encoded).to(_device)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise HTTPException(500, detail=f"Tokenisation failed: {exc}")
|
raise HTTPException(500, detail=f"Tokenisation failed: {exc}")
|
||||||
|
|
||||||
|
|
@ -113,7 +115,7 @@ def _load_model(model_path: str, gpu_id: int) -> None:
|
||||||
_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
|
||||||
_model = AutoModelForCausalLM.from_pretrained(
|
_model = AutoModelForCausalLM.from_pretrained(
|
||||||
model_path,
|
model_path,
|
||||||
torch_dtype=torch.float16 if "cuda" in _device else torch.float32,
|
dtype=torch.float16 if "cuda" in _device else torch.float32,
|
||||||
device_map={"": _device},
|
device_map={"": _device},
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue