Python inference with the transformers library
Code
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. load model
model_path = "/ssd3/models/Qwen2.5-0.5B-Instruct/"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map='cuda',
    torch_dtype=torch.float16,
)

# 2. init tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Need to set the padding token to the eos token for generation
tokenizer.pad_token = tokenizer.eos_token

prompts = [
    "你是谁",  # "Who are you?"
]

for prompt in prompts:
    messages = [
        {"role": "user", "content": prompt},
    ]
    batch = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # 3. tokenize
    model_inputs = tokenizer([batch], return_tensors="pt").to('cuda')
    # model_inputs = tokenizer([prompt], padding=True, truncation=True, return_tensors="pt").to('cuda')
    # 4. infer
    generated_ids = model.generate(**model_inputs, max_new_tokens=16)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    # 5. detokenize
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    print(response)
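The commented-out tokenizer call in step 3 hints at batching several prompts in one forward pass. A minimal sketch of that variant, assuming left padding (decoder-only models append new tokens at the right edge, so prompts must be padded on the left; the pad token was already set to eos above):

tokenizer.padding_side = 'left'  # pad prompts on the left for decoder-only generation
batch_texts = [
    tokenizer.apply_chat_template([{"role": "user", "content": p}],
                                  tokenize=False, add_generation_prompt=True)
    for p in ["你是谁", "你好"]  # "Who are you?", "Hello"
]
model_inputs = tokenizer(batch_texts, padding=True, return_tensors="pt").to('cuda')
generated_ids = model.generate(**model_inputs, max_new_tokens=64)
# with left padding, every row's prompt ends at the same index,
# so slicing off len(input_ids) leaves only the newly generated tokens
generated_ids = [out[len(inp):] for inp, out in zip(model_inputs.input_ids, generated_ids)]
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))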
Debug info
# debug model:
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbedding()
  )
  (lm_head): Linear(in_features=896, out_features=151936, bias=False)
)
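As a sanity check, the shapes in this dump add up to the "0.5B" in the model name. A quick sketch of the arithmetic; the one outside assumption is that lm_head shares its weight with embed_tokens (weight tying, which the Qwen2.5-0.5B config uses), so it is not counted twice:

embed = 151936 * 896                                            # embed_tokens
attn = (896 * 896 + 896) + 2 * (896 * 128 + 128) + 896 * 896    # q (bias) + k,v (bias) + o
mlp = 3 * (896 * 4864)                                          # gate_proj + up_proj + down_proj
norms = 2 * 896                                                 # the two RMSNorms per layer
per_layer = attn + mlp + norms
total = embed + 24 * per_layer + 896                            # plus the final norm
print(f"{total:,}")                                             # 494,032,768 ≈ 0.5B
# cross-check against the loaded model (shared weights are counted once):
print(sum(p.numel() for p in model.parameters()))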
# debug batch:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
你是谁<|im_end|>
<|im_start|>assistant

# debug model_inputs:
{'input_ids': tensor([[151644, 8948, 198, 2610, 525, 1207, 16948, 11, 3465, 553,
                       54364, 14817, 13, 1446, 525, 264, 10950, 17847, 13, 151645,
                       198, 151644, 872, 198, 105043, 100165, 151645, 198, 151644,
                       77091, 198]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
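To connect these ids back to the template text above, a few of them can be decoded individually; this is an optional check using the tokenizer already loaded (151644 and 151645 are the <|im_start|> and <|im_end|> markers visible in the batch string):

for tok_id in [151644, 8948, 198, 151645]:
    print(tok_id, repr(tokenizer.decode([tok_id])))
# 151644 '<|im_start|>'
# 8948   'system'
# 198    '\n'
# 151645 '<|im_end|>'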
# debug generated_ids:
[tensor([104198, 48, 16948, 3837, 102661, 99718, 102014, 104491], device='cuda:0')]

# debug response:
['我是Qwen,阿里云推出的一种']
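The decoded answer ("I am Qwen, a kind of ... launched by Alibaba Cloud") stops mid-sentence because the max_new_tokens=16 budget is very small. A sketch of a less constrained call; the sampling values here are illustrative, not the model card's official recommendation:

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,   # enough room for a complete answer
    do_sample=True,       # sample instead of greedy decoding
    temperature=0.7,
    top_p=0.8,
)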