Python inference with the transformers library
Code
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. load model
model_path = "/ssd3/models/Qwen2.5-0.5B-Instruct/"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map='cuda',
    torch_dtype=torch.float16,
)

# 2. init tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Need to set the padding token to the eos token for generation
tokenizer.pad_token = tokenizer.eos_token

prompts = [
    "你是谁",  # "Who are you?"
]

for prompt in prompts:
    messages = [
        {"role": "user", "content": prompt},
    ]
    batch = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # 3. tokenize
    model_inputs = tokenizer([batch], return_tensors="pt").to('cuda')
    # model_inputs = tokenizer([prompt], padding=True, truncation=True, return_tensors="pt").to('cuda')
    # 4. infer
    generated_ids = model.generate(**model_inputs, max_new_tokens=16)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    # 5. detokenize
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    print(response)
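The commented-out tokenizer call in step 3 hints at batching several prompts in one forward pass. A minimal sketch of that variant, assuming left padding (decoder-only models append new tokens at the right edge, so prompts must be padded on the left; the pad token was already set to eos above):

tokenizer.padding_side = 'left'  # pad prompts on the left for decoder-only generation
batch_texts = [
    tokenizer.apply_chat_template([{"role": "user", "content": p}],
                                  tokenize=False, add_generation_prompt=True)
    for p in ["你是谁", "你好"]  # "Who are you?", "Hello"
]
model_inputs = tokenizer(batch_texts, padding=True, return_tensors="pt").to('cuda')
generated_ids = model.generate(**model_inputs, max_new_tokens=64)
# with left padding, every row's prompt ends at the same index,
# so slicing off len(input_ids) leaves only the newly generated tokens
generated_ids = [out[len(inp):] for inp, out in zip(model_inputs.input_ids, generated_ids)]
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))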
Debug info
# debug model:
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbedding()
  )
  (lm_head): Linear(in_features=896, out_features=151936, bias=False)
)
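As a sanity check, the shapes in this dump add up to the "0.5B" in the model name. A quick sketch of the arithmetic; the one outside assumption is that lm_head shares its weight with embed_tokens (weight tying, which the Qwen2.5-0.5B config uses), so it is not counted twice:

embed = 151936 * 896                                            # embed_tokens
attn = (896 * 896 + 896) + 2 * (896 * 128 + 128) + 896 * 896    # q (bias) + k,v (bias) + o
mlp = 3 * (896 * 4864)                                          # gate_proj + up_proj + down_proj
norms = 2 * 896                                                 # the two RMSNorms per layer
per_layer = attn + mlp + norms
total = embed + 24 * per_layer + 896                            # plus the final norm
print(f"{total:,}")                                             # 494,032,768 ≈ 0.5B
# cross-check against the loaded model (shared weights are counted once):
print(sum(p.numel() for p in model.parameters()))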
# debug batch:
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
你是谁<|im_end|>
<|im_start|>assistant

# debug model_inputs:
{'input_ids': tensor([[151644, 8948, 198, 2610, 525, 1207, 16948, 11, 3465, 553,
                       54364, 14817, 13, 1446, 525, 264, 10950, 17847, 13, 151645,
                       198, 151644, 872, 198, 105043, 100165, 151645, 198, 151644,
                       77091, 198]], device='cuda:0'),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
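To connect these ids back to the template text above, a few of them can be decoded individually; this is an optional check using the tokenizer already loaded (151644 and 151645 are the <|im_start|> and <|im_end|> markers visible in the batch string):

for tok_id in [151644, 8948, 198, 151645]:
    print(tok_id, repr(tokenizer.decode([tok_id])))
# 151644 '<|im_start|>'
# 8948   'system'
# 198    '\n'
# 151645 '<|im_end|>'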
# debug generated_ids:
[tensor([104198, 48, 16948, 3837, 102661, 99718, 102014, 104491], device='cuda:0')]

# debug response:
['我是Qwen,阿里云推出的一种']
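The decoded answer ("I am Qwen, a kind of ... launched by Alibaba Cloud") stops mid-sentence because the max_new_tokens=16 budget is very small. A sketch of a less constrained call; the sampling values here are illustrative, not the model card's official recommendation:

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,   # enough room for a complete answer
    do_sample=True,       # sample instead of greedy decoding
    temperature=0.7,
    top_p=0.8,
)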