AWQ quantization of Qwen3 with AutoAWQ
AWQ quantization reduced accuracy by about 6 points, while per-request inference latency dropped from 0.447 s to 0.40 s.
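For reference, here is a minimal sketch of how the per-request latency above could be measured against the vLLM OpenAI-compatible endpoint started by the serving script below; the host, port, model name, prompt, and request count are illustrative assumptions, not part of the original setup.

import time
import requests

# Assumed endpoint and payload matching the serving script below; adjust as needed.
URL = "http://127.0.0.1:9005/v1/chat/completions"
payload = {
    "model": "qwen3_4b",
    "messages": [{"role": "user", "content": "Tell me who you are."}],
    "max_tokens": 128,
}

latencies = []
for _ in range(20):
    t0 = time.perf_counter()
    resp = requests.post(URL, json=payload, timeout=60)
    resp.raise_for_status()
    latencies.append(time.perf_counter() - t0)

print(f"mean latency over {len(latencies)} requests: {sum(latencies) / len(latencies):.3f}s")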
In the LLaMA-Factory environment, install:
pip install autoawq
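A quick import check (a sketch; it only confirms the package and its main entry point are importable in the current environment):

# Raises ImportError if the autoawq install failed.
from awq import AutoAWQForCausalLM
print("autoawq OK")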
Quantization code:
def qu_awq():
    import json

    from awq import AutoAWQForCausalLM
    from transformers import AutoTokenizer

    model_path = "model_path"      # path to the original (fp16/bf16) model
    quant_path = "awq_model_path"  # where the quantized model will be saved
    calib_data = "_quantize.json"  # calibration data file

    quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

    # Load model and tokenizer (device_map/safetensors belong to the model load,
    # not the tokenizer)
    model = AutoAWQForCausalLM.from_pretrained(model_path, device_map="auto", safetensors=True)
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    # Expected calibration sample format (chat messages rendered via the chat template):
    # msg = [
    #     {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    #     {"role": "user", "content": "Tell me who you are."},
    #     {"role": "assistant", "content": "I am a large language model named Qwen..."},
    # ]

    # !!!!!!!!! Customize the code here for calib_data processing !!!!!!!!!
    # Variant A: JSONL file where each line holds {"messages": [...]}.
    def data_gen():
        data = []
        with open(calib_data, "r", encoding="utf-8") as file:
            for line in file:
                msg = json.loads(line)["messages"]
                text = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=False)
                data.append(text.strip())
        return data

    # Variant B (used below): a JSON list of objects with a "text" field.
    # For Variant A data, use json_data = data_gen() instead.
    with open(calib_data, "r", encoding="utf-8") as f:
        json_data = json.load(f)
    json_data = [each["text"] for each in json_data]
    # !!!!!!!!! Customize the code here for calib_data processing !!!!!!!!!

    # Quantize
    model.quantize(
        tokenizer,
        quant_config=quant_config,
        calib_data=json_data,
        n_parallel_calib_samples=1,
        max_calib_samples=256,
        max_calib_seq_len=1024,
    )

    # Save quantized model
    model.save_quantized(quant_path)
    tokenizer.save_pretrained(quant_path)
    print(f'Model is quantized and saved at "{quant_path}"')

qu_awq()
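Once saved, the checkpoint can be smoke-tested directly with AutoAWQ before standing up a server. A minimal sketch, assuming a CUDA device and reusing the quant_path placeholder above; the prompt is illustrative:

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

quant_path = "awq_model_path"

# Load the quantized weights; fuse_layers speeds up inference on supported models.
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Tell me who you are."}],
    tokenize=False,
    add_generation_prompt=True,
)
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
output = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))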
Inference (vLLM serving script):
#!/bin/bash
# XFORMERS is ~10 ms faster per request than FLASH_ATTN
#export VLLM_ATTENTION_BACKEND=XFORMERS  # use on older machines
export VLLM_ATTENTION_BACKEND=FLASH_ATTN
source /opt/conda/etc/profile.d/conda.sh
conda activate /opt/conda/envs/vllm085
Model_path="/llm/models/general_knowledge_agent_router/general_knowledge_agent_202250820_v21_01_awq5"
#Model_path="/llm/models/Qwen3-4B-Instruct-2507"

CUDA_VISIBLE_DEVICES=0 nohup python -m vllm.entrypoints.openai.api_server \
    --model ${Model_path} \
    --served-model-name 'qwen3_4b' \
    --host 0.0.0.0 \
    --port 9005 \
    --max-model-len 9000 \
    --trust-remote-code \
    --device cuda \
    --tensor-parallel-size 1 \
    --swap-space 0 \
    --quantization awq \
    --dtype float16 \
    --gpu-memory-utilization 0.7 \
    --max-num-seqs 1 > eval_qwen3_quant.log 2>&1 &
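Once the server is up (check eval_qwen3_quant.log), it can be queried through any OpenAI-compatible client. A sketch using the openai Python package; the base_url and api_key values are assumptions matching the flags above (vLLM ignores the key unless one is configured):

from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:9005/v1", api_key="EMPTY")
resp = client.chat.completions.create(
    model="qwen3_4b",
    messages=[{"role": "user", "content": "Tell me who you are."}],
    max_tokens=128,
    temperature=0.0,
)
print(resp.choices[0].message.content)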