import os
import gc
import torch
import glob
import logging

from datasets import load_dataset
from llmcompressor import oneshot
from llmcompressor.modifiers.awq import AWQModifier
from transformers import AutoModelForCausalLM, AutoTokenizer
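
# Free leftover Python objects and cached CUDA allocations up front;
# a 30B-parameter model is roughly 60GB in fp16, so every GB helps.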
gc.collect()
torch.cuda.empty_cache()
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

model_path = "./models/Qwen3-Coder-30B-A3B-Instruct"
quant_path = "./Qwen3-Coder-30B-A3B-Instruct-AWQ"
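
# Cap per-device memory so accelerate's "auto" device_map leaves headroom;
# the values below assume two 24GB GPUs plus CPU offload. Adjust to your hardware.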
max_memory = {0: "23GB", 1: "23GB", "cpu": "100GB"}
logger.info(f"设定最大内存分配: {max_memory}")logger.info("使用 accelerate 的 'auto' device_map 并结合 max_memory 进行加载...")
torch_dtype = torch.float16
logger.info(f"设定模型加载精度 (torch_dtype): {torch_dtype}")
logger.info("正在加载模型和tokenizer...")
try:
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch_dtype,
        trust_remote_code=True,
        device_map="auto",
        max_memory=max_memory,
        attn_implementation="eager",
        low_cpu_mem_usage=True,
    )
    logger.info(f"Model loaded; device map (model.hf_device_map): {model.hf_device_map}")
except Exception as e:
    logger.error(f"Failed to load model: {e}")
    raise

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True,
)
logger.info("正在加载校准数据集...")
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 512
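# 256 samples at 512 tokens is a common AWQ calibration budget; more samples or
# longer sequences can improve the scale estimates at the cost of calibration time.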
logger.info(f"使用前 {NUM_CALIBRATION_SAMPLES} 个样本进行校准...")
ds = load_dataset(
    "parquet",
    data_files="./train-00000-of-00206.parquet",
    split=f"train[:{NUM_CALIBRATION_SAMPLES}]",
)
ds = ds.shuffle(seed=42)
logger.info(f"数据集列名: {ds.column_names}")
logger.info(f"样本数量: {len(ds)}")
def preprocess(examples):
    conversations = [
        [{"role": "user", "content": text}]
        for text in examples["content"]
    ]
    texts = tokenizer.apply_chat_template(
        conversations,
        tokenize=False,
    )
    return {"text": texts}

logger.info("Preprocessing calibration dataset...")
ds = ds.map(preprocess, batched=True, batch_size=128)
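# Note: the dataset is passed to oneshot with an untokenized "text" column;
# llm-compressor normally tokenizes it internally using a processor inferred
# from the model path. If your version cannot infer one, pass it explicitly
# (e.g. processor=tokenizer) or pre-tokenize with an extra ds.map() step.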
logger.info("配置 AWQ 量化参数...")
recipe = [
    AWQModifier(
        ignore=[
            "lm_head",
            "re:.*mlp.gate$",
            "re:.*mlp.shared_experts.gate$",
        ],
        scheme="W4A16",
        targets=["Linear"],
    ),
]
logger.info("开始 AWQ 量化过程...")
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
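
# Quick smoke test: generate from the quantized model to confirm the forward
# pass still produces coherent output before saving.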
logger.info("\n\n")
logger.info("========== 样本生成测试 ==============")
input_text = "写一个Python函数,实现快速排序"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=100)
logger.info(tokenizer.decode(output[0], skip_special_tokens=True))
logger.info("==========================================\n\n")
logger.info("正在保存量化模型(4GB分片)...")
model.save_pretrained(
    quant_path,
    save_compressed=True,
    max_shard_size="4GB",
    safe_serialization=True,
)
tokenizer.save_pretrained(quant_path)
logger.info(f"AWQ 量化完成!模型已保存至 {quant_path}")
gc.collect()
torch.cuda.empty_cache()
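
# Sanity check: list the safetensors shards that were actually written.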
shards = glob.glob(os.path.join(quant_path, "model-*.safetensors"))
logger.info(f"\n模型已成功分片保存,共 {len(shards)} 个分片:")
for shard in shards:
    size_gb = os.path.getsize(shard) / (1024**3)
    logger.info(f"- {os.path.basename(shard)}: {size_gb:.2f} GB")

logger.info(f"\nQuantization complete! Model saved to {quant_path}")
logger.info("注意:模型现在使用标准命名格式 model-00001-of-0000X.safetensors")