Fine-tuning CodeGeeX
A project of mine required fine-tuning this model. My environment: four V100s with 16GB each, so the model does not fit on a single card and multi-GPU fine-tuning is a must. A quick search showed there is no official fine-tuning demo: https://github.com/THUDM/CodeGeeX/issues/15
Since CodeGeeX is fine-tuned from ChatGLM, I assumed ChatGLM's fine-tuning code would work directly. I tried https://github.com/THUDM/GLM-4/tree/main/finetune_demo and https://github.com/THUDM/ChatGLM-6B/tree/main/ptuning, but both threw errors, so I had to look elsewhere.
Out of laziness I first looked for something ready-made and found the MFTCoder framework, which claims to support CodeGeeX and even has a WeChat official-account article about it. I created the environment and ran it following their tutorial, but it kept failing: installing the various packages produced one conflict after another, and then:
CUDA SETUP: CUDA detection failed! Possible reasons:
1. You need to manually override the PyTorch CUDA version. Please see: "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md
2. CUDA driver not installed
3. CUDA not installed
4. You have multiple conflicting CUDA libraries
5. Required library not pre-compiled for this bitsandbytes release!
CUDA SETUP: If you compiled from source, try again with make CUDA_VERSION=DETECTED_CUDA_VERSION for example, make CUDA_VERSION=118.
CUDA SETUP: The CUDA version for the compile might depend on your conda install. Inspect CUDA version via conda list | grep cuda.
It took a lot of digging to find the cause: the system's CUDA version did not match the one my PyTorch build was compiled against. Reinstalling a matching PyTorch build fixed it. A quick way to confirm this kind of mismatch:
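A minimal sketch, assuming only that torch imports; compare the reported version against what nvidia-smi / nvcc report for the system:

import torch

# CUDA version this PyTorch build was compiled against
print(torch.version.cuda)
# False here usually hints at a driver/toolkit mismatch rather than missing GPUs
print(torch.cuda.is_available())

With the versions aligned, that error went away, and the next one appeared immediately: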
[rank5]: torch._C._cuda_setDevice(device)
[rank5]: RuntimeError: CUDA error: invalid device ordinal
[rank5]: CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
[rank5]: For debugging consider passing CUDA_LAUNCH_BLOCKING=1
[rank5]: Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
I never managed to fix this one, so I gave up on that route... (In hindsight, [rank5] on a 4-GPU machine is itself the clue: "invalid device ordinal" means a process asked for a GPU index that does not exist, so the launch config was presumably set up for more GPUs than the box actually has.)
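The sanity check I would run today (a hypothetical sketch, not something I tried at the time):

import torch

# Prints 4 on this machine; any distributed rank calling
# torch.cuda.set_device(i) with i >= 4 fails with "invalid device ordinal"
print(torch.cuda.device_count())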
Then I found another one: https://github.com/hiyouga/ChatGLM-Efficient-Tuning. Its homepage says it is no longer maintained, but I wanted to try it anyway. Again, errors everywhere.
First:
ImportError: cannot import name 'PPODecorators' from 'trl.trainer.ppo_trainer'
The requirements file asks for trl>=0.4.7, but the version actually installed was far newer, so I pinned trl back down to 0.4.7.
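(Checking the installed version is one line; the ImportError suggests newer trl releases moved or removed PPODecorators, though I did not chase down exactly when:)

import trl
print(trl.__version__)  # requirements ask for >= 0.4.7; far newer versions break the import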
That fixed the import, but more errors followed, again version mismatches. After resolving a few I lost patience. Unmaintained repos really are best left alone...
I finally resolved to write it myself, and it turned out not to be hard: process the data + load the model + set the parameters + train. The code:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
import torch
from datetime import datetime
from accelerate import dispatch_model
from accelerate import infer_auto_device_map
def process_func(example):
    MAX_LENGTH = 450  # Overly long inputs increase compute and GPU/CPU usage, hurting training and inference efficiency. A sensible MAX_LENGTH keeps batching efficient and avoids wasted padding compute.
    '''
    Every model ships its own tokenizer, and most provide apply_chat_template.
    If one is not provided, build the string yourself by concatenation, e.g.
    the ChatGLM family uses <|system|>\nxxxxx<|user|>\nxxxx<|assistant|>\nxxxx
    and the Qwen family uses <|im_start|>system\nxxxx<|im_end|>\n<|im_start|>user\nxxxx<|im_end|>\n<|im_start|>assistant\nxxxx
    Using the tokenizer's built-in apply_chat_template avoids getting this wrong.
    '''
    instruction_str = tokenizer.apply_chat_template(
        [{"role": "system", "content": example['instruction']},
         {"role": "user", "content": example['input']}],
        tokenize=False,  # skip tokenization here so the tokenizer is only called once below
        add_generation_prompt=True,
        add_special_tokens=False
    )
    instruction = tokenizer(instruction_str, add_special_tokens=False)
    response = tokenizer(example['output'], add_special_tokens=False)
    # Concatenate prompt and response, with a trailing pad token as the end marker
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    # Mask the prompt tokens with -100 so only the response contributes to the loss
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate overly long samples
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }
t1 = datetime.now()
df = pd.read_json('/home/xxx/data/train_data_codegeex.json', encoding='utf-8')  # dataset path; the content format is [{"instruction":"","input":"","output":""}, ...], easy to infer from the code anyway
ds = Dataset.from_pandas(df)
MODEL_PATH = r"/home/xxx/codegeex4-all-9b"
LORA_PATH = r"/home/xxx/20250216/CodeGeex4_lora"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False, trust_remote_code=True)
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
print('###', tokenizer.decode(tokenized_id[0]['input_ids']))  # check whether MAX_LENGTH is enough and nothing important gets truncated
print('###', tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"]))))
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)  # a 9B bf16 model does not fit on one 16GB card; device_map="auto" shards it across the GPUs and is what produces hf_device_map below
model.enable_input_require_grads()
from peft import LoraConfig, TaskType, get_peft_model
config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],  # which layers are best to adapt is something you have to experiment with
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.05
)
model = get_peft_model(model, config)
model.print_trainable_parameters()
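# print_trainable_parameters() reports the LoRA footprint in the form
# "trainable params: ... || all params: ... || trainable%: ...";
# with r=8 it should be a tiny fraction of the 9B total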
args = TrainingArguments(
output_dir=LORA_PATH,
    per_device_train_batch_size=1,  # per_device_train_batch_size * gradient_accumulation_steps = effective batch size; if memory is tight, shrink the former and grow the latter, trading time for space
gradient_accumulation_steps=4,
logging_steps=5,
num_train_epochs=20,
    save_steps=10,  # my dataset is quite small, so this is set low; adjust to your situation
learning_rate=1e-5,
save_on_each_node=True,
# gradient_checkpointing=True
)
trainer = Trainer(
model=model,
args=args,
train_dataset=tokenized_id,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
trainer.train()
print('Training time:\n', datetime.now() - t1)
I thought that was it, but running it produced:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0!
The base model had been sharded across the GPUs by device_map="auto", but the freshly created LoRA modules presumably did not follow that mapping, so tensors on different devices met in the forward pass. I therefore added the following code:
from accelerate import dispatch_model
# Grab the base model's device_map (the placement transformers generated automatically)
device_map = model.hf_device_map
# Apply LoRA, then re-dispatch so the adapted model follows the original device_map
model = get_peft_model(model, config)
model = dispatch_model(model, device_map=device_map)
That failed with:
ValueError: The device_map provided does not give any device for the following parameters: base_model.model.transformer.embedding.word_embeddings.weight, base_model.model.transformer.rotary_pos_emb.inv_freq, ……
The cause: get_peft_model(model, config) changes the model's parameter structure, but device_map was generated before the LoRA adaptation, so dispatch_model cannot place the LoRA parameters. The correct order is: apply LoRA first, let accelerate generate a fresh device_map, then redistribute the devices with dispatch_model.
from accelerate import infer_auto_device_map, dispatch_model
from transformers import AutoModelForCausalLM

# Load the original model first (without device_map="auto")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
# Apply the LoRA adaptation
model = get_peft_model(model, config)
# Auto-generate a device_map (let accelerate compute a sensible placement)
device_map = infer_auto_device_map(model, max_memory={0: "16GiB", 1: "16GiB", 2: "16GiB", 3: "16GiB"})
# Redistribute the model across the GPUs
model = dispatch_model(model, device_map=device_map)
That version no longer produced the earlier error, but train() failed with CUDA out of memory! I printed each parameter's device and found almost everything sat on cuda:0, a little on cuda:1, and cuda:2 and cuda:3 were never used at all: enough memory to load the model, but not to train it.

So I tried a small trick: set max_memory below the cards' real capacity so the parameters spread across more of them, leaving headroom for training. With {0: "6GiB", 1: "6GiB", 2: "6GiB", 3: "6GiB"} the split was more even, though still mostly the first two cards, with only the last layer landing on cuda:2. Running again brought the original cross-device error back, which meant I had to set parts of the device_map myself after all. Writing the whole map by hand is far too tedious, so I started from infer_auto_device_map's output and patched it, keeping the parameters of a given layer on the same card, then ran it; whenever it crashed, the devices named in the error pointed at the data on the corresponding cards, and I adjusted again. It finally ran. Not easy at all...
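The per-parameter device check mentioned above is just this (a minimal sketch):

for name, param in model.named_parameters():
    print(name, param.device)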
The final version of the code:
from datasets import Dataset
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer, GenerationConfig
import torch
from datetime import datetime
from accelerate import dispatch_model
from accelerate import infer_auto_device_map
def process_func(example):
    MAX_LENGTH = 450
    instruction_str = tokenizer.apply_chat_template(
        [{"role": "system", "content": example['instruction']},
         {"role": "user", "content": example['input']}],
        tokenize=False,
        add_generation_prompt=True,
        add_special_tokens=False
    )
    instruction = tokenizer(instruction_str, add_special_tokens=False)
    response = tokenizer(example['output'], add_special_tokens=False)
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"] + [tokenizer.pad_token_id]
    if len(input_ids) > MAX_LENGTH:  # truncate overly long samples
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    assert isinstance(input_ids, list), f"input_ids is not a list: {type(input_ids)}"
    assert isinstance(attention_mask, list), f"attention_mask is not a list: {type(attention_mask)}"
    assert isinstance(labels, list), f"labels is not a list: {type(labels)}"
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }
t1 = datetime.now()
df = pd.read_json('/home/xxx/data/train_data_codegeex.json', encoding='utf-8')
ds = Dataset.from_pandas(df)
MODEL_PATH = r"/home/xxx/codegeex4-all-9b"
LORA_PATH = r"/home/xxx/20250216/CodeGeex4_lora"
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False, trust_remote_code=True)
tokenized_id = ds.map(process_func, remove_columns=ds.column_names)
print('###',tokenizer.decode(tokenized_id[0]['input_ids']))
print('###',tokenizer.decode(list(filter(lambda x: x != -100, tokenized_id[1]["labels"]))))
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.bfloat16,trust_remote_code=True)
model.enable_input_require_grads()
from peft import LoraConfig, TaskType, get_peft_model
config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
    target_modules=["query_key_value", "dense", "dense_h_to_4h", "dense_4h_to_h"],  # which layers are best to adapt is something you have to experiment with
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.05
)
model = get_peft_model(model, config)
device_map = infer_auto_device_map(model, max_memory={0: "6GiB", 1: "6GiB", 2: "6GiB", 3: "6GiB"})
# Manual patches on top of the inferred map: keep modules that interact on the
# same card. These exact entries came out of error-driven trial and error and
# will differ on other models and machines.
device_map["base_model.model.transformer.encoder.layers.9.mlp"] = 0
device_map["base_model.model.transformer.encoder.layers.24.mlp.dense_4h_to_h"] = 1
device_map["base_model.model.transformer.output_layer"] = 0
for name, device in device_map.items():
print(f"{name}: {device}")
model = dispatch_model(model, device_map=device_map)
model.print_trainable_parameters()
args = TrainingArguments(
output_dir=LORA_PATH,
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
logging_steps=5,
num_train_epochs=20,
save_steps=10,
learning_rate=1e-5,
save_on_each_node=True,
# gradient_checkpointing=True
)
trainer = Trainer(
model=model,
args=args,
train_dataset=tokenized_id,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
)
trainer.train()
print('Training time:\n', datetime.now() - t1)
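A couple of practical notes. dispatch_model gives single-process model parallelism, so this script is launched with plain python rather than torchrun. And the adapter checkpoints that Trainer writes under LORA_PATH can later be reattached to the base model for inference, along these lines (a sketch; the checkpoint folder name is hypothetical):

from peft import PeftModel

# Reload the sharded base model, then attach the trained LoRA adapter
base = AutoModelForCausalLM.from_pretrained(MODEL_PATH, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
model = PeftModel.from_pretrained(base, LORA_PATH + "/checkpoint-xxx")  # point at a real checkpoint dir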
From this whole exercise I took away a few small fine-tuning lessons:
Prefer writing it yourself; don't be lazy. Other people's code won't necessarily fit your machine and environment, and all your time goes into debugging without learning anything.
Experiment with the device_map. If you don't know which tensors need to share a card, give one card only a single layer's data: a cross-device error reports two cuda indices, from which you can deduce which layer the data on the single-layer card belongs to (see the sketch after this list).
Don't max out max_memory; tune it to your needs.
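A minimal sketch of that isolation trick, reusing the names from the script above (the exact layer index is hypothetical):

# Start from the inferred map, then park one suspect layer alone on cuda:3.
# If an "Expected all tensors to be on the same device ... cuda:3" error
# follows, the offending tensor belongs to that layer.
device_map = infer_auto_device_map(model, max_memory={0: "6GiB", 1: "6GiB", 2: "6GiB", 3: "6GiB"})
device_map["base_model.model.transformer.encoder.layers.30"] = 3
model = dispatch_model(model, device_map=device_map)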