Accelerate, Trainer, Lightning, or PyTorch?
In 2025, should you use Ray, Accelerate, Trainer, Lightning, or PyTorch? - Answer by CodeCrafter
Below are minimal training templates for four different approaches, using a simple text classification task (e.g. IMDB sentiment analysis) as the running example:
1. Plain PyTorch (native PyTorch)
This is the most basic approach: you manually manage data loading, the model, the optimizer, the loss function, and the training loop.
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Assume a simple text classification task, e.g. binary classification.
# For simplicity we use fake data here; in practice use a real dataset such as IMDB.
class FakeTextDataset(Dataset):
    def __init__(self, num_samples=1000, max_length=128):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.texts = ["This is a sample sentence." for _ in range(num_samples)]
        self.labels = [0 if i % 2 == 0 else 1 for i in range(num_samples)]  # binary labels
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Define a simple model (BERT is used directly as an example; it could be much simpler)
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Data
dataset = FakeTextDataset()
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = nn.CrossEntropyLoss()  # unused below: the HF model computes its own loss when labels are passed

# Training loop
num_epochs = 3
model.train()
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} completed")
print("Training finished.")
```
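The template above only trains. If you also want to measure accuracy, a minimal evaluation pass can reuse the same pieces; the sketch below simply builds a second `FakeTextDataset` as a stand-in validation set (in a real project you would use a held-out split):

```python
# Minimal evaluation sketch; FakeTextDataset is reused here only as a stand-in validation set
eval_dataloader = DataLoader(FakeTextDataset(num_samples=200), batch_size=8)

model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        preds = outputs.logits.argmax(dim=-1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)
print(f"Eval accuracy: {correct / total:.4f}")
```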
2. Using PyTorch Lightning
PyTorch Lightning is a wrapper around native PyTorch that provides a more concise training loop, built-in logging, and other conveniences.
```python
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pytorch_lightning as pl


class FakeTextDataset(Dataset):
    def __init__(self, num_samples=1000, max_length=128):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.texts = ["This is a sample sentence." for _ in range(num_samples)]
        self.labels = [0 if i % 2 == 0 else 1 for i in range(num_samples)]
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


class TextClassificationModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
        self.loss_fn = torch.nn.CrossEntropyLoss()  # unused: the HF model returns its own loss

    def forward(self, input_ids, attention_mask, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=5e-5)

    def train_dataloader(self):
        dataset = FakeTextDataset()
        return DataLoader(dataset, batch_size=8, shuffle=True)


# Training
trainer = pl.Trainer(max_epochs=3, accelerator="auto", devices="auto")
model = TextClassificationModel()
trainer.fit(model)
```
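Validation works the same way: add a `validation_step` and a `val_dataloader` to the LightningModule and Lightning runs them automatically at the end of each epoch. A minimal sketch, subclassing the model above and reusing `FakeTextDataset` as a stand-in validation set:

```python
class TextClassificationModelWithVal(TextClassificationModel):
    """Same model as above, plus a validation loop."""

    def validation_step(self, batch, batch_idx):
        outputs = self(**batch)
        preds = outputs.logits.argmax(dim=-1)
        acc = (preds == batch["labels"]).float().mean()
        self.log("val_loss", outputs.loss, prog_bar=True)
        self.log("val_acc", acc, prog_bar=True)

    def val_dataloader(self):
        # Stand-in validation set; replace with a real held-out split
        return DataLoader(FakeTextDataset(num_samples=200), batch_size=8)


trainer = pl.Trainer(max_epochs=3, accelerator="auto", devices="auto")
trainer.fit(TextClassificationModelWithVal())
```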
3. Using HuggingFace Accelerate
HuggingFace Accelerate is a tool for distributed / multi-GPU / TPU training. It works alongside native PyTorch and simplifies device management; after a one-time `accelerate config`, scripts are typically started with `accelerate launch`.
```python
# Before running, make sure you have configured Accelerate once with: accelerate config
from accelerate import Accelerator
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm


# Dataset
class FakeTextDataset(Dataset):
    def __init__(self, num_samples=1000, max_length=128):
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.texts = ["This is a sample sentence." for _ in range(num_samples)]
        self.labels = [0 if i % 2 == 0 else 1 for i in range(num_samples)]
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# Model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()  # unused: the HF model computes its own loss

# Accelerate initialization: prepare() handles device placement for model, optimizer and data
accelerator = Accelerator()
model, optimizer, dataloader = accelerator.prepare(
    model, optimizer, DataLoader(FakeTextDataset(), batch_size=8, shuffle=True)
)

# Training loop
num_epochs = 3
model.train()
for epoch in range(num_epochs):
    progress_bar = tqdm(dataloader, disable=not accelerator.is_local_main_process)
    for batch in progress_bar:
        optimizer.zero_grad()
        # Batches from the prepared DataLoader are already on the right device
        outputs = model(input_ids=batch['input_ids'],
                        attention_mask=batch['attention_mask'],
                        labels=batch['labels'])
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        progress_bar.set_description(f"Epoch {epoch}, Loss: {loss.item():.4f}")
print("Training finished with Accelerate.")
```
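Accelerate also covers mixed precision and gradient accumulation with only small changes to this loop. A minimal sketch of the changed part; `mixed_precision="fp16"` and `gradient_accumulation_steps=4` are illustrative choices, not requirements:

```python
# Sketch: mixed precision + gradient accumulation (fp16 and 4 steps are illustrative choices)
accelerator = Accelerator(mixed_precision="fp16", gradient_accumulation_steps=4)
model, optimizer, dataloader = accelerator.prepare(
    model, optimizer, DataLoader(FakeTextDataset(), batch_size=8, shuffle=True)
)

model.train()
for batch in dataloader:
    # accumulate() defers the optimizer step until 4 micro-batches have been processed
    with accelerator.accumulate(model):
        outputs = model(**batch)
        accelerator.backward(outputs.loss)
        optimizer.step()
        optimizer.zero_grad()
```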
4. Using HuggingFace Trainer
HuggingFace Trainer is the official high-level training API: you barely write any training loop at all, which makes it well suited to quick experiments.
```python
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)
from datasets import Dataset


# Create a fake dataset (in practice you can use load_dataset("imdb") etc.)
def create_fake_dataset(num_samples=1000):
    texts = ["This is a sample sentence." for _ in range(num_samples)]
    labels = [0 if i % 2 == 0 else 1 for i in range(num_samples)]
    return {"text": texts, "label": labels}

raw_data = create_fake_dataset()
dataset = Dataset.from_dict(raw_data)

# Train/validation split
dataset = dataset.train_test_split(test_size=0.1)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_eval = eval_dataset.map(tokenize_function, batched=True)

# Model
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
)

# Start training
trainer.train()
```
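If you also want accuracy reported at each evaluation, Trainer accepts a `compute_metrics` callback. A minimal sketch using the separate `evaluate` library (assuming it is installed, e.g. `pip install evaluate`); it would be passed as an extra argument when constructing the Trainer above:

```python
# Sketch: an accuracy metric for the Trainer above, via the `evaluate` library
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred           # EvalPrediction unpacks to (predictions, label_ids)
    preds = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

# then: Trainer(..., compute_metrics=compute_metrics)
```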
Note: if you already have a HuggingFace dataset (e.g. `datasets.load_dataset("imdb")`), you can drop it straight into the data section above, which is even more convenient.
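For example, the data part could be swapped roughly like this (a sketch; the split and column names follow the standard `datasets` version of IMDB):

```python
# Sketch: swapping in the real IMDB dataset from the Hugging Face Hub
from datasets import load_dataset

imdb = load_dataset("imdb")   # splits: "train"/"test", columns: "text", "label"
tokenized_train = imdb["train"].map(tokenize_function, batched=True)
tokenized_eval = imdb["test"].map(tokenize_function, batched=True)
```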
Summary comparison

| Method | Characteristics | Best suited for | Code complexity |
|---|---|---|---|
| Native PyTorch | Most flexible, lowest level, you manage everything manually | Full control over the training process | ⭐⭐⭐⭐⭐ |
| PyTorch Lightning | Structured and modular, automatic training loop, built-in logging / multi-GPU support | Cleaner than native PyTorch without depending on the HF ecosystem | ⭐⭐⭐ |
| HuggingFace Accelerate | Focused on distributed training, compatible with native PyTorch, handles devices / multi-GPU automatically | Large-scale / multi-node multi-GPU / TPU training while keeping fine-grained control | ⭐⭐⭐⭐ |
| HuggingFace Trainer | Minimal code, almost no training logic to write, best HF ecosystem integration | Quick experiments with HF models and datasets | ⭐⭐ |
🔧 Recommended choice:
• If you are just starting out / want full control → native PyTorch
• If you want a clear, extensible structure with logging support → PyTorch Lightning
• If you are training large models / distributed / multi-GPU → Accelerate
• If you use HuggingFace models and want to train quickly → Trainer
If you have a specific task in mind (e.g. text generation, multimodal), I can also provide a template for that task.