1. Embedding and Handling Variable-Length Input
One-hot encoding: the model has no way of measuring the distance between words, for example between "man" and "gentleman". To give the model a notion of distance, we use an Embedding: each word is turned into a vector, which lets us tell whether two words are near-synonyms.
A bit of history:
Word2vec includes two training schemes: Skip-gram (one word predicts its surrounding words) and CBOW (the surrounding words predict one word).
Example: take "我 知道 彭于晏 很帅" and "我 知道 吴彦祖 很帅" ("I know 彭于晏/吴彦祖 is very handsome"). In the first sentence the context words pair up with 彭于晏, in the second with 吴彦祖.
Because the contexts are almost identical, 彭于晏 and 吴彦祖 end up with similar vectors and can be treated as near-synonyms.
This is how words get turned into dense vectors (see the sketch below).
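To make the idea concrete, here is a minimal sketch (not from the original notes; the window size of 2 is an assumption) that generates (center word, context word) training pairs the way skip-gram does:

```python
# minimal sketch: build (center word, context word) pairs for skip-gram training
def skipgram_pairs(tokens, window=2):
    pairs = []
    for i, center in enumerate(tokens):
        # every word within `window` positions of the center word becomes a context word
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if j != i:
                pairs.append((center, tokens[j]))
    return pairs


print(skipgram_pairs("我 知道 彭于晏 很帅".split()))
print(skipgram_pairs("我 知道 吴彦祖 很帅".split()))
# 彭于晏 and 吴彦祖 occur with almost identical contexts, so their vectors end up close together
```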
The Embedding technique: suppose we have a sample whose word ids are [3, 2, 5, 9, 1].
If the vocabulary size is 10,000, the embedding matrix has 10,000 rows and 16 columns; id 3 selects row 3 of that matrix as its dense vector, and so on, so the 1-D id list becomes a 2-D matrix. Once every sample has been turned into such a matrix of dense vectors, it plugs directly into the downstream model and everything is trained end to end. The original Word2vec, by contrast, is a standalone model: the dense vectors must be computed first, and only then can the downstream model be trained. A lookup sketch follows below.
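A minimal sketch of that lookup with PyTorch's nn.Embedding; the 10,000 × 16 shape matches the numbers above, and the weights are randomly initialized, so this only demonstrates the mechanics, not trained vectors:

```python
import torch
import torch.nn as nn

embedding = nn.Embedding(num_embeddings=10000, embedding_dim=16)  # 10,000 rows, 16 columns
sample = torch.tensor([3, 2, 5, 9, 1])  # the word ids of one sample
dense = embedding(sample)               # each id picks out one row of the matrix
print(dense.shape)  # torch.Size([5, 16]): the 1-D id list has become a 2-D matrix
```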
2. Code
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F

print(sys.version_info)
for module in mpl, np, pd, sklearn, torch:
    print(module.__name__, module.__version__)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)
seed = 42
#%% md
# Prepare the Data
#%%
from tensorflow import keras  # ignore the IDE squiggle here; we only use keras for its built-in IMDB dataset (classify movie reviews as positive or negative); it normally downloads without problems
imdb = keras.datasets.imdb

# load the data with the two parameters below
vocab_size = 10000  # vocabulary size: keep only the 10,000 most frequent words in the training data; rarer words are dropped
index_from = 3  # ids 0, 1, 2, 3 are reserved for special tokens

# only the 10,000 highest-frequency words are kept; everything else is treated as a special token
# ids smaller than 3 are special tokens, as the code below shows
# note that the raw word index starts at 1, so it still needs to be shifted
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size, index_from=index_from)

# load the word index and check its size; it works like an English dictionary
word_index = imdb.get_word_index()
print(len(word_index))
print(type(word_index))
Build word2idx and idx2word:
word2idx = {word: idx + 3 for word, idx in word_index.items()}  # ids 0-3 are reserved; the raw index starts at 1, so shift by 3
word2idx.update({
    "[PAD]": 0,  # padding token
    "[BOS]": 1,  # begin of sentence
    "[UNK]": 2,  # unknown token
    "[EOS]": 3,  # end of sentence
})
idx2word = {idx: word for word, idx in word2idx.items()}  # reverse dictionary: id -> word
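As a quick sanity check (not part of the original notebook), the first few ids of the first training review can be mapped back to words with idx2word; because index_from=3 matches the +3 shift above, the lookup lines up with the data:

```python
# illustration only: decode the first ten ids of the first review
print(train_data[0][:10])
print(" ".join(idx2word.get(i, "[UNK]") for i in train_data[0][:10]))  # starts with [BOS]
print(train_labels[0])  # 1 = positive review, 0 = negative review
```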
During training, every sample must have a fixed length; here we use 500:
# choose max_length
length_collect = {}
# count how often each sample length occurs
for text in train_data:
    length = len(text)  # sentence length
    length_collect[length] = length_collect.get(length, 0) + 1  # frequency of each length

MAX_LENGTH = 500
plt.bar(length_collect.keys(), length_collect.values())  # length distribution
plt.axvline(MAX_LENGTH, label="max length", c="gray", ls=":")  # reference line; most samples are shorter than 500
plt.legend()
plt.show()
Tokenizer:
We use it to turn sentences into ids.
class Tokenizer:
    def __init__(self, word2idx, idx2word, max_length=500, pad_idx=0, bos_idx=1, eos_idx=3, unk_idx=2):
        self.word2idx = word2idx  # vocabulary: word -> id
        self.idx2word = idx2word  # vocabulary: id -> word
        self.max_length = max_length
        self.pad_idx = pad_idx  # padding
        self.bos_idx = bos_idx  # begin of sentence
        self.eos_idx = eos_idx  # end of sentence
        self.unk_idx = unk_idx  # unknown: any word not among the most frequent ones

    def encode(self, text_list):
        """Convert a list of texts into a list of index sequences.
        :param text_list: the texts of the current batch; must be a 2-D list of strings
        :return:
        """
        # the batch length is capped at max_length (500); if every sentence in the batch is shorter,
        # use the length of the longest sentence in the batch instead; +2 leaves room for [BOS] and [EOS]
        max_length = min(self.max_length, 2 + max([len(text) for text in text_list]))
        indices = []
        for text in text_list:
            index = [self.word2idx.get(word, self.unk_idx) for word in text]  # words -> ids, unknown words become unk_idx
            index = [self.bos_idx] + index + [self.eos_idx]  # add begin and end tokens
            if len(index) < max_length:
                index = index + [self.pad_idx] * (max_length - len(index))  # pad with 0
            else:
                index = index[:max_length]  # truncate sentences longer than max_length
            indices.append(index)
        return torch.tensor(indices)  # 2-D list -> tensor

    def decode(self, indices_list, remove_bos=True, remove_eos=True, remove_pad=True, split=False):
        """Convert a list of index sequences back into texts.
        :param indices_list: the index sequences of a batch
        :param remove_bos:
        :param remove_eos:
        :param remove_pad:
        :param split:
        :return:
        """
        text_list = []
        for indices in indices_list:
            text = []
            for index in indices:
                word = self.idx2word.get(index, "[UNK]")
                if remove_bos and word == "[BOS]":
                    continue
                if remove_eos and word == "[EOS]":
                    break
                if remove_pad and word == "[PAD]":
                    break
                text.append(word)
            text_list.append(" ".join(text) if not split else text)
        return text_list


tokenizer = Tokenizer(word2idx=word2idx, idx2word=idx2word)
raw_text = ["hello world".split(), "tokenize text datas with batch".split(), "this is a test".split()]
indices = tokenizer.encode(raw_text)  # encode works on whole batches
print("raw text")
for raw in raw_text:
    print(raw)
print("indices")
for index in indices:
    print(index)
encode turns a list of texts into a list of index sequences; the input must be a 2-D list of strings.
Why use the longest length within the batch? All samples in a batch must share the same length, so shorter sentences are padded with [PAD] and anything longer than max_length is truncated. A decode round trip is shown below.
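For completeness, a small round trip through decode (illustration only; .tolist() is needed because decode looks ids up as plain Python ints):

```python
# decode the indices produced above back into text; [BOS]/[EOS]/[PAD] are stripped by default
decoded = tokenizer.decode(indices.tolist())
for text in decoded:
    print(text)
```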
Dataset and DataLoader:
from torch.utils.data import Dataset, DataLoader


class IMDBDataset(Dataset):
    def __init__(self, data, labels, remain_length=True):
        if remain_length:  # whether the decoded string samples keep [BOS], [EOS] and [PAD]
            self.data = tokenizer.decode(data, remove_bos=False, remove_eos=False, remove_pad=False)
        else:
            # trim the data a bit
            self.data = tokenizer.decode(data)
        self.labels = labels

    def __getitem__(self, index):
        text = self.data[index]
        label = self.labels[index]
        return text, label

    def __len__(self):
        return len(self.data)


def collate_fct(batch):
    """Turn a batch of samples into tensors.
    :param batch:
    :return:
    """
    # batch holds 128 samples; each sample is a tuple whose first element is the text and second is the label
    text_list = [item[0].split() for item in batch]
    label_list = [item[1] for item in batch]
    text_list = tokenizer.encode(text_list).to(dtype=torch.int)  # text -> indices
    return text_list, torch.tensor(label_list).reshape(-1, 1).to(dtype=torch.float)


train_ds = IMDBDataset(train_data, train_labels)
test_ds = IMDBDataset(test_data, test_labels)
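The training code further down uses train_dl and test_dl, which are not constructed in the cell above. A minimal sketch of the missing DataLoader setup, assuming a batch size of 128 (the number mentioned in the collate_fct comment) and shuffling only the training split:

```python
# assumption: batch size 128; the original cell does not show this step
train_dl = DataLoader(train_ds, batch_size=128, shuffle=True, collate_fn=collate_fct)
test_dl = DataLoader(test_ds, batch_size=128, shuffle=False, collate_fn=collate_fct)
```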
# target output size of 1
m = nn.AdaptiveAvgPool1d(1)  # adaptive average pooling
input = torch.randn(1, 3, 9)
output = m(input)
output.size()  # the last dimension has been reduced to 1
#%%
class AddingModel(nn.Module):
    def __init__(self, embedding_dim=16, hidden_dim=64, vocab_size=vocab_size):
        super(AddingModel, self).__init__()
        self.embeding = nn.Embedding(vocab_size, embedding_dim)  # word embedding
        self.pool = nn.AdaptiveAvgPool1d(1)  # adaptive average pooling, the counterpart of TF's global average pooling
        self.layer = nn.Linear(embedding_dim, hidden_dim)  # fully connected layer
        self.fc = nn.Linear(hidden_dim, 1)  # fully connected output layer

    def forward(self, x):
        # [bs, seq_length] [128, 500] --> [128, 500, 16]
        x = self.embeding(x)
        # print(f'embeding x size:{x.shape}')
        # [bs, seq_length, embedding_dim] --> [bs, embedding_dim, seq_length], i.e. [128, 500, 16] --> [128, 16, 500]
        x = x.permute(0, 2, 1)
        # print(f'permute x size:{x.shape}')
        x = self.pool(x)  # average-pool over the seq_length dimension, one dense vector per sample: [128, 16, 500] --> [128, 16, 1]
        # print(f'pool x size:{x.shape}')
        x = x.squeeze(2)  # [bs, embedding_dim, 1] -> [bs, embedding_dim]
        # [bs, embedding_dim] -> [bs, hidden_dim]
        x = self.layer(x)
        x = self.fc(x)  # [bs, hidden_dim] -> [bs, 1]
        return x


for key, value in AddingModel().named_parameters():
    print(f"{key:^40}parameters num: {np.prod(value.shape)}")
#%%
# count the total number of parameters
print(sum([np.prod(value.shape) for key, value in AddingModel().named_parameters()]))
#%%
16 * 64
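The cell above only checks the 16 × 64 weight matrix of the hidden layer; the full total can be verified by hand (simple arithmetic, not in the original notebook):

```python
# manual parameter count for AddingModel (illustration only)
embedding_params = 10000 * 16      # embedding table: vocab_size x embedding_dim
layer_params = 16 * 64 + 64        # Linear(16, 64): weights + bias
fc_params = 64 * 1 + 1             # Linear(64, 1): weights + bias
print(embedding_params + layer_params + fc_params)  # 161153
```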
#%%
# build a random tensor and run the forward pass of the model above
# random tensor of shape (128, 500); values must lie in [0, 10000)
x = torch.randint(0, 10000, (128, 500)).to(dtype=torch.int)
AddingModel()(x).shape
#%% md
# Training
#%%
from sklearn.metrics import accuracy_score


@torch.no_grad()
def evaluating(model, dataloader, loss_fct):
    loss_list = []
    pred_list = []
    label_list = []
    for datas, labels in dataloader:
        datas = datas.to(device)
        labels = labels.to(device)
        # forward pass
        logits = model(datas)
        loss = loss_fct(logits, labels)  # validation loss
        loss_list.append(loss.item())
        # binary classification: a logit greater than 0 gives a sigmoid probability greater than 0.5, so predict class 1
        preds = logits > 0
        pred_list.extend(preds.cpu().numpy().tolist())
        label_list.extend(labels.cpu().numpy().tolist())
    acc = accuracy_score(label_list, pred_list)
    return np.mean(loss_list), acc
#%% md
### TensorBoard Visualization

During training, the TensorBoard service can be started with the following command.

```shell
# --logdir: where the logs are stored, --host: ip, --port: port
tensorboard \
    --logdir=runs \
    --host 0.0.0.0 \
    --port 8848
```
#%%
from torch.utils.tensorboard import SummaryWriter


class TensorBoardCallback:
    def __init__(self, log_dir, flush_secs=10):
        """
        Args:
            log_dir (str): dir to write log.
            flush_secs (int, optional): write to disk each flush_secs seconds. Defaults to 10.
        """
        self.writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)

    def draw_model(self, model, input_shape):
        self.writer.add_graph(model, input_to_model=torch.randn(input_shape))

    def add_loss_scalars(self, step, loss, val_loss):
        self.writer.add_scalars(
            main_tag="training/loss",
            tag_scalar_dict={"loss": loss, "val_loss": val_loss},
            global_step=step,
        )

    def add_acc_scalars(self, step, acc, val_acc):
        self.writer.add_scalars(
            main_tag="training/accuracy",
            tag_scalar_dict={"accuracy": acc, "val_accuracy": val_acc},
            global_step=step,
        )

    def add_lr_scalars(self, step, learning_rate):
        self.writer.add_scalars(
            main_tag="training/learning_rate",
            tag_scalar_dict={"learning_rate": learning_rate},
            global_step=step,
        )

    def __call__(self, step, **kwargs):
        # add loss
        loss = kwargs.pop("loss", None)
        val_loss = kwargs.pop("val_loss", None)
        if loss is not None and val_loss is not None:
            self.add_loss_scalars(step, loss, val_loss)
        # add acc
        acc = kwargs.pop("acc", None)
        val_acc = kwargs.pop("val_acc", None)
        if acc is not None and val_acc is not None:
            self.add_acc_scalars(step, acc, val_acc)
        # add lr
        learning_rate = kwargs.pop("lr", None)
        if learning_rate is not None:
            self.add_lr_scalars(step, learning_rate)
#%% md
### Save Best
#%%
class SaveCheckpointsCallback:
    def __init__(self, save_dir, save_step=5000, save_best_only=True):
        """Save a checkpoint every save_step steps (training scripts in PyTorch usually evaluate and checkpoint by step).

        Args:
            save_dir (str): dir to save checkpoint
            save_step (int, optional): the frequency to save checkpoint. Defaults to 5000.
            save_best_only (bool, optional): If True, only save the best model; otherwise save a model every save_step steps.
        """
        self.save_dir = save_dir
        self.save_step = save_step
        self.save_best_only = save_best_only
        self.best_metrics = -1

        # mkdir
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)

    def __call__(self, step, state_dict, metric=None):
        if step % self.save_step > 0:
            return
        if self.save_best_only:
            assert metric is not None
            if metric >= self.best_metrics:
                # save checkpoint
                torch.save(state_dict, os.path.join(self.save_dir, "best.ckpt"))
                # update best metric
                self.best_metrics = metric
        else:
            torch.save(state_dict, os.path.join(self.save_dir, f"{step}.ckpt"))
#%% md
### Early Stop
#%%
class EarlyStopCallback:
    def __init__(self, patience=5, min_delta=0.01):
        """
        Args:
            patience (int, optional): Number of epochs with no improvement after which training will be stopped. Defaults to 5.
            min_delta (float, optional): Minimum change in the monitored quantity to qualify as an improvement, i.e. an absolute change of less than min_delta counts as no improvement. Defaults to 0.01.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_metric = -1
        self.counter = 0

    def __call__(self, metric):
        if metric >= self.best_metric + self.min_delta:
            # update best metric
            self.best_metric = metric
            # reset counter
            self.counter = 0
        else:
            self.counter += 1

    @property
    def early_stop(self):
        return self.counter >= self.patience
#%%
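A quick illustration (not in the original notebook) of how the counter behaves: the flag only flips after `patience` consecutive evaluations whose improvement is below min_delta.

```python
# illustration only: simulate five evaluations with stagnating validation accuracy
cb = EarlyStopCallback(patience=3, min_delta=0.01)
for val_acc in [0.70, 0.80, 0.801, 0.802, 0.803]:
    cb(val_acc)
    print(val_acc, cb.counter, cb.early_stop)
# the last three accuracies improve by less than 0.01, so early_stop turns True on the fifth call
```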
# training
def training(
    model,
    train_loader,
    val_loader,
    epoch,
    loss_fct,
    optimizer,
    tensorboard_callback=None,
    save_ckpt_callback=None,
    early_stop_callback=None,
    eval_step=500,
):
    record_dict = {
        "train": [],
        "val": []
    }

    global_step = 0
    model.train()
    with tqdm(total=epoch * len(train_loader)) as pbar:
        for epoch_id in range(epoch):
            # training
            for datas, labels in train_loader:
                datas = datas.to(device)
                labels = labels.to(device)
                # clear gradients
                optimizer.zero_grad()
                # forward pass
                logits = model(datas)
                # compute the loss
                loss = loss_fct(logits, labels)
                # backward pass
                loss.backward()
                # optimizer step (also drives learning-rate changes, if any)
                optimizer.step()
                # a logit greater than 0 means a sigmoid output greater than 0.5, so predict class 1, otherwise class 0
                preds = logits > 0
                acc = accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
                loss = loss.cpu().item()
                # record
                record_dict["train"].append({"loss": loss, "acc": acc, "step": global_step})

                # evaluating
                if global_step % eval_step == 0:
                    model.eval()
                    val_loss, val_acc = evaluating(model, val_loader, loss_fct)
                    record_dict["val"].append({"loss": val_loss, "acc": val_acc, "step": global_step})
                    model.train()

                    # 1. TensorBoard visualization
                    if tensorboard_callback is not None:
                        tensorboard_callback(
                            global_step,
                            loss=loss, val_loss=val_loss,
                            acc=acc, val_acc=val_acc,
                            lr=optimizer.param_groups[0]["lr"],
                        )
                    # 2. save model checkpoint
                    if save_ckpt_callback is not None:
                        save_ckpt_callback(global_step, model.state_dict(), metric=val_acc)
                    # 3. early stop
                    if early_stop_callback is not None:
                        early_stop_callback(val_acc)
                        if early_stop_callback.early_stop:
                            print(f"Early stop at epoch {epoch_id} / global_step {global_step}")
                            return record_dict

                # update step
                global_step += 1
                pbar.update(1)
                pbar.set_postfix({"epoch": epoch_id})

    return record_dict


epoch = 20

model = AddingModel()

# 1. define the loss function: binary cross entropy with logits (sigmoid first, then cross entropy)
loss_fct = F.binary_cross_entropy_with_logits
# loss_fct =nn.BCEWithLogitsLoss()
# 2. define the optimizer: Adam
# Optimizers specified in the torch.optim package
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 1. TensorBoard visualization
if not os.path.exists("runs"):
    os.mkdir("runs")
tensorboard_callback = TensorBoardCallback("runs/imdb-adding")
# tensorboard_callback.draw_model(model, [1, MAX_LENGTH])

# 2. save best
if not os.path.exists("checkpoints"):
    os.makedirs("checkpoints")
save_ckpt_callback = SaveCheckpointsCallback("checkpoints/imdb-adding", save_step=len(train_dl), save_best_only=True)

# 3. early stop
early_stop_callback = EarlyStopCallback(patience=5)

model = model.to(device)

record = training(
    model,
    train_dl,
    test_dl,
    epoch,
    loss_fct,
    optimizer,
    tensorboard_callback=None,
    save_ckpt_callback=save_ckpt_callback,
    early_stop_callback=early_stop_callback,
    eval_step=len(train_dl),
)
#%%
# note when plotting that the loss is not necessarily between 0 and 1
def plot_learning_curves(record_dict, sample_step=500):
    # build DataFrame
    train_df = pd.DataFrame(record_dict["train"]).set_index("step").iloc[::sample_step]
    val_df = pd.DataFrame(record_dict["val"]).set_index("step")

    # plot
    fig_num = len(train_df.columns)
    fig, axs = plt.subplots(1, fig_num, figsize=(5 * fig_num, 5))
    for idx, item in enumerate(train_df.columns):
        axs[idx].plot(train_df.index, train_df[item], label=f"train_{item}")
        axs[idx].plot(val_df.index, val_df[item], label=f"val_{item}")
        axs[idx].grid()
        axs[idx].legend()
        # axs[idx].set_xticks(range(0, train_df.index[-1], 5000))
        # axs[idx].set_xticklabels(map(lambda x: f"{int(x/1000)}k", range(0, train_df.index[-1], 5000)))
        axs[idx].set_xlabel("step")
    plt.show()


plot_learning_curves(record, sample_step=10)  # the x-axis is the training step
#%% md
# Evaluation
#%%
# dataloader for evaluating
# load checkpoints
model.load_state_dict(torch.load("checkpoints/imdb-adding/best.ckpt", weights_only=True, map_location="cpu"))
model.eval()
loss, acc = evaluating(model, test_dl, loss_fct)
print(f"loss: {loss:.4f}\naccuracy: {acc:.4f}")