
1. Embedding and Variable-Length Input Handling

One-hot encoding gives the model no notion of distance between words, e.g. how close "man" is to "gentleman". To give the model that notion, an embedding maps each word to a dense vector, so we can tell whether two words are near-synonyms.

A bit of history:

Word2vec comes in two flavors: Skip-gram, which predicts the surrounding words from a single word, and CBOW, which predicts a single word from its surrounding words.

Skip-gram example: in the sentence "我 知道 彭于晏 很帅" ("I know 彭于晏 is handsome"), the context words form the input and 彭于晏 is the output.

The same holds for "我 知道 吴彦祖 很帅" ("I know 吴彦祖 is handsome").

Because the two sentences provide nearly identical inputs, 彭于晏 and 吴彦祖 can be treated as near-synonyms.

In this way each word is turned into a dense vector; a sketch of how such training pairs are generated follows.
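To make the idea concrete, here is a minimal, illustrative sketch of skip-gram pair generation; the helper name and the window size are my own, not from the original:

```python
# Illustrative skip-gram pair generation (hypothetical helper, window size chosen arbitrarily).
# Each center word is paired with the words within `window` positions of it.
def skipgram_pairs(tokens, window=2):
    pairs = []
    for i, center in enumerate(tokens):
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if j != i:
                pairs.append((center, tokens[j]))  # (input word, word to predict)
    return pairs

print(skipgram_pairs("我 知道 彭于晏 很帅".split()))
# 彭于晏 and 吴彦祖 occur in almost identical pairs across the two example sentences,
# which is why their learned vectors end up close together.
```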

The embedding technique: suppose we have one sample whose word ids are [3, 2, 5, 9, 1].

Assume a vocabulary of 10,000 words and an embedding table of 10,000 rows × 16 columns. Id 3 picks up the dense vector in row 3, and so on, so the 1-D list of ids becomes a 2-D matrix. Once every sample has been turned into dense vectors, the resulting matrix plugs directly into the downstream model and everything is trained end to end. Word2vec, by contrast, is a standalone model: the dense vectors have to be computed first, and only then can the downstream model be trained.
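A minimal sketch of this lookup with PyTorch's nn.Embedding; the 10,000 × 16 table and the ids [3, 2, 5, 9, 1] mirror the example above, but the table here is randomly initialized rather than trained:

```python
import torch
import torch.nn as nn

embedding = nn.Embedding(num_embeddings=10000, embedding_dim=16)  # a 10000 x 16 lookup table
ids = torch.tensor([3, 2, 5, 9, 1])  # one sample of 5 word ids
vectors = embedding(ids)             # each id selects one row of the table
print(vectors.shape)                 # torch.Size([5, 16]): the 1-D id list became a 2-D matrix
```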

2. Code

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F

print(sys.version_info)
for module in mpl, np, pd, sklearn, torch:
    print(module.__name__, module.__version__)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)

seed = 42
#%% md
# Prepare the data
#%%
from tensorflow import keras  # you can ignore the IDE squiggle here
# we use the IMDB dataset that ships with keras: movie reviews labeled as positive or negative;
# it normally downloads without problems
imdb = keras.datasets.imdb
# the two parameters used when loading the data
vocab_size = 10000  # vocabulary size: keep only the 10,000 most frequent words in the training data; rarer words are dropped
index_from = 3  # reserve ids 0, 1, 2, 3 for special purposes
# the 10,000 most frequent words are kept; everything else is treated as a special token
# ids below index_from are special tokens, as the code below shows
# note that the word index returned by keras still starts at 1 and needs to be shifted
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size, index_from=index_from)
# load the word index and check its size; it works like a dictionary
word_index = imdb.get_word_index()
print(len(word_index))
print(type(word_index))

Build word2idx and idx2word:

word2idx = {word: idx + 3 for word, idx in word_index.items()}  # the raw index starts at 1; shift by 3 to reserve ids 0-3
word2idx.update({
    "[PAD]": 0,  # padding token
    "[BOS]": 1,  # begin of sentence
    "[UNK]": 2,  # unknown token
    "[EOS]": 3,  # end of sentence
})
idx2word = {idx: word for word, idx in word2idx.items()}  # reverse dictionary: id -> word

During training every sample must have a fixed length; here we use 500:

# choose max_length
length_collect = {}
# count how many samples there are of each length
for text in train_data:
    length = len(text)  # sentence length
    length_collect[length] = length_collect.get(length, 0) + 1  # frequency of this length

MAX_LENGTH = 500
plt.bar(length_collect.keys(), length_collect.values())  # length distribution
plt.axvline(MAX_LENGTH, label="max length", c="gray", ls=":")  # vertical line: most samples are shorter than 500
plt.legend()
plt.show()

Tokenizer:

We use it to turn sentences into ids.

class Tokenizer:
    def __init__(self, word2idx, idx2word, max_length=500, pad_idx=0, bos_idx=1, eos_idx=3, unk_idx=2):
        self.word2idx = word2idx  # vocabulary: word -> id
        self.idx2word = idx2word  # vocabulary: id -> word
        self.max_length = max_length
        self.pad_idx = pad_idx  # padding
        self.bos_idx = bos_idx  # begin of sentence
        self.eos_idx = eos_idx  # end of sentence
        self.unk_idx = unk_idx  # unknown word (not among the most frequent words)

    def encode(self, text_list):
        """Convert a batch of texts into index lists.
        :param text_list: the current batch; must be a 2-D list of strings
        """
        # batch max length: capped at 500, but if every sentence in the batch is shorter,
        # use the longest sentence in the batch; +2 reserves room for [BOS] and [EOS]
        max_length = min(self.max_length, 2 + max([len(text) for text in text_list]))
        indices = []
        for text in text_list:
            index = [self.word2idx.get(word, self.unk_idx) for word in text]  # word -> id; unknown words map to unk_idx
            index = [self.bos_idx] + index + [self.eos_idx]  # add begin and end markers
            if len(index) < max_length:
                index = index + [self.pad_idx] * (max_length - len(index))  # pad with 0
            else:
                index = index[:max_length]  # truncate sentences longer than max_length
            indices.append(index)
        return torch.tensor(indices)  # 2-D list -> tensor

    def decode(self, indices_list, remove_bos=True, remove_eos=True, remove_pad=True, split=False):
        """Convert a batch of index lists back into texts.
        :param indices_list: a batch of index lists
        """
        text_list = []
        for indices in indices_list:
            text = []
            for index in indices:
                word = self.idx2word.get(index, "[UNK]")
                if remove_bos and word == "[BOS]":
                    continue
                if remove_eos and word == "[EOS]":
                    break
                if remove_pad and word == "[PAD]":
                    break
                text.append(word)
            text_list.append(" ".join(text) if not split else text)
        return text_list


tokenizer = Tokenizer(word2idx=word2idx, idx2word=idx2word)
raw_text = ["hello world".split(), "tokenize text datas with batch".split(), "this is a test".split()]
indices = tokenizer.encode(raw_text)  # encode works on whole batches
print("raw text")
for raw in raw_text:
    print(raw)
print("indices")
for index in indices:
    print(index)

encode converts a list of texts into a list of index lists; the input must be a 2-D list of strings.

Why use the longest length within the batch? Everything shorter is padded up to it, and anything longer is truncated. A quick round-trip check is sketched below.
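As a quick sanity check (not part of the original notebook), the encoded batch can be decoded back; decode expects plain Python lists of ids, so convert the tensor first:

```python
# round-trip check: ids back to words ([BOS]/[EOS]/[PAD] are stripped by default;
# words that were not in the vocabulary come back as [UNK])
print(tokenizer.decode(indices.tolist()))
```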

Dataset and DataLoader:

from torch.utils.data import Dataset, DataLoader


class IMDBDataset(Dataset):
    def __init__(self, data, labels, remain_length=True):
        if remain_length:
            # keep [BOS], [EOS] and [PAD] in the decoded text
            self.data = tokenizer.decode(data, remove_bos=False, remove_eos=False, remove_pad=False)
        else:
            # strip the special tokens to shrink the data
            self.data = tokenizer.decode(data)
        self.labels = labels

    def __getitem__(self, index):
        text = self.data[index]
        label = self.labels[index]
        return text, label

    def __len__(self):
        return len(self.data)


def collate_fct(batch):
    """Turn a batch (a list of (text, label) tuples, e.g. 128 samples) into tensors."""
    text_list = [item[0].split() for item in batch]
    label_list = [item[1] for item in batch]
    text_list = tokenizer.encode(text_list).to(dtype=torch.int)  # text -> indices
    return text_list, torch.tensor(label_list).reshape(-1, 1).to(dtype=torch.float)


train_ds = IMDBDataset(train_data, train_labels)
test_ds = IMDBDataset(test_data, test_labels)
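The training and evaluation code further down reads batches from train_dl and test_dl, which this excerpt never constructs. A minimal sketch, assuming a batch size of 128 (the batch size is an assumption; the collate function is the one defined above):

```python
# assumed DataLoader setup: batch size 128 is a guess, shuffle only the training split
batch_size = 128
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, collate_fn=collate_fct)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fct)
```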
# a quick demo of adaptive average pooling with target output size 1
m = nn.AdaptiveAvgPool1d(1)  # adaptive average pooling
input = torch.randn(1, 3, 9)
output = m(input)
output.size()  # the last dimension has been reduced to 1
#%%
class AddingModel(nn.Module):
    def __init__(self, embedding_dim=16, hidden_dim=64, vocab_size=vocab_size):
        super(AddingModel, self).__init__()
        self.embeding = nn.Embedding(vocab_size, embedding_dim)  # word embedding
        self.pool = nn.AdaptiveAvgPool1d(1)  # adaptive average pooling (global average pooling in TF terms)
        self.layer = nn.Linear(embedding_dim, hidden_dim)  # fully connected layer
        self.fc = nn.Linear(hidden_dim, 1)  # output layer

    def forward(self, x):
        # [bs, seq_length], e.g. [128, 500] --> [128, 500, 16]
        x = self.embeding(x)
        # [bs, seq_length, embedding_dim] --> [bs, embedding_dim, seq_length], i.e. [128, 500, 16] --> [128, 16, 500]
        x = x.permute(0, 2, 1)
        # average-pool over the seq_length dimension, one dense vector per sample: [128, 16, 500] --> [128, 16, 1]
        x = self.pool(x)
        x = x.squeeze(2)  # [bs, embedding_dim, 1] -> [bs, embedding_dim]
        x = self.layer(x)  # [bs, embedding_dim] -> [bs, hidden_dim]
        x = self.fc(x)  # [bs, hidden_dim] -> [bs, 1]
        return x


for key, value in AddingModel().named_parameters():
    print(f"{key:^40}parameters num: {np.prod(value.shape)}")
#%%
# total number of parameters
print(sum([np.prod(value.shape) for key, value in AddingModel().named_parameters()]))
#%%
16 * 64
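For reference, the total breaks down as follows (derived from the layer shapes above): the embedding table has 10,000 × 16 = 160,000 parameters, the hidden layer 16 × 64 weights + 64 biases = 1,088 (the 16 * 64 cell above checks the weight part), and the output layer 64 + 1 = 65, giving 161,153 in total.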
#%%
# run a forward pass of the model on a random tensor
# random tensor of shape (128, 500); values must lie in [0, 10000)
x = torch.randint(0, 10000, (128, 500)).to(dtype=torch.int)
AddingModel()(x).shape
#%% md
# Training
#%%
from sklearn.metrics import accuracy_score


@torch.no_grad()
def evaluating(model, dataloader, loss_fct):
    loss_list = []
    pred_list = []
    label_list = []
    for datas, labels in dataloader:
        datas = datas.to(device)
        labels = labels.to(device)
        # forward pass
        logits = model(datas)
        loss = loss_fct(logits, labels)  # validation loss
        loss_list.append(loss.item())
        # binary classification: a logit > 0 means sigmoid(logit) > 0.5, i.e. class 1
        preds = logits > 0
        pred_list.extend(preds.cpu().numpy().tolist())
        label_list.extend(labels.cpu().numpy().tolist())
    acc = accuracy_score(label_list, pred_list)
    return np.mean(loss_list), acc
#%% md
### TensorBoard visualization

During training, the TensorBoard server can be started with the following command.

```shell
tensorboard \
    --logdir=runs \     # log directory
    --host 0.0.0.0 \    # ip
    --port 8848         # port
```
#%%
from torch.utils.tensorboard import SummaryWriter


class TensorBoardCallback:
    def __init__(self, log_dir, flush_secs=10):
        """
        Args:
            log_dir (str): dir to write log.
            flush_secs (int, optional): write to disk each flush_secs seconds. Defaults to 10.
        """
        self.writer = SummaryWriter(log_dir=log_dir, flush_secs=flush_secs)

    def draw_model(self, model, input_shape):
        self.writer.add_graph(model, input_to_model=torch.randn(input_shape))

    def add_loss_scalars(self, step, loss, val_loss):
        self.writer.add_scalars(
            main_tag="training/loss",
            tag_scalar_dict={"loss": loss, "val_loss": val_loss},
            global_step=step,
        )

    def add_acc_scalars(self, step, acc, val_acc):
        self.writer.add_scalars(
            main_tag="training/accuracy",
            tag_scalar_dict={"accuracy": acc, "val_accuracy": val_acc},
            global_step=step,
        )

    def add_lr_scalars(self, step, learning_rate):
        self.writer.add_scalars(
            main_tag="training/learning_rate",
            tag_scalar_dict={"learning_rate": learning_rate},
            global_step=step,
        )

    def __call__(self, step, **kwargs):
        # add loss
        loss = kwargs.pop("loss", None)
        val_loss = kwargs.pop("val_loss", None)
        if loss is not None and val_loss is not None:
            self.add_loss_scalars(step, loss, val_loss)
        # add acc
        acc = kwargs.pop("acc", None)
        val_acc = kwargs.pop("val_acc", None)
        if acc is not None and val_acc is not None:
            self.add_acc_scalars(step, acc, val_acc)
        # add lr
        learning_rate = kwargs.pop("lr", None)
        if learning_rate is not None:
            self.add_lr_scalars(step, learning_rate)
#%% md
### Save Best
#%%
class SaveCheckpointsCallback:
    def __init__(self, save_dir, save_step=5000, save_best_only=True):
        """Save checkpoints every save_step steps. Training scripts with pytorch usually
        evaluate the model and save a checkpoint by step.

        Args:
            save_dir (str): dir to save checkpoint
            save_step (int, optional): the frequency to save checkpoint. Defaults to 5000.
            save_best_only (bool, optional): If True, only save the best model; otherwise save a model every save_step steps.
        """
        self.save_dir = save_dir
        self.save_step = save_step
        self.save_best_only = save_best_only
        self.best_metrics = -1
        # mkdir
        if not os.path.exists(self.save_dir):
            os.mkdir(self.save_dir)

    def __call__(self, step, state_dict, metric=None):
        if step % self.save_step > 0:
            return
        if self.save_best_only:
            assert metric is not None
            if metric >= self.best_metrics:
                # save checkpoint
                torch.save(state_dict, os.path.join(self.save_dir, "best.ckpt"))
                # update best metric
                self.best_metrics = metric
        else:
            torch.save(state_dict, os.path.join(self.save_dir, f"{step}.ckpt"))
#%% md
### Early Stop
#%%
class EarlyStopCallback:
    def __init__(self, patience=5, min_delta=0.01):
        """
        Args:
            patience (int, optional): Number of epochs with no improvement after which training will be stopped. Defaults to 5.
            min_delta (float, optional): Minimum change in the monitored quantity to qualify as an improvement,
                i.e. an absolute change of less than min_delta counts as no improvement. Defaults to 0.01.
        """
        self.patience = patience
        self.min_delta = min_delta
        self.best_metric = -1
        self.counter = 0

    def __call__(self, metric):
        if metric >= self.best_metric + self.min_delta:
            # update best metric
            self.best_metric = metric
            # reset counter
            self.counter = 0
        else:
            self.counter += 1

    @property
    def early_stop(self):
        return self.counter >= self.patience
#%%
# training loop
def training(
    model,
    train_loader,
    val_loader,
    epoch,
    loss_fct,
    optimizer,
    tensorboard_callback=None,
    save_ckpt_callback=None,
    early_stop_callback=None,
    eval_step=500,
):
    record_dict = {
        "train": [],
        "val": [],
    }
    global_step = 0
    model.train()
    with tqdm(total=epoch * len(train_loader)) as pbar:
        for epoch_id in range(epoch):
            # training
            for datas, labels in train_loader:
                datas = datas.to(device)
                labels = labels.to(device)
                # clear gradients
                optimizer.zero_grad()
                # forward pass
                logits = model(datas)
                # compute loss
                loss = loss_fct(logits, labels)
                # backward pass
                loss.backward()
                # optimizer step (learning-rate adjustments etc.)
                optimizer.step()
                # a logit > 0 corresponds to sigmoid(logit) > 0.5, i.e. prediction 1
                preds = logits > 0
                acc = accuracy_score(labels.cpu().numpy(), preds.cpu().numpy())
                loss = loss.cpu().item()
                # record
                record_dict["train"].append({"loss": loss, "acc": acc, "step": global_step})
                # evaluating
                if global_step % eval_step == 0:
                    model.eval()
                    val_loss, val_acc = evaluating(model, val_loader, loss_fct)
                    record_dict["val"].append({"loss": val_loss, "acc": val_acc, "step": global_step})
                    model.train()
                    # 1. TensorBoard visualization
                    if tensorboard_callback is not None:
                        tensorboard_callback(
                            global_step,
                            loss=loss, val_loss=val_loss,
                            acc=acc, val_acc=val_acc,
                            lr=optimizer.param_groups[0]["lr"],
                        )
                    # 2. save model checkpoint
                    if save_ckpt_callback is not None:
                        save_ckpt_callback(global_step, model.state_dict(), metric=val_acc)
                    # 3. early stop
                    if early_stop_callback is not None:
                        early_stop_callback(val_acc)
                        if early_stop_callback.early_stop:
                            print(f"Early stop at epoch {epoch_id} / global_step {global_step}")
                            return record_dict
                # update step
                global_step += 1
                pbar.update(1)
                pbar.set_postfix({"epoch": epoch_id})
    return record_dict


epoch = 20

model = AddingModel()

# 1. loss function: binary cross entropy with logits (applies sigmoid, then cross entropy)
loss_fct = F.binary_cross_entropy_with_logits
# loss_fct = nn.BCEWithLogitsLoss()
# 2. optimizer: Adam
# Optimizers specified in the torch.optim package
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 1. tensorboard visualization
if not os.path.exists("runs"):
    os.mkdir("runs")
tensorboard_callback = TensorBoardCallback("runs/imdb-adding")
# tensorboard_callback.draw_model(model, [1, MAX_LENGTH])
# 2. save best
if not os.path.exists("checkpoints"):
    os.makedirs("checkpoints")
save_ckpt_callback = SaveCheckpointsCallback("checkpoints/imdb-adding", save_step=len(train_dl), save_best_only=True)
# 3. early stop
early_stop_callback = EarlyStopCallback(patience=5)

model = model.to(device)

record = training(
    model,
    train_dl,
    test_dl,
    epoch,
    loss_fct,
    optimizer,
    tensorboard_callback=None,
    save_ckpt_callback=save_ckpt_callback,
    early_stop_callback=early_stop_callback,
    eval_step=len(train_dl)
)
#%%
# note when plotting: the loss is not necessarily bounded between 0 and 1
def plot_learning_curves(record_dict, sample_step=500):
    # build DataFrames
    train_df = pd.DataFrame(record_dict["train"]).set_index("step").iloc[::sample_step]
    val_df = pd.DataFrame(record_dict["val"]).set_index("step")

    # plot
    fig_num = len(train_df.columns)
    fig, axs = plt.subplots(1, fig_num, figsize=(5 * fig_num, 5))
    for idx, item in enumerate(train_df.columns):
        axs[idx].plot(train_df.index, train_df[item], label=f"train_{item}")
        axs[idx].plot(val_df.index, val_df[item], label=f"val_{item}")
        axs[idx].grid()
        axs[idx].legend()
        # axs[idx].set_xticks(range(0, train_df.index[-1], 5000))
        # axs[idx].set_xticklabels(map(lambda x: f"{int(x/1000)}k", range(0, train_df.index[-1], 5000)))
        axs[idx].set_xlabel("step")
    plt.show()


plot_learning_curves(record, sample_step=10)  # the x axis is steps
#%% md
# Evaluation
#%%
# load the best checkpoint for evaluation
model.load_state_dict(torch.load("checkpoints/imdb-adding/best.ckpt", weights_only=True, map_location="cpu"))
model.eval()
loss, acc = evaluating(model, test_dl, loss_fct)
print(f"loss:     {loss:.4f}\naccuracy: {acc:.4f}")
