
Personal notes 006

006

import math
from collections import defaultdict


class NgramLanguageModel:
    def __init__(self, corpus=None, n=3):
        self.n = n
        self.sep = "_"        # separator used to join words; any symbol absent from the vocabulary works
        self.sos = "<sos>"    # start-of-sentence token
        self.eos = "<eos>"    # end-of-sentence token
        self.unk_prob = 1e-5  # small probability for OOV words, so they never get probability 0
        self.fix_backoff_prob = 0.4  # fixed back-off probability
        self.ngram_count_dict = dict((x + 1, defaultdict(int)) for x in range(n))
        self.ngram_count_prob_dict = dict((x + 1, defaultdict(int)) for x in range(n))
        self.ngram_count(corpus)
        self.calc_ngram_prob()

    # split the text into words / characters / tokens
    def sentence_segment(self, sentence):
        return sentence.split()
        # return jieba.lcut(sentence)

    # count n-grams
    def ngram_count(self, corpus):
        for sentence in corpus:
            word_lists = self.sentence_segment(sentence)
            word_lists = [self.sos] + word_lists + [self.eos]  # pad with start and end tokens
            for window_size in range(1, self.n + 1):           # scan the text with each window size
                for index, word in enumerate(word_lists):
                    # near the end the window becomes shorter than the requested gram size; skip those
                    if len(word_lists[index:index + window_size]) != window_size:
                        continue
                    # join the words with the separator to form one n-gram key
                    ngram = self.sep.join(word_lists[index:index + window_size])
                    self.ngram_count_dict[window_size][ngram] += 1
        # total word count, used later for unigram probabilities
        self.ngram_count_dict[0] = sum(self.ngram_count_dict[1].values())
        return

    # compute n-gram probabilities
    def calc_ngram_prob(self):
        for window_size in range(1, self.n + 1):
            for ngram, count in self.ngram_count_dict[window_size].items():
                if window_size > 1:
                    ngram_splits = ngram.split(self.sep)              # ngram        : a b c
                    ngram_prefix = self.sep.join(ngram_splits[:-1])   # ngram_prefix : a b
                    ngram_prefix_count = self.ngram_count_dict[window_size - 1][ngram_prefix]  # Count(a,b)
                else:
                    ngram_prefix_count = self.ngram_count_dict[0]     # count(total words)
                # word = ngram_splits[-1]
                # self.ngram_count_prob_dict[word + "|" + ngram_prefix] = count / ngram_prefix_count
                self.ngram_count_prob_dict[window_size][ngram] = count / ngram_prefix_count
        return

    # look up an n-gram probability, with back-off smoothing using a fixed back-off weight
    def get_ngram_prob(self, ngram):
        n = len(ngram.split(self.sep))
        if ngram in self.ngram_count_prob_dict[n]:
            # seen n-gram: return the stored probability directly
            return self.ngram_count_prob_dict[n][ngram]
        elif n == 1:
            # an unseen unigram is an out-of-vocabulary word; no further back-off
            return self.unk_prob
        else:
            # higher-order n-grams back off to the (n-1)-gram
            ngram = self.sep.join(ngram.split(self.sep)[1:])
            return self.fix_backoff_prob * self.get_ngram_prob(ngram)

    # sentence perplexity using the back-off probabilities
    def calc_sentence_ppl(self, sentence):
        word_list = self.sentence_segment(sentence)
        word_list = [self.sos] + word_list + [self.eos]
        sentence_prob = 0
        for index, word in enumerate(word_list):
            ngram = self.sep.join(word_list[max(0, index - self.n + 1):index + 1])
            prob = self.get_ngram_prob(ngram)
            # print(ngram, prob)
            sentence_prob += math.log(prob, 2)  # log base 2 to match the 2 ** (...) below
        return 2 ** (sentence_prob * (-1 / len(word_list)))


if __name__ == "__main__":
    corpus = open("sample.txt", encoding="utf8").readlines()
    lm = NgramLanguageModel(corpus, 3)
    print("total word count:", lm.ngram_count_dict[0])
    print(lm.ngram_count_prob_dict)
    print(lm.calc_sentence_ppl("a c b e f d"))
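A quick way to see the fixed-weight back-off in action is to feed the class a tiny in-memory corpus instead of sample.txt. The sketch below is illustrative only: the toy corpus and the queried n-grams are made up, and it assumes the NgramLanguageModel class above is available unchanged.

# Toy in-memory corpus (hypothetical data), one "sentence" per string.
toy_corpus = [
    "a b c d",
    "a b c e",
    "b c d e",
]
toy_lm = NgramLanguageModel(toy_corpus, n=3)

# "a_b_c" was seen during counting, so its trigram probability is returned directly.
print(toy_lm.get_ngram_prob("a_b_c"))

# "e_b_c" was never seen, so the model backs off:
# P(c | e, b) ~= 0.4 * P(c | b), and down to the unigram (or unk_prob) if needed.
print(toy_lm.get_ngram_prob("e_b_c"))

# Compare the perplexity of a word order that occurs in the corpus with a reshuffled one.
print(toy_lm.calc_sentence_ppl("a b c d"), toy_lm.calc_sentence_ppl("d c b a"))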

next:

#coding:utf8
import torch
import torch.nn as nn
import numpy as np
import math
import random
import re
import os
from nnlm import build_model, build_vocab

"""
Use the trained language models.
"""

def load_trained_language_model(path):
    char_dim = 128        # per-character embedding dimension; must match training
    window_size = 6       # sample text length; must match training
    vocab = build_vocab("vocab.txt")      # load the character vocabulary
    model = build_model(vocab, char_dim)  # build the model
    model.load_state_dict(torch.load(path))  # load the trained weights
    model.eval()
    if torch.cuda.is_available():
        model = model.cuda()
    model.window_size = window_size
    model.vocab = vocab
    return model

# compute the perplexity of a text
def calc_perplexity(sentence, model):
    prob = 0
    with torch.no_grad():
        for i in range(1, len(sentence)):
            start = max(0, i - model.window_size)
            window = sentence[start:i]
            x = [model.vocab.get(char, model.vocab["<UNK>"]) for char in window]
            x = torch.LongTensor([x])
            target = sentence[i]
            target_index = model.vocab.get(target, model.vocab["<UNK>"])
            if torch.cuda.is_available():
                x = x.cuda()
            pred_prob_distribute = model(x)[0]
            target_prob = pred_prob_distribute[target_index]
            # print(window, "->", target, "prob:", float(target_prob))
            prob += math.log(target_prob, 2)  # log base 2 to match the 2 ** (...) below
    return 2 ** (prob * (-1 / len(sentence)))

# load all trained models
def load_models():
    model_paths = os.listdir(os.path.dirname(os.path.abspath(__file__)) + "/model")
    class_to_model = {}
    for model_path in model_paths:
        class_name = model_path.replace(".pth", "")
        model_path = os.path.join("model", model_path)
        class_to_model[class_name] = load_trained_language_model(model_path)
    return class_to_model

# text classification based on language models
# class_to_model: {"class1": <language model obj1>, "class2": <language model obj2>, ...}
# each language model is trained on the corpus of its own domain
def text_classification_based_on_language_model(class_to_model, sentence):
    ppl = []
    for class_name, class_lm in class_to_model.items():
        # compute the perplexity under each language model
        ppl.append([class_name, calc_perplexity(sentence, class_lm)])
    ppl = sorted(ppl, key=lambda x: x[1])
    print(sentence)
    print(ppl[0:3])
    print("==================")
    return ppl


sentences = ["在全球货币体系出现危机的情况下",
             "点击进入双色球玩法经典选号图表",
             "慢时尚服饰最大的优点是独特",
             "做处女座朋友的人真的很难",
             "网戒中心要求家长全程陪护",
             "在欧巡赛扭转了自己此前不利的状态",
             "选择独立的别墅会比公寓更适合你",
             ]

class_to_model = load_models()
for s in sentences:
    text_classification_based_on_language_model(class_to_model, s)
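Stripped of the model loading, the decision rule is simply "the class whose language model assigns the lowest perplexity wins". A minimal self-contained sketch of that rule, with made-up perplexity values standing in for real domain language models:

# Hypothetical perplexities that a set of domain LMs might assign to one sentence.
ppl_by_class = {"finance": 42.7, "sports": 18.3, "fashion": 95.1}

# The predicted class is the one whose LM is least "surprised" by the sentence.
predicted = min(ppl_by_class, key=ppl_by_class.get)
print(predicted)  # -> "sports"

# Equivalent to sorting ascending by perplexity and taking the first entry,
# which is what text_classification_based_on_language_model does above.
ranked = sorted(ppl_by_class.items(), key=lambda kv: kv[1])
print(ranked)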

next:

#coding:utf8
import torch
import torch.nn as nn
import numpy as np
import math
import random
import os
import re
import matplotlib.pyplot as plt

"""
An RNN language model based on PyTorch.
"""

class LanguageModel(nn.Module):
    def __init__(self, input_dim, vocab):
        super(LanguageModel, self).__init__()
        self.embedding = nn.Embedding(len(vocab) + 1, input_dim)
        self.layer = nn.RNN(input_dim, input_dim, num_layers=2, batch_first=True)
        self.classify = nn.Linear(input_dim, len(vocab) + 1)
        self.dropout = nn.Dropout(0.1)
        self.loss = nn.functional.cross_entropy

    # with gold labels, return the loss; without, return the predicted distribution
    def forward(self, x, y=None):
        x = self.embedding(x)       # output shape: (batch_size, sen_len, input_dim)
        x, _ = self.layer(x)        # output shape: (batch_size, sen_len, input_dim)
        x = x[:, -1, :]             # output shape: (batch_size, input_dim)
        x = self.dropout(x)
        y_pred = self.classify(x)   # output shape: (batch_size, vocab_size)
        if y is not None:
            return self.loss(y_pred, y)
        else:
            return torch.softmax(y_pred, dim=-1)

# read the corpus to build the character set
# writes the vocabulary out to vocab.txt
def build_vocab_from_corpus(path):
    vocab = set()
    with open(path, encoding="utf8") as f:
        for index, char in enumerate(f.read()):
            vocab.add(char)
    vocab.add("<UNK>")  # add an unk token for out-of-vocabulary characters
    writer = open("vocab.txt", "w", encoding="utf8")
    for char in sorted(vocab):
        writer.write(char + "\n")
    writer.close()
    return vocab

# load the character vocabulary
def build_vocab(vocab_path):
    vocab = {}
    with open(vocab_path, encoding="utf8") as f:
        for index, line in enumerate(f):
            char = line[:-1]         # strip the trailing newline
            vocab[char] = index + 1  # index 0 is reserved for the pad token
    vocab["\n"] = 1
    return vocab

# load the corpus
def load_corpus(path):
    return open(path, encoding="utf8").read()

# generate one random sample:
# take a random window from the text; the first n characters are the input, the next character is the target
def build_sample(vocab, window_size, corpus):
    start = random.randint(0, len(corpus) - 1 - window_size)
    end = start + window_size
    window = corpus[start:end]
    target = corpus[end]
    # print(window, target)
    x = [vocab.get(word, vocab["<UNK>"]) for word in window]   # map characters to indices
    y = vocab[target]
    return x, y

# build a dataset
# sample_length: number of samples to generate
# vocab: character vocabulary
# window_size: sample length
# corpus: corpus string
def build_dataset(sample_length, vocab, window_size, corpus):
    dataset_x = []
    dataset_y = []
    for i in range(sample_length):
        x, y = build_sample(vocab, window_size, corpus)
        dataset_x.append(x)
        dataset_y.append(y)
    return torch.LongTensor(dataset_x), torch.LongTensor(dataset_y)

# build the model
def build_model(vocab, char_dim):
    model = LanguageModel(char_dim, vocab)
    return model

# compute the perplexity of a text
def calc_perplexity(sentence, model, vocab, window_size):
    prob = 0
    model.eval()
    with torch.no_grad():
        for i in range(1, len(sentence)):
            start = max(0, i - window_size)
            window = sentence[start:i]
            x = [vocab.get(char, vocab["<UNK>"]) for char in window]
            x = torch.LongTensor([x])
            target = sentence[i]
            target_index = vocab.get(target, vocab["<UNK>"])
            if torch.cuda.is_available():
                x = x.cuda()
            pred_prob_distribute = model(x)[0]
            target_prob = pred_prob_distribute[target_index]
            prob += math.log(target_prob, 2)  # log base 2 to match the 2 ** (...) below
    return 2 ** (prob * (-1 / len(sentence)))

def train(corpus_path, save_weight=True):
    epoch_num = 10         # number of training epochs
    batch_size = 128       # samples per batch
    train_sample = 10000   # samples per epoch
    char_dim = 128         # per-character embedding dimension
    window_size = 6        # sample text length
    vocab = build_vocab("vocab.txt")      # build the character vocabulary
    corpus = load_corpus(corpus_path)     # load the corpus
    model = build_model(vocab, char_dim)  # build the model
    if torch.cuda.is_available():
        model = model.cuda()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)   # optimizer
    for epoch in range(epoch_num):
        model.train()
        watch_loss = []
        for batch in range(int(train_sample / batch_size)):
            x, y = build_dataset(batch_size, vocab, window_size, corpus)  # build one training batch
            if torch.cuda.is_available():
                x, y = x.cuda(), y.cuda()
            optim.zero_grad()    # reset gradients
            loss = model(x, y)   # compute the loss
            watch_loss.append(loss.item())
            loss.backward()      # back-propagate
            optim.step()         # update the weights
        print("=========\nepoch %d, mean loss: %f" % (epoch + 1, np.mean(watch_loss)))
    if not save_weight:
        return
    else:
        base_name = os.path.basename(corpus_path).replace("txt", "pth")
        model_path = os.path.join("model", base_name)
        torch.save(model.state_dict(), model_path)
        return

# train on every corpus file under the corpus folder and save each trained model,
# named after its corpus file, into the model folder
def train_all():
    for path in os.listdir("corpus"):
        corpus_path = os.path.join("corpus", path)
        train(corpus_path)


if __name__ == "__main__":
    # build_vocab_from_corpus("corpus/all.txt")
    # train("corpus.txt", True)
    train_all()
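When called without a label, the model's forward pass returns a softmax distribution over the next character, which is exactly what calc_perplexity consumes. The sketch below queries that distribution directly; the checkpoint path, the context string, and the assumption that training used char_dim=128 and window_size=6 are placeholders rather than values guaranteed by this project.

# Minimal sketch, assuming vocab.txt and a trained checkpoint exist at these
# placeholder paths and that training used char_dim=128 and window_size=6.
vocab = build_vocab("vocab.txt")
model = build_model(vocab, 128)
model.load_state_dict(torch.load("model/finance.pth", map_location="cpu"))  # hypothetical checkpoint name
model.eval()

window = "今天股市行情"[-6:]   # last window_size characters of some context (arbitrary example text)
x = torch.LongTensor([[vocab.get(c, vocab["<UNK>"]) for c in window]])
with torch.no_grad():
    dist = model(x)[0]          # softmax distribution over the vocabulary
top_index = int(torch.argmax(dist))
index_to_char = {i: c for c, i in vocab.items()}
print("most likely next char:", index_to_char.get(top_index, "<UNK>"), float(dist[top_index]))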

next:

#coding:utf8
import torch
import torch.nn as nn
import numpy as np
import random
import json
from transformers import BertModel

"""
A network written with PyTorch for a simple NLP task:
decide which of certain special characters appear in a text.
This is the week-2 example, modified to use BERT.
"""

class TorchModel(nn.Module):
    def __init__(self, input_dim, sentence_length, vocab):
        super(TorchModel, self).__init__()
        # original code
        # self.embedding = nn.Embedding(len(vocab) + 1, input_dim)
        # self.layer = nn.Linear(input_dim, input_dim)
        # self.pool = nn.MaxPool1d(sentence_length)
        self.bert = BertModel.from_pretrained(r"F:\Desktop\work_space\pretrain_models\bert-base-chinese", return_dict=False)
        self.classify = nn.Linear(input_dim, 3)
        self.activation = torch.sigmoid     # sigmoid activation
        self.dropout = nn.Dropout(0.5)
        # note: cross_entropy applies log-softmax itself, so the raw logits from self.classify would also work here
        self.loss = nn.functional.cross_entropy

    # with gold labels, return the loss; without, return the predictions
    def forward(self, x, y=None):
        # original code
        # x = self.embedding(x)  # input shape: (batch_size, sen_len) (10, 6)
        # x = self.layer(x)      # input shape: (batch_size, sen_len, input_dim) (10, 6, 20)
        # x = self.dropout(x)
        # x = self.activation(x)
        # x = self.pool(x.transpose(1, 2)).squeeze()
        sequence_output, pooler_output = self.bert(x)
        x = self.classify(pooler_output)
        y_pred = self.activation(x)
        if y is not None:
            return self.loss(y_pred, y.squeeze())
        else:
            return y_pred

# the character set is a few arbitrary characters; it could be extended
# assign each character an index
# {"a": 1, "b": 2, "c": 3, ...}
# abc -> [1, 2, 3]
def build_vocab():
    chars = "abcdefghijklmnopqrstuvwxyz"  # character set
    vocab = {}
    for index, char in enumerate(chars):
        vocab[char] = index + 1   # each character gets an index
    vocab['unk'] = len(vocab) + 1
    return vocab

# generate one random sample:
# pick sentence_length characters from the character set
# and label the sample according to which special characters it contains
def build_sample(vocab, sentence_length):
    # sample sentence_length characters from the vocabulary, possibly with repeats
    x = [random.choice(list(vocab.keys())) for _ in range(sentence_length)]
    # class A
    if set("abc") & set(x) and not set("xyz") & set(x):
        y = 0
    # class B
    elif not set("abc") & set(x) and set("xyz") & set(x):
        y = 1
    # class C
    else:
        y = 2
    x = [vocab.get(word, vocab['unk']) for word in x]   # map characters to indices for the embedding
    return x, y

# build a dataset
# sample_length: number of samples to generate
def build_dataset(sample_length, vocab, sentence_length):
    dataset_x = []
    dataset_y = []
    for i in range(sample_length):
        x, y = build_sample(vocab, sentence_length)
        dataset_x.append(x)
        dataset_y.append([y])
    return torch.LongTensor(dataset_x), torch.LongTensor(dataset_y)

# build the model
def build_model(vocab, char_dim, sentence_length):
    model = TorchModel(char_dim, sentence_length, vocab)
    return model

# evaluation code: measure the model's accuracy after each epoch
def evaluate(model, vocab, sample_length):
    model.eval()
    total = 200  # number of test samples
    x, y = build_dataset(total, vocab, sample_length)   # build 200 test samples
    y = y.squeeze()
    print("class A: %d, class B: %d, class C: %d" % (y.tolist().count(0), y.tolist().count(1), y.tolist().count(2)))
    correct, wrong = 0, 0
    with torch.no_grad():
        y_pred = model(x)                # model predictions
        for y_p, y_t in zip(y_pred, y):  # compare with the gold labels
            if int(torch.argmax(y_p)) == int(y_t):
                correct += 1   # prediction correct
            else:
                wrong += 1
    print("correct: %d / %d, accuracy: %f" % (correct, total, correct / (correct + wrong)))
    return correct / (correct + wrong)

def main():
    epoch_num = 15        # number of training epochs
    batch_size = 20       # samples per batch
    train_sample = 1000   # samples per epoch
    char_dim = 768        # per-character dimension (BERT hidden size)
    sentence_length = 6   # sample text length
    vocab = build_vocab()                                    # build the vocabulary
    model = build_model(vocab, char_dim, sentence_length)    # build the model
    optim = torch.optim.Adam(model.parameters(), lr=1e-5)    # optimizer
    log = []
    for epoch in range(epoch_num):
        model.train()
        watch_loss = []
        for batch in range(int(train_sample / batch_size)):
            x, y = build_dataset(batch_size, vocab, sentence_length)  # build one training batch
            optim.zero_grad()    # reset gradients
            loss = model(x, y)   # compute the loss
            loss.backward()      # back-propagate
            optim.step()         # update the weights
            watch_loss.append(loss.item())
        print("=========\nepoch %d, mean loss: %f" % (epoch + 1, np.mean(watch_loss)))
        acc = evaluate(model, vocab, sentence_length)   # evaluate this epoch
        log.append([acc, np.mean(watch_loss)])
    # plt.plot(range(len(log)), [l[0] for l in log])  # accuracy curve
    # plt.plot(range(len(log)), [l[1] for l in log])  # loss curve
    # plt.show()
    # save the model
    torch.save(model.state_dict(), "model.pth")
    # save the vocabulary
    writer = open("vocab.json", "w", encoding="utf8")
    writer.write(json.dumps(vocab, ensure_ascii=False, indent=2))
    writer.close()
    return

# final prediction
def predict(model_path, vocab_path, input_strings):
    char_dim = 768         # per-character dimension; must match training (BERT hidden size)
    sentence_length = 6    # sample text length
    vocab = json.load(open(vocab_path, "r", encoding="utf8"))
    model = build_model(vocab, char_dim, sentence_length)    # build the model
    model.load_state_dict(torch.load(model_path))            # load the trained weights
    x = []
    for input_string in input_strings:
        x.append([vocab[char] for char in input_string])     # serialize the input
    model.eval()              # evaluation mode: no dropout
    with torch.no_grad():     # no gradient computation
        result = model.forward(torch.LongTensor(x))  # predict
    for i, input_string in enumerate(input_strings):
        print(int(torch.argmax(result[i])), input_string, result[i])  # print the results


if __name__ == "__main__":
    main()
    # test_strings = ["juvaee", "yrwfrg", "rtweqg", "nlhdww"]
    # predict("model.pth", "vocab.json", test_strings)
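The three classes produced by build_sample follow a simple rule: any of "a/b/c" but none of "x/y/z" is class 0, the mirror case is class 1, and everything else (both kinds or neither) is class 2. A small illustrative check of that rule with hand-picked strings (the strings themselves are arbitrary):

# Quick illustrative check of the labeling rule used by build_sample above.
def expected_label(s):
    has_abc = bool(set("abc") & set(s))
    has_xyz = bool(set("xyz") & set(s))
    if has_abc and not has_xyz:
        return 0   # class A
    if has_xyz and not has_abc:
        return 1   # class B
    return 2       # class C: both kinds present, or neither

for s in ["apple", "xylem", "boxer", "hidden"]:   # hand-picked examples
    print(s, "->", expected_label(s))
# apple -> 0 (has 'a', no x/y/z), xylem -> 1, boxer -> 2 (has 'b' and 'x'), hidden -> 2 (neither)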

next:

import torch
import math
import numpy as np
from transformers import BertModel

'''
Re-implement the BERT forward pass with explicit matrix operations.
Model files can be downloaded from https://huggingface.co/models
'''

bert = BertModel.from_pretrained(r"F:\Desktop\work_space\pretrain_models\bert-base-chinese", return_dict=False)
state_dict = bert.state_dict()
bert.eval()
x = np.array([2450, 15486, 102, 2110])   # pretend this is a 4-character sentence
torch_x = torch.LongTensor([x])          # input in PyTorch form
sequence_output, pooler_output = bert(torch_x)
print(sequence_output.shape, pooler_output.shape)
# print(sequence_output, pooler_output)

print(bert.state_dict().keys())  # list all weight matrix names

# softmax normalization
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

# gelu activation
def gelu(x):
    return 0.5 * x * (1 + np.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * np.power(x, 3))))

class DiyBert:
    # takes the whole pretrained weight dictionary
    def __init__(self, state_dict):
        self.num_attention_heads = 12
        self.hidden_size = 768
        self.num_layers = 1        # note: this should match the layer count in the pretrained config.json
        self.load_weights(state_dict)

    def load_weights(self, state_dict):
        # embedding part
        self.word_embeddings = state_dict["embeddings.word_embeddings.weight"].numpy()
        self.position_embeddings = state_dict["embeddings.position_embeddings.weight"].numpy()
        self.token_type_embeddings = state_dict["embeddings.token_type_embeddings.weight"].numpy()
        self.embeddings_layer_norm_weight = state_dict["embeddings.LayerNorm.weight"].numpy()
        self.embeddings_layer_norm_bias = state_dict["embeddings.LayerNorm.bias"].numpy()
        self.transformer_weights = []
        # transformer part, one entry per layer
        for i in range(self.num_layers):
            q_w = state_dict["encoder.layer.%d.attention.self.query.weight" % i].numpy()
            q_b = state_dict["encoder.layer.%d.attention.self.query.bias" % i].numpy()
            k_w = state_dict["encoder.layer.%d.attention.self.key.weight" % i].numpy()
            k_b = state_dict["encoder.layer.%d.attention.self.key.bias" % i].numpy()
            v_w = state_dict["encoder.layer.%d.attention.self.value.weight" % i].numpy()
            v_b = state_dict["encoder.layer.%d.attention.self.value.bias" % i].numpy()
            attention_output_weight = state_dict["encoder.layer.%d.attention.output.dense.weight" % i].numpy()
            attention_output_bias = state_dict["encoder.layer.%d.attention.output.dense.bias" % i].numpy()
            attention_layer_norm_w = state_dict["encoder.layer.%d.attention.output.LayerNorm.weight" % i].numpy()
            attention_layer_norm_b = state_dict["encoder.layer.%d.attention.output.LayerNorm.bias" % i].numpy()
            intermediate_weight = state_dict["encoder.layer.%d.intermediate.dense.weight" % i].numpy()
            intermediate_bias = state_dict["encoder.layer.%d.intermediate.dense.bias" % i].numpy()
            output_weight = state_dict["encoder.layer.%d.output.dense.weight" % i].numpy()
            output_bias = state_dict["encoder.layer.%d.output.dense.bias" % i].numpy()
            ff_layer_norm_w = state_dict["encoder.layer.%d.output.LayerNorm.weight" % i].numpy()
            ff_layer_norm_b = state_dict["encoder.layer.%d.output.LayerNorm.bias" % i].numpy()
            self.transformer_weights.append([q_w, q_b, k_w, k_b, v_w, v_b, attention_output_weight, attention_output_bias,
                                             attention_layer_norm_w, attention_layer_norm_b, intermediate_weight, intermediate_bias,
                                             output_weight, output_bias, ff_layer_norm_w, ff_layer_norm_b])
        # pooler layer
        self.pooler_dense_weight = state_dict["pooler.dense.weight"].numpy()
        self.pooler_dense_bias = state_dict["pooler.dense.bias"].numpy()

    # bert embedding: the sum of three embeddings, followed by a layer norm
    def embedding_forward(self, x):
        # x.shape = [max_len]
        we = self.get_embedding(self.word_embeddings, x)  # shape: [max_len, hidden_size]
        # position embedding input: [0, 1, 2, 3]
        pe = self.get_embedding(self.position_embeddings, np.array(list(range(len(x)))))  # shape: [max_len, hidden_size]
        # token type embedding; for a single segment it is [0, 0, 0, 0]
        te = self.get_embedding(self.token_type_embeddings, np.array([0] * len(x)))  # shape: [max_len, hidden_size]
        embedding = we + pe + te
        # layer norm after the sum
        embedding = self.layer_norm(embedding, self.embeddings_layer_norm_weight, self.embeddings_layer_norm_bias)  # shape: [max_len, hidden_size]
        return embedding

    # an embedding layer is just index lookup, i.e. a one-hot input times the embedding matrix
    def get_embedding(self, embedding_matrix, x):
        return np.array([embedding_matrix[index] for index in x])

    # run all transformer layers
    def all_transformer_layer_forward(self, x):
        for i in range(self.num_layers):
            x = self.single_transformer_layer_forward(x, i)
        return x

    # run a single transformer layer
    def single_transformer_layer_forward(self, x, layer_index):
        weights = self.transformer_weights[layer_index]
        # unpack the layer's parameters; in practice these start out random and are learned during pretraining
        q_w, q_b, \
        k_w, k_b, \
        v_w, v_b, \
        attention_output_weight, attention_output_bias, \
        attention_layer_norm_w, attention_layer_norm_b, \
        intermediate_weight, intermediate_bias, \
        output_weight, output_bias, \
        ff_layer_norm_w, ff_layer_norm_b = weights
        # self-attention
        attention_output = self.self_attention(x,
                                               q_w, q_b,
                                               k_w, k_b,
                                               v_w, v_b,
                                               attention_output_weight, attention_output_bias,
                                               self.num_attention_heads,
                                               self.hidden_size)
        # layer norm with a residual connection
        x = self.layer_norm(x + attention_output, attention_layer_norm_w, attention_layer_norm_b)
        # feed-forward
        feed_forward_x = self.feed_forward(x,
                                           intermediate_weight, intermediate_bias,
                                           output_weight, output_bias)
        # layer norm with a residual connection
        x = self.layer_norm(x + feed_forward_x, ff_layer_norm_w, ff_layer_norm_b)
        return x

    # self-attention computation
    def self_attention(self,
                       x,
                       q_w,
                       q_b,
                       k_w,
                       k_b,
                       v_w,
                       v_b,
                       attention_output_weight,
                       attention_output_bias,
                       num_attention_heads,
                       hidden_size):
        # x.shape = max_len * hidden_size
        # q_w, k_w, v_w  shape = hidden_size * hidden_size
        # q_b, k_b, v_b  shape = hidden_size
        q = np.dot(x, q_w.T) + q_b  # shape: [max_len, hidden_size]    W * X + B, a linear layer
        k = np.dot(x, k_w.T) + k_b  # shape: [max_len, hidden_size]
        v = np.dot(x, v_w.T) + v_b  # shape: [max_len, hidden_size]
        attention_head_size = int(hidden_size / num_attention_heads)
        # q.shape = num_attention_heads, max_len, attention_head_size
        q = self.transpose_for_scores(q, attention_head_size, num_attention_heads)
        # k.shape = num_attention_heads, max_len, attention_head_size
        k = self.transpose_for_scores(k, attention_head_size, num_attention_heads)
        # v.shape = num_attention_heads, max_len, attention_head_size
        v = self.transpose_for_scores(v, attention_head_size, num_attention_heads)
        # qk.shape = num_attention_heads, max_len, max_len
        qk = np.matmul(q, k.swapaxes(1, 2))
        qk /= np.sqrt(attention_head_size)
        qk = softmax(qk)
        # qkv.shape = num_attention_heads, max_len, attention_head_size
        qkv = np.matmul(qk, v)
        # qkv.shape = max_len, hidden_size
        qkv = qkv.swapaxes(0, 1).reshape(-1, hidden_size)
        # attention.shape = max_len, hidden_size
        attention = np.dot(qkv, attention_output_weight.T) + attention_output_bias
        return attention

    # multi-head reshape
    def transpose_for_scores(self, x, attention_head_size, num_attention_heads):
        # hidden_size = 768, num_attention_heads = 12, attention_head_size = 64
        max_len, hidden_size = x.shape
        x = x.reshape(max_len, num_attention_heads, attention_head_size)
        x = x.swapaxes(1, 0)  # output shape = [num_attention_heads, max_len, attention_head_size]
        return x

    # feed-forward network
    def feed_forward(self,
                     x,
                     intermediate_weight,  # intermediate_size, hidden_size
                     intermediate_bias,    # intermediate_size
                     output_weight,        # hidden_size, intermediate_size
                     output_bias,          # hidden_size
                     ):
        # output shape: [max_len, intermediate_size]
        x = np.dot(x, intermediate_weight.T) + intermediate_bias
        x = gelu(x)
        # output shape: [max_len, hidden_size]
        x = np.dot(x, output_weight.T) + output_bias
        return x

    # layer normalization
    def layer_norm(self, x, w, b):
        x = (x - np.mean(x, axis=1, keepdims=True)) / np.std(x, axis=1, keepdims=True)
        x = x * w + b
        return x

    # pooler layer applied to the [cls] token output
    def pooler_output_layer(self, x):
        x = np.dot(x, self.pooler_dense_weight.T) + self.pooler_dense_bias
        x = np.tanh(x)
        return x

    # full forward pass
    def forward(self, x):
        x = self.embedding_forward(x)
        sequence_output = self.all_transformer_layer_forward(x)
        pooler_output = self.pooler_output_layer(sequence_output[0])
        return sequence_output, pooler_output


# hand-rolled
db = DiyBert(state_dict)
diy_sequence_output, diy_pooler_output = db.forward(x)
# torch
torch_sequence_output, torch_pooler_output = bert(torch_x)

print(diy_sequence_output)
print(torch_sequence_output)
# print(diy_pooler_output)
# print(torch_pooler_output)
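Because both forward passes share the same weights, the hand-rolled output can be compared numerically against the transformers output. A small comparison sketch; the numbers only line up when DiyBert.num_layers matches the layer count of the loaded checkpoint (12 for a standard bert-base-chinese), so with num_layers = 1 a sizeable gap is expected:

# Rough numerical comparison of the two implementations.
diff = np.abs(diy_sequence_output - torch_sequence_output[0].detach().numpy())
print("max abs diff:", diff.max())
print("mean abs diff:", diff.mean())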

next:

import torch
import math
import numpy as np
from transformers import BertModel
from transformers import BertTokenizer
'''
About the tokenizer bundled with transformers.
Model files can be downloaded from https://huggingface.co/models
'''

# bert = BertModel.from_pretrained(r"F:\Desktop\work_space\pretrain_models\bert-base-chinese", return_dict=False)
tokenizer = BertTokenizer.from_pretrained(r"F:\Desktop\work_space\pretrain_models\bert-base-chinese")

string = "咱呀么老百姓今儿个真高兴"
# tokenize into characters
tokens = tokenizer.tokenize(string)
print("tokens:", tokens)
# encode; [CLS] and [SEP] are added automatically: [CLS] string [SEP]
encoding = tokenizer.encode(string)
print("encoding:", encoding)
# encoding a sentence pair: [CLS] string1 [SEP] string2 [SEP]
string1 = "今天天气真不错"
string2 = "明天天气怎么样"
encoding = tokenizer.encode(string1, string2)
print("sentence-pair encoding:", encoding)
# encode_plus also returns the attention_mask and token_type_ids
encoding = tokenizer.encode_plus(string1, string2)
print("full encoding:", encoding)
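The mapping is reversible with decode, and recent transformers versions (3.x and later) also allow calling the tokenizer object directly for batch encoding with padding. A short sketch, assuming the same tokenizer as above:

# decode maps ids back to text (including the [CLS]/[SEP] markers).
print("decoded:", tokenizer.decode(encoding["input_ids"]))

# Batch encoding with padding to a fixed length; returns input_ids,
# token_type_ids, and attention_mask for each sentence.
batch = tokenizer([string1, string2], padding="max_length", max_length=16)
print("batch input_ids:", batch["input_ids"])
print("batch attention_mask:", batch["attention_mask"])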
