T06_RNN示例
加载数据
以IMDB影评数据为例,加载评论
# Imports for the IMDB sentiment-analysis example.
# BUG FIX: the original fused "import warnings" and the filterwarnings call
# onto one line ("import warningswarnings...."), which is a SyntaxError.
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow import data

# Silence noisy deprecation/user warnings from pandas and TensorFlow.
warnings.filterwarnings('ignore')
读取影评数据
# Load the labelled training reviews and the unlabelled test reviews.
# NOTE(review): paths are machine-specific — adjust before running.
# BUG FIX: sep=r'\t' passes a literal backslash-t, which pandas treats as a
# regex separator and falls back to the slow python engine; sep='\t' is the
# actual tab character.
df_train = pd.read_csv(r'D:\WebDownload\IMDB影评数据集\labeledTrainData.tsv', sep='\t')
df_test = pd.read_csv(r'D:\WebDownload\IMDB影评数据集\testData.tsv', sep='\t')

# The test set carries its IMDB rating inside the id field as "<docid>_<rating>".
# Extract the rating and label ratings >= 7 as positive (1), otherwise 0.
df_test['sentiment'] = df_test.id
df_test.sentiment = df_test.sentiment.map(lambda x: x.replace("\"", "").split('_')[1])
df_test.sentiment = df_test.sentiment.map(lambda x: 1 if int(x) >= 7 else 0)

# Strip the HTML line-break tags left over from scraping.
df_test.review = df_test.review.map(lambda x: x.replace("<br />", ""))
df_train.review = df_train.review.map(lambda x: x.replace("<br />", ""))
制作单词表
根据评论中的单词制作单词表,然后将评论转换为单词编码
# Build a vocabulary from the training reviews (top 1000 words by frequency)
# and convert every review into a sequence of integer word indices.
# BUG FIX: the original fused the import with the Tokenizer construction and
# fit_on_texts with texts_to_sequences on single lines (SyntaxErrors); also
# corrects the local "tokernizer" typo.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(df_train.review.tolist())
df_train_review = tokenizer.texts_to_sequences(df_train.review.tolist())
df_test_review = tokenizer.texts_to_sequences(df_test.review.tolist())
训练模型
数据
将前面的数据转换为tf.data.Dataset
对于长度参差不齐的句子,人为设置一个阈值,对大于此长度的句子,选择截断部分单词,可以选择截去句首单词,也可以截去句末单词;对于小于此长度的句子,可以选择在句首或句尾填充。
# Build the input pipelines: pad/truncate every review to 80 tokens, pair the
# sequences with their labels, shuffle the training set, and batch with
# drop_remainder=True so every batch holds exactly 128 examples (the custom
# RNN below allocates a fixed 128-row initial state).
train_pairs = (pad_sequences(df_train_review, maxlen=80), df_train.sentiment)
test_pairs = (pad_sequences(df_test_review, maxlen=80), df_test.sentiment)
db_train = data.Dataset.from_tensor_slices(train_pairs).shuffle(1000).batch(128, drop_remainder=True)
db_test = data.Dataset.from_tensor_slices(test_pairs).batch(128, drop_remainder=True)
网络模型
class MyRNN(keras.Model):def __init__(self,units):'''params:units RNN 状态向量长度 '''super(MyRNN,self).__init__()self.stat0=[tf.zeros([128,units]),tf.zeros([128,units])]self.stat1=[tf.zeros([128,units]),tf.zeros([128,units])]self.embeding = keras.layers.Embedding(1000,100,input_length=80)self.rnn_cell0 = keras.layers.LSTMCell(units,dropout=0.5)self.rnn_cell1 = keras.layers.LSTMCell(units,dropout=0.5)self.out_layer = keras.Sequential([keras.layers.Dense(units),keras.layers.Dropout(rate=0.5),keras.layers.ReLU(),keras.layers.Dense(1)])def call(self,inputs,training=None):X = inputsx=self.embeding(X)state0 = self.stat0state1 = self.stat1for word in tf.unstack(x,axis=1):out0,state0 = self.rnn_cell0(word,state0,training)out1,state1 = self.rnn_cell0(word,state1,training)x = self.out_layer(out1,training)prob = tf.sigmoid(x)return probmodel = MyRNN(64)
# Train with RMSprop and binary cross-entropy (labels are 0/1 sentiment).
model.compile(
    optimizer=keras.optimizers.RMSprop(0.001),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
model.fit(db_train, epochs=20)
经过20轮的训练,精度达到了0.916,测试精度达到0.841
推荐使用keras的高级接口:
# Recommended: the same model via Keras' high-level LSTM layers.
# BUG FIX: the original listed `keras.layer.Flatten()` after the Embedding —
# `keras.layer` is a typo (AttributeError), and even spelled correctly a
# Flatten would collapse the (batch, timesteps, features) tensor to 2-D,
# which the LSTM cannot consume. The layer is removed.
model = keras.Sequential([
    keras.layers.Embedding(1000, 100, input_length=80),
    keras.layers.LSTM(64, return_sequences=True),  # full sequence out, for stacking
    keras.layers.LSTM(64),                         # final state only
    keras.layers.Dense(64, activation='relu'),
    keras.layers.Dropout(rate=0.5),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=keras.optimizers.RMSprop(0.001),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
model.fit(db_train, epochs=20)
但上面代码执行的时要注意,如果是A卡环境,不能使用GPU加速,会报错(原因大概为:tensorflow检测到有GPU,LSTM/GRU底层默认会调用CudnnRNNv2,N卡环境才有CudnnRNNv2)。
使用预训练的词向量
使用预训练的 Word Embedding 模型来得到单词的表示方法,基于预训练模型的词向量相当于迁移了整个语义空间的知识,可以有效的缩短训练时间,还可以提高性能。
加载GloVe词向量
# Load GloVe vectors
def load_glove_vectors(glove_file):
    """Parse a GloVe text file into a vocabulary and an embedding lookup.

    Each line of the file has the form "<word> <v1> <v2> ... <vd>".

    Args:
        glove_file: path to a GloVe vectors file (e.g. glove.6B.100d.txt).

    Returns:
        (words, word_to_vec_map): the set of all words, and a dict mapping
        each word to its float32 numpy vector.
    """
    words = set()
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if not parts:
                # Tolerate stray blank lines (the original raised IndexError).
                continue
            curr_word = parts[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(parts[1:], dtype=np.float32)
    return words, word_to_vec_map


# NOTE(review): machine-specific path — adjust before running.
words, word_to_vec_map = load_glove_vectors(r'D:\dwload\glove\glove.6B.100d.txt')
print("词汇量大小:", len(words))
print("向量维度:", word_to_vec_map['word'].shape)  # vector dimensionality of one sample word
构建Tokenizer
# Re-tokenize with a larger vocabulary (top 10,000 words) for the
# pretrained-embedding experiment.
# BUG FIX: the original fused the word_index assignment, the import, and the
# Dataset construction onto single lines (SyntaxErrors).
tokenizer = keras.preprocessing.text.Tokenizer(10000)
tokenizer.fit_on_texts(df_train.review.tolist())
imdn_word_index = tokenizer.word_index

from tensorflow.keras.preprocessing.sequence import pad_sequences

df_train_review = tokenizer.texts_to_sequences(df_train.review.tolist())
df_test_review = tokenizer.texts_to_sequences(df_test.review.tolist())

# Pad/truncate to 100 tokens and rebuild the tf.data pipelines; batches of
# exactly 128 (drop_remainder=True) match the model's fixed initial state.
db_train = tf.data.Dataset.from_tensor_slices(
    (pad_sequences(df_train_review, maxlen=100), df_train.sentiment.tolist()))
db_test = tf.data.Dataset.from_tensor_slices(
    (pad_sequences(df_test_review, maxlen=100), df_test.sentiment.tolist()))
db_train = db_train.shuffle(1000).batch(128, drop_remainder=True)
db_test = db_test.batch(128, drop_remainder=True)
嵌入矩阵
根据GloVe向量和tokenizer的词索引,构建embedding_matrix
# Build the embedding matrix: row i holds the GloVe vector for the word with
# tokenizer index i. Rows for out-of-vocabulary words — and row 0, which
# Keras reserves for padding — stay all-zero.
num_words = min(10000, len(imdn_word_index)) + 1
embedding_matrix = np.zeros((num_words, 100))  # 100 = GloVe vector dimension
for word, i in imdn_word_index.items():
    if i > 10000:
        # The original used `break`, relying on word_index being ordered by
        # frequency (ascending indices); `continue` yields the same matrix
        # without depending on that ordering.
        continue
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
模型
这里对 embedding 层进行修改,使用预训练的词向量;设置 trainable=False,表示后续的训练过程不再对 embedding 层进行训练。
class MyRNN(keras.Model):
    """Two-layer GRU sentiment classifier on frozen pretrained GloVe embeddings.

    Args:
        units: length of the RNN state vector.
    """

    def __init__(self, units):
        super(MyRNN, self).__init__()
        # Initial state per cell; fixed batch size 128 (drop_remainder=True).
        # NOTE(review): GRUCell keeps a single hidden state, so only the
        # first tensor of each pair is used — confirm before simplifying.
        self.stat0 = [tf.zeros([128, units]), tf.zeros([128, units])]
        self.stat1 = [tf.zeros([128, units]), tf.zeros([128, units])]
        # Pretrained GloVe weights; trainable=False freezes the embedding so
        # training only updates the RNN cells and the output head.
        self.embeding = keras.layers.Embedding(
            num_words, 100,
            weights=[embedding_matrix],
            input_length=100,
            trainable=False)
        self.rnn_cell0 = keras.layers.GRUCell(units)  # GRUCell instead of LSTMCell
        self.rnn_cell1 = keras.layers.GRUCell(units)
        self.out_layer = keras.Sequential([
            keras.layers.Dense(units),
            keras.layers.ReLU(),
            keras.layers.Dense(1),
        ])

    def call(self, inputs, training=None):
        x = self.embeding(inputs)
        state0 = self.stat0
        state1 = self.stat1
        # Unroll over the time axis, feeding each timestep through both cells.
        for word in tf.unstack(x, axis=1):
            out0, state0 = self.rnn_cell0(word, state0, training)
            # BUG FIX: the original called self.rnn_cell0 again with the raw
            # embedding, so rnn_cell1 was never used and the layers were not
            # stacked; the second layer must consume the first layer's output.
            out1, state1 = self.rnn_cell1(out0, state1, training)
        x = self.out_layer(out1, training=training)
        prob = tf.sigmoid(x)
        return prob


model = MyRNN(64)
# Compile, train for 30 epochs, then report test-set performance.
optimizer = keras.optimizers.RMSprop(0.001)
loss_fn = keras.losses.BinaryCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])
model.fit(db_train, epochs=30)
model.evaluate(db_test)
通过使用预训练词向量,训练速度明显提升;经过 30 轮的训练,训练精度达到了 0.977,测试精度 0.832(存在过拟合)。