首先感谢唐宇迪课程https://edu.csdn.net/course/detail/3921/68734?auto_start=1
序列生成首先要做好数据预处理:第一步是将文本数据转化为数值数据,可以使用word2vec训练词向量模型。与以往文本分类需要所有文本长度保持一致不同,seq2seq只需要同一个batch内的sequence_length保持一致,不同batch之间可以不一致。接着编写encoder层和decoder层,encoder层的输出是中间向量。有的模型让这一中间向量参与decoder层每一步的训练,也有模型只将该向量作为decoder第一步的输入。decoder层则包括训练和预测两个功能,预测的时候需要加起始符[start](即<GO>)和结束符[eos](即<EOS>);此外还引入了注意力机制,模型的输入也采取倒序输入,这些都有利于提高模型的生成效果。encoder的输入为sequence+<EOS>,decoder的输入(目标句)为<GO>+target_sequence
import pandas as pd
import re
import numpy as np
import tensorflow as tf
import time
# Load the raw review table. A raw string keeps the Windows backslashes
# literal instead of relying on unrecognised escapes such as "\D" and "\R".
filename = r'E:\DataSets\Reviews.csv\Reviews.csv'
reviews = pd.read_csv(filename)
# print(reviews.isnull().sum())
# Drop every row that contains a missing value.
reviews = reviews.dropna()
# Drop the columns that are not needed. ``axis`` is passed by keyword because
# the positional form is rejected by recent pandas releases.
reviews = reviews.drop(
    ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time'],
    axis=1)
# Re-number the index after rows were removed.
reviews = reviews.reset_index(drop=True)
# print(reviews.head()) shows the first remaining rows.
# Lookup table that expands lower-case English contractions into full words.
# Keys are matched against whole whitespace-separated tokens by clean_text().
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    # Was "could not have", which is the expansion of "couldn't've".
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what've": "what have",
    "what'd": "what did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}
# Text normalisation: lower-case, expand contractions, strip URLs / HTML
# remnants / punctuation, optionally drop stop words. The result looks like
# 'i want to rock you'.
_STOPWORDS_FILE = "Englishstopwords.txt"
_stopwords_cache = None


def _load_stopwords():
    """Return the stop words from ``Englishstopwords.txt`` as a frozenset, reading the file once."""
    global _stopwords_cache
    if _stopwords_cache is None:
        # Splitting on any whitespace accepts both a single space-separated
        # line and a one-word-per-line file; ``with`` closes the handle.
        with open(_STOPWORDS_FILE, 'r') as handle:
            _stopwords_cache = frozenset(handle.read().split())
    return _stopwords_cache


def clean_text(text, remove_stopwords=True):
    """Return a cleaned, lower-cased version of *text*.

    Args:
        text: Raw string to clean.
        remove_stopwords: When true, tokens listed in ``Englishstopwords.txt``
            are removed and the remaining tokens are joined by single spaces.
            When false the stop-word file is not read at all.

    Returns:
        The cleaned string. With ``remove_stopwords=False`` runs of spaces
        left behind by the substitutions are kept as they are.
    """
    # Expand contractions token by token; joining also flattens all
    # whitespace (including newlines) to single spaces.
    tokens = [contractions.get(word, word) for word in text.lower().split()]
    text = " ".join(tokens)

    # The text is a single line at this point, so a URL must end at the next
    # whitespace; matching ".*" would delete everything after the URL.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\<a href', ' ', text)
    # "<br />" has to go before the punctuation pass, which strips the "/"
    # and would otherwise leave "<br" and ">" behind as tokens.
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    if remove_stopwords:
        stops = _load_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)
    return text
# clean_summaries and clean_texts are lists of cleaned strings, e.g.
# ['i want to rock you', 'you will win the championship', ...].
# Summaries keep their stop words; review texts have them removed.
clean_summaries = [clean_text(summary, remove_stopwords=False) for summary in reviews.Summary]
print("Summaries are completed")
clean_texts = [clean_text(text, remove_stopwords=True) for text in reviews.Text]
print("Texts are completed")
# Build a word-frequency dictionary such as {'many': 897, 'hate': 234, ...}.
def count_words(count_dict, text):
    """Add the token frequencies of every sentence in *text* to *count_dict*.

    Args:
        count_dict: Mapping of word -> count; updated in place.
        text: Iterable of strings; each one is split on whitespace.

    Returns:
        None. The result is left in *count_dict*.
    """
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1
# word_counts maps every distinct word to its frequency across both the
# cleaned summaries and the cleaned texts.
word_counts = {}
for corpus in (clean_summaries, clean_texts):
    count_words(word_counts, corpus)
print("Size of Vocabulary :", len(word_counts))
# Load the pre-trained vectors into a dict of the form
# {'a': <vector>, 'money': <vector>, ...}. Each line is split on single
# spaces: the first field is the word, the rest are parsed as float32.
embeddings_index = {}
with open(r'E:\word2vecmodel\numberbatch-en-17.04b.txt', 'r', encoding='utf8') as handle:
    for line in handle:
        word, *vector = line.split(' ')
        embeddings_index[word] = np.asarray(vector, dtype='float32')
# Words seen fewer than ``threshold`` times are not worth keeping.
threshold = 20
# Count the frequent words that have no pre-trained vector. The comparison is
# ">=" so it agrees with the vocabulary filter, which also uses ">=".
missing_words = sum(
    1
    for word, count in word_counts.items()
    if count >= threshold and word not in embeddings_index
)
# Scale to a percentage before rounding so no float noise is printed, and
# avoid dividing by zero when no words were counted.
missing_ratio = round(100 * missing_words / len(word_counts), 2) if word_counts else 0.0
print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))
# Map words to consecutive integer ids. A word is kept when it is frequent
# enough or when a pre-trained vector exists for it.
vocab_to_int = {}
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = len(vocab_to_int)
# Special tokens take the ids directly after the regular words.
codes = ["<UNK>", "<PAD>", "<EOS>", "<GO>"]
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)
# Reverse mapping: integer id -> word.
int_to_vocab = {value: word for word, value in vocab_to_int.items()}
# Scale to a percentage before rounding so no float noise is printed, and
# avoid dividing by zero when no words were counted.
usage_ratio = round(100 * len(vocab_to_int) / len(word_counts), 2) if word_counts else 0.0
print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))
embedding_dim = 300
nb_words = len(vocab_to_int)
# word_embedding_matrix has shape (nb_words, embedding_dim); row i holds the
# vector of the word whose id is i.
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word not in embeddings_index:
        # No pre-trained vector: draw one uniformly from [-1, 1) and remember
        # it in embeddings_index as well.
        embeddings_index[word] = np.random.uniform(-1.0, 1.0, embedding_dim)
    word_embedding_matrix[i] = embeddings_index[word]
# Turn sentences into lists of word ids:
# [[1, 234, 7687, 23, ...], [345, 908, 2359, 11234, ...], ...]
def convert_to_ints(text, word_count, unk_count, eos=False):
    """Encode every sentence in *text* with ``vocab_to_int``.

    Args:
        text: Iterable of strings; each one is split on whitespace.
        word_count: Running total of tokens seen; incremented per token.
        unk_count: Running total of tokens replaced by ``<UNK>``.
        eos: When true, the ``<EOS>`` id is appended to every sentence.

    Returns:
        ``(ints, word_count, unk_count)`` where ``ints`` is a list with one
        list of ids per sentence and the counters are the updated totals.
    """
    ints = []
    for sentence in text:
        encoded = []
        for token in sentence.split():
            word_count += 1
            token_id = vocab_to_int.get(token)
            if token_id is None:
                # Out-of-vocabulary token.
                token_id = vocab_to_int['<UNK>']
                unk_count += 1
            encoded.append(token_id)
        if eos:
            encoded.append(vocab_to_int['<EOS>'])
        ints.append(encoded)
    return ints, word_count, unk_count
# int_summaries and int_texts are lists of id lists:
# [[1, 234, 7687, 23, ...], [345, 908, 2359, 11234, ...], ...]
# Both counters start at zero and are threaded through the two calls, so they
# end up as totals over summaries and texts. Only the texts receive <EOS>.
int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, 0, 0)
int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)
def create_lengths(text):
    """Return a one-column DataFrame (``counts``) holding ``len()`` of each item in *text*."""
    return pd.DataFrame([len(sentence) for sentence in text], columns=['counts'])
lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)
# Print the 90th/95th/99th percentile of the sequence lengths: first for the
# texts, then for the summaries.
for lengths in (lengths_texts, lengths_summaries):
    for percentile in (90, 95, 99):
        print(np.percentile(lengths.counts, percentile))
# Count <UNK> ids; used in the next step to select usable training pairs.
def unk_counter(setence):
    """Return how many items of *setence* equal the ``<UNK>`` id in ``vocab_to_int``."""
    return sum(1 for word in setence if word == vocab_to_int['<UNK>'])
max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0
# Select the (summary, text) pairs that respect the length and <UNK> limits
# in a single pass over the corpus, instead of rescanning it once per length.
# The text checks are inclusive like the summary checks, so a text of exactly
# ``min_length`` ids or with exactly ``unk_text_limit`` <UNK> ids is kept.
# ``max_text_length`` stays exclusive, as in the original ``range()`` bound.
kept_pairs = [
    (summary_ids, text_ids)
    for summary_ids, text_ids in zip(int_summaries, int_texts)
    if min_length <= len(summary_ids) <= max_summary_length
    and min_length <= len(text_ids) < max_text_length
    and unk_counter(summary_ids) <= unk_summary_limit
    and unk_counter(text_ids) <= unk_text_limit
]
# Stable sort by text length: pairs of equal length keep corpus order, which
# is the order the nested length/index loops produced.
kept_pairs.sort(key=lambda pair: len(pair[1]))
sorted_summaries = [summary_ids for summary_ids, _ in kept_pairs]
sorted_texts = [text_ids for _, text_ids in kept_pairs]
# At this point the texts and summaries are pre-processed and sorted by
# ascending text length.
# Define the placeholders that feed the model.
def model_inputs():
    """Create the graph inputs.

    Returns:
        ``(input_data, targets, lr, keep_prob, summary_length,
        max_summary_length, text_length)``. All are placeholders except
        ``max_summary_length``, which is the ``reduce_max`` of
        ``summary_length``.
    """
    # Rank-2 int32 inputs with both dimensions dynamic
    # (presumably batch x sequence length; confirm against the feed).
    input_data = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input')
    targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name='targets')
    # Scalars fed at run time: learning rate and dropout keep probability.
    lr = tf.placeholder(dtype=tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')
    # Rank-1 int32 vector of summary lengths and its maximum.
    summary_length = tf.placeholder(dtype=tf.int32, shape=(None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    # Rank-1 int32 vector of text lengths.
    text_length = tf.placeholder(dtype=tf.int32, shape=(None,), name='text_length')
    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length
# Prepend <GO> to every target sequence of the batch.
def process_encoding_input(target_data, vocab_to_int, batch_size):
    """Build the decoder input from the targets (the summaries).

    The last column of *target_data* is sliced off and a column filled with
    the ``<GO>`` id is concatenated in front along axis 1.

    Args:
        target_data: Rank-2 tensor of target ids.
        vocab_to_int: Mapping that contains the ``'<GO>'`` id.
        batch_size: Number of rows to slice and to fill.

    Returns:
        The tensor ``[<GO> column, target_data without its last column]``.
    """
    without_last = tf.strided_slice(target_data, begin=[0, 0], end=[batch_size, -1], strides=[1, 1])
    go_column = tf.fill([batch_size, 1], vocab_to_int['<GO>'])
    return tf.concat([go_column, without_last], 1)
# Build the encoder.
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    """Build a stack of bidirectional LSTM layers.

    Each layer consumes the concatenated forward/backward outputs of the
    layer below it. Previously every layer read ``rnn_inputs`` directly, so
    all layers except the last were built but never contributed to the result.

    Args:
        rnn_size: Number of units of each forward and each backward cell.
        sequence_length: Per-example lengths passed to the dynamic RNN.
        num_layers: Number of bidirectional layers; must be at least 1.
        rnn_inputs: Embedded encoder input fed to the first layer.
        keep_prob: Input keep probability for the dropout wrappers.

    Returns:
        ``(enc_output, enc_state)``: the forward and backward outputs of the
        top layer concatenated on axis 2, and the top layer's
        ``(forward_state, backward_state)`` pair.

    Raises:
        ValueError: If ``num_layers`` is smaller than 1.
    """
    if num_layers < 1:
        raise ValueError("num_layers must be at least 1")
    layer_input = rnn_inputs
    enc_state = None
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, input_keep_prob=keep_prob)
            cell_bw = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, input_keep_prob=keep_prob)
            layer_outputs, enc_state = tf.nn.bidirectional_dynamic_rnn(
                cell_fw, cell_bw, layer_input, sequence_length, dtype=tf.float32)
            # Join both directions; this feeds the next layer and, after the
            # last iteration, is the encoder output.
            layer_input = tf.concat(layer_outputs, 2)
    return layer_input, enc_state
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, vocab_size, max_summary_length):
    """Build the decoder used for training.

    Args:
        dec_embed_input: Embedded decoder inputs fed step by step.
        summary_length: Per-example target lengths.
        dec_cell: Decoder RNN cell.
        initial_state: Initial state of ``dec_cell``.
        output_layer: Layer applied to the cell outputs.
        vocab_size: Unused here; kept for the existing call signature.
        max_summary_length: Upper bound on the number of decoding steps.

    Returns:
        The first element returned by ``dynamic_decode`` (the final outputs).
    """
    # The helper feeds the ground-truth inputs, batch-major.
    helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=dec_embed_input,
        sequence_length=summary_length,
        time_major=False)
    decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, initial_state, output_layer)
    # dynamic_decode returns (final_outputs, final_state, final_sequence_lengths).
    final_outputs, _final_state, _final_lengths = tf.contrib.seq2seq.dynamic_decode(
        decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_summary_length)
    return final_outputs
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer, max_summary_length, batch_size):
    """Build the greedy decoder used for prediction (needs <GO> and <EOS>).

    Args:
        embeddings: Embedding matrix used to embed each predicted id.
        start_token: Id fed at the first step for every batch row.
        end_token: Id that ends decoding.
        dec_cell: Decoder RNN cell.
        initial_state: Initial state of ``dec_cell``.
        output_layer: Layer applied to the cell outputs.
        max_summary_length: Upper bound on the number of decoding steps.
        batch_size: Number of rows to decode.

    Returns:
        The first element returned by ``dynamic_decode`` (the final outputs).
    """
    # Repeat the start id once per batch row.
    start_tokens = tf.tile(
        tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_token')
    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, start_tokens, end_token)
    decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, helper, initial_state, output_layer)
    final_outputs, _final_state, _final_lengths = tf.contrib.seq2seq.dynamic_decode(
        decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_summary_length)
    return final_outputs
# Build the decoder.
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    """Build the attention decoder for training and for inference.

    Uses ``tf.contrib.seq2seq.AttentionWrapper`` (TF >= 1.2), which matches
    the three-value ``dynamic_decode`` unpacking in the training/inference
    decoder builders. The earlier ``DynamicAttentionWrapper`` and
    ``DynamicAttentionWrapperState`` names only exist in TF 1.1, and the
    state was constructed with a missing argument.

    Args:
        dec_embed_input: Embedded decoder inputs for training.
        embeddings: Embedding matrix used by the inference helper.
        enc_output: Encoder outputs that the attention mechanism reads.
        enc_state: Encoder final state; element 0 seeds the decoder cell.
        vocab_size: Number of units of the output projection.
        text_length: Per-example encoder lengths (attention memory lengths).
        summary_length: Per-example target lengths.
        max_summary_length: Upper bound on the number of decoding steps.
        rnn_size: Units of the decoder cell and of the attention layer.
        vocab_to_int: Mapping containing the ``'<GO>'`` and ``'<EOS>'`` ids.
        keep_prob: Input keep probability for the dropout wrapper.
        batch_size: Batch size used for the zero state and the start tokens.
        num_layers: Number of times the cell-construction loop runs.

    Returns:
        ``(training_logits, inference_logits)`` as returned by the two
        decoder builders.
    """
    # NOTE(review): every iteration rebinds ``dec_cell``, so only the cell of
    # the final iteration is used; the decoder is effectively single-layer.
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(lstm, input_keep_prob=keep_prob)
    # Projection onto the vocabulary. TODO: whether vocab_size should be
    # len(vocab_to_int) + 1 still needs to be clarified.
    output_layer = tf.layers.Dense(vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    # Attention over the encoder outputs.
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, text_length, normalize=False)
    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, attention_layer_size=rnn_size)
    # Start from a zero attention state whose cell state is the first element
    # of the encoder's final state.
    initial_state = dec_cell.zero_state(batch_size, tf.float32).clone(cell_state=enc_state[0])
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, vocab_size, max_summary_length)
    # Same scope with reuse=True so inference shares the training variables.
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings, vocab_to_int['<GO>'], vocab_to_int['<EOS>'], dec_cell, initial_state, output_layer, max_summary_length, batch_size)
    return training_logits, inference_logits
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
    """Wire embedding lookup, encoder and decoder together.

    Returns:
        ``(training_logits, inference_logits)`` from ``decoding_layer``.
    """
    # The full vocabulary matrix is used because any word may be predicted.
    embeddings = word_embedding_matrix

    # Encoder side.
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(
        rnn_size=rnn_size,
        sequence_length=text_length,
        num_layers=num_layers,
        rnn_inputs=enc_embed_input,
        keep_prob=keep_prob)

    # Decoder side: targets with <GO> prepended, then embedded.
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    training_logits, inference_logits = decoding_layer(
        dec_embed_input=dec_embed_input,
        embeddings=embeddings,
        enc_output=enc_output,
        enc_state=enc_state,
        vocab_size=vocab_size,
        text_length=text_length,
        summary_length=summary_length,
        max_summary_length=max_summary_length,
        rnn_size=rnn_size,
        vocab_to_int=vocab_to_int,
        keep_prob=keep_prob,
        batch_size=batch_size,
        num_layers=num_layers)
    return training_logits, inference_logits
# Padding helper.
def pad_sentence_batch(sentence_batch):
    """Right-pad every sentence with the ``<PAD>`` id to the longest length in the batch."""
    width = max(len(sentence) for sentence in sentence_batch)
    padded = []
    for sentence in sentence_batch:
        filler = [vocab_to_int['<PAD>']] * (width - len(sentence))
        padded.append(sentence + filler)
    return padded
def get_batches(summaries, texts, batch_size):
    """Yield padded batches of summaries and texts.

    Only full batches are produced; a trailing partial batch is dropped.

    Args:
        summaries: List of summary id lists.
        texts: List of text id lists, aligned with *summaries*.
        batch_size: Number of examples per batch.

    Yields:
        ``(pad_summaries_batch, pad_texts_batch, pad_summaries_lengths,
        pad_texts_lengths)``. The two batches are numpy arrays; the two
        length lists hold the row length of each PADDED row, so every entry
        equals the batch's maximum length.
    """
    for batch_i in range(len(texts) // batch_size):
        start_i = batch_i * batch_size
        stop_i = start_i + batch_size
        pad_summaries_batch = np.array(pad_sentence_batch(summaries[start_i:stop_i]))
        pad_texts_batch = np.array(pad_sentence_batch(texts[start_i:stop_i]))
        pad_summaries_lengths = [len(summary) for summary in pad_summaries_batch]
        # Text lengths go into their own list; they used to be appended to
        # pad_summaries_lengths, which left pad_texts_lengths empty.
        pad_texts_lengths = [len(text) for text in pad_texts_batch]
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths
# Hyper-parameters.
epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75
# Build the training graph.
train_graph = tf.Graph()
with train_graph.as_default():
    # model_inputs() returns seven values. The maximum-length tensor gets its
    # own name so it does not overwrite the integer max_summary_length.
    (input_data, targets, lr, keep_prob, summary_length,
     max_target_length, text_length) = model_inputs()
    # The encoder input is reversed along its last axis.
    training_logits, inference_logits = seq2seq_model(
        tf.reverse(input_data, [-1]), targets, keep_prob, text_length, summary_length,
        max_target_length, len(vocab_to_int), rnn_size, num_layers, vocab_to_int, batch_size)
    # Named tensors: 'logits' feeds the loss, 'predictions' holds the decoded
    # word ids and is looked up by name after restoring the checkpoint.
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    # Float mask built from the summary lengths, used as the loss weights.
    masks = tf.sequence_mask(summary_length, max_target_length, dtype=tf.float32, name='masks')
    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        # Use the fed placeholder, not the Python float, so the learning rate
        # supplied through the feed dict actually reaches the optimizer.
        optimizer = tf.train.AdamOptimizer(lr)
        gradients = optimizer.compute_gradients(cost)
        # Clip every gradient element to [-5, 5] to limit exploding gradients.
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built")
# Train on a fixed slice of the length-sorted data.
start = 200000
end = start + 5000
sorted_summaries_short = sorted_summaries[start:end]
sorted_texts_short = sorted_texts[start:end]
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20  # print the running loss every this many batches
stop_early = 0     # consecutive update checks without improvement
stop = 3           # stop training after this many such checks
per_epoch = 3      # intended number of update checks per epoch
# Clamp to 1 so a small data slice cannot produce a zero or negative modulus.
update_check = max(1, (len(sorted_texts_short) // batch_size // per_epoch) - 1)
update_loss = 0
batch_loss = 0
summary_update_loss = []
checkpoint = "best_model.ckpt"
batches_per_epoch = len(sorted_texts_short) // batch_size
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    # One Saver for the whole run; building a new one on every improvement
    # would keep adding save/restore ops to the graph.
    saver = tf.train.Saver()
    for epoch_i in range(1, epochs + 1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries_short, sorted_texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})
            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time
            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch{:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds:{:>4.2f}'.format(
                    epoch_i, epochs, batch_i,
                    batches_per_epoch,
                    batch_loss / display_step,
                    batch_time * display_step))
                # Start a new display window; otherwise the printed average
                # keeps accumulating over the whole epoch.
                batch_loss = 0
            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss / update_check, 3))
                summary_update_loss.append(update_loss)
                # Save the model whenever this window's loss is the lowest so far.
                if update_loss <= min(summary_update_loss):
                    print('New Record')
                    stop_early = 0
                    saver.save(sess, checkpoint)
                else:
                    print('No Improvement')
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
        # Decay the learning rate once per epoch, bounded below.
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        if stop_early == stop:
            print("Stopping Training")
            break
# Try the trained model.
def text_to_seq(text):
    """Clean *text* and return its word ids, using the ``<UNK>`` id for unknown words."""
    cleaned = clean_text(text)
    ids = []
    for word in cleaned.split():
        ids.append(vocab_to_int.get(word, vocab_to_int['<UNK>']))
    return ids
# Summarise one randomly chosen cleaned review with the saved model.
random = np.random.randint(0, len(clean_texts))
input_sentence = clean_texts[random]
text = text_to_seq(clean_texts[random])
checkpoint = './best_model.ckpt'
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Restore the graph structure and the trained weights.
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)
    # Look the tensors up by the names given when the graph was built.
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    # Was 'input:0', which fed 1.0 into the input placeholder instead.
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    # The graph was built for a fixed batch size, so the same text is
    # repeated batch_size times and only the first result is kept.
    answer_logits = sess.run(logits, {input_data: [text] * batch_size,
                                      summary_length: [np.random.randint(5, 8)],
                                      text_length: [len(text)] * batch_size,
                                      keep_prob: 1.0})[0]
# Padding ids are dropped from the printed summary.
pad = vocab_to_int["<PAD>"]
print('Original Text:', input_sentence)
print('\nText')
print('Word Ids: {}'.format([i for i in text]))
print('Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))
print("\nSummary")
print('Word Ids: {}'.format([i for i in answer_logits if i != pad]))
print('Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))