唐宇迪 Seq2Seq Code with Comments (TensorFlow 1.2)

First of all, thanks to 唐宇迪's course: https://edu.csdn.net/course/detail/3921/68734?auto_start=1

Sequence generation starts with solid data preprocessing. The first step is converting the text into numeric data; a word2vec-style model can be used to train the word embeddings. Unlike conventional text classification, where every input must be padded to one fixed length, seq2seq only requires the sequence_length to be consistent within a single batch_size; lengths may differ across batches. Next come the encoder and decoder layers: the encoder's output is an intermediate (context) vector. Some models feed this vector into every decoder step, while others only use it as the first-step input. The decoder has two roles, training and inference; inference requires the <GO> and <EOS> tokens. An attention mechanism is also introduced, and the encoder input is fed in reverse order; both help improve the quality of the generated output. Encoder input: sequence + <EOS>; decoder input for the target sentence: <GO> + target_sequence.
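To make the <GO>/<EOS> convention and the reversed encoder input concrete, here is a minimal illustrative sketch; the toy vocabulary and token ids below are made up for illustration and are not the vocab_to_int built later in this post:

# Illustrative sketch only: a toy vocabulary, not the real vocab_to_int built below
toy_vocab = {'<GO>': 0, '<EOS>': 1, 'i': 2, 'love': 3, 'tea': 4}
source_sequence = [2, 3, 4]                              # "i love tea"
target_sequence = [3, 4]                                 # toy summary "love tea"
encoder_input = source_sequence + [toy_vocab['<EOS>']]   # source sequence + <EOS>  -> [2, 3, 4, 1]
decoder_input = [toy_vocab['<GO>']] + target_sequence    # <GO> + target_sequence   -> [0, 3, 4]
encoder_input_reversed = encoder_input[::-1]             # feeding the source in reverse order can help training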

import pandas as pd
import re
import numpy as np
import tensorflow as tf
import time

filename = r'E:\DataSets\Reviews.csv\Reviews.csv'  # raw string so the backslashes are not treated as escape sequences
reviews = pd.read_csv(filename)
# print(reviews.isnull().sum())
# Drop rows with missing values
reviews = reviews.dropna()
# Drop the columns we do not need
reviews = reviews.drop(
    ['Id', 'ProductId', 'UserId', 'ProfileName', 'HelpfulnessNumerator', 'HelpfulnessDenominator', 'Score', 'Time'],
    axis=1)
# Reset the index after dropping rows
reviews = reviews.reset_index(drop=True)
# print(reviews.head())  # show the first few Text/Summary pairs after cleaning
# Dictionary for expanding English contractions
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what've": "what have",
    "what'd": "what did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}

# Clean the text: lowercase, expand contractions, strip markup and punctuation,
# and return a plain string such as 'i want to rock you'
def clean_text(text, remove_stopwords=True):
    text = text.lower()
    # Expand contractions word by word
    new_text = []
    for word in text.split():
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    # Join back into a single string
    text = " ".join(new_text)
    # Remove URLs, HTML remnants and special characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    if remove_stopwords:
        # Stopword list: a single space-separated line in Englishstopwords.txt
        with open("Englishstopwords.txt", 'r') as words:
            stop = words.readlines()
        stopwords = stop[0].split(" ")[0:-1]
        stops = set(stopwords)
        text = [w for w in text.split() if w not in stops]
        # Join back into a single string
        text = " ".join(text)
    return text


clean_summaries = []
# clean_summaries and clean_texts hold cleaned strings, e.g. ['i want to rock you', 'you will win the championship', ...]
for summary in reviews.Summary:
    clean_summaries.append(clean_text(summary, remove_stopwords=False))
print("Summaries are completed")
clean_texts = []
for text in reviews.Text:
    clean_texts.append(clean_text(text, remove_stopwords=True))
print("Texts are completed")

# Build a word-frequency dictionary, e.g. {'many': 897, 'hate': 234, ...}
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1


# word_counts maps each unique word to its frequency
word_counts = {}
count_words(word_counts, clean_summaries)
count_words(word_counts, clean_texts)
print("Size of Vocabulary :", len(word_counts))
embeddings_index = {}
# Load the pre-trained word vectors into a dictionary; embeddings_index ends up as {'a': vector, 'money': vector, ...}
with open(r'E:\word2vecmodel\numberbatch-en-17.04b.txt', 'r',encoding='utf8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding
missing_words = 0
threshold = 20  # words appearing fewer than 20 times are dropped
for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1  # count frequent words that have no pre-trained vector
missing_ratio = round(missing_words / len(word_counts), 4) * 100
print("Number of words missing from CN:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))
vocab_to_int = {}
value = 0
# Map each word to an integer id
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1
# Special tokens
codes = ["<UNK>", "<PAD>", "<EOS>", "<GO>"]
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)
int_to_vocab = {}
# Invert vocab_to_int to map ids back to words
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word
usage_ratio = round(len(vocab_to_int) / len(word_counts), 4) * 100
print("Total number of unique words:", len(word_counts))
print("Number of words we will use:", len(vocab_to_int))
print("Percent of words we will use: {}%".format(usage_ratio))
embedding_dim = 300
nb_words = len(vocab_to_int)
# Initialize the embedding matrix; word_embedding_matrix has shape (nb_words, 300)
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Convert the words in each sentence to integer ids, e.g. [[1, 234, 7687, 23, ...], [345, 908, 2359, 11234, ...], ...]
def convert_to_ints(text, word_count, unk_count, eos=False):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int['<UNK>'])
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int['<EOS>'])
        ints.append(sentence_ints)
    return ints, word_count, unk_count


word_count = 0
unk_count = 0
# int_summaries and int_texts have the form [[1, 234, 7687, 23, ...], [345, 908, 2359, 11234, ...], ...]
int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)


def create_lengths(text):
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])


lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)
# Length percentiles of the texts
print(np.percentile(lengths_texts.counts, 90))
print(np.percentile(lengths_texts.counts, 95))
print(np.percentile(lengths_texts.counts, 99))
# Length percentiles of the summaries
print(np.percentile(lengths_summaries.counts, 90))
print(np.percentile(lengths_summaries.counts, 95))
print(np.percentile(lengths_summaries.counts, 99))

# Count <UNK> tokens in a sentence; used below to filter the training set
def unk_counter(sentence):
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int['<UNK>']:
            unk_count += 1
    return unk_count


sorted_summaries = []
sorted_texts = []
max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0
# Filter and sort by text length; count is the sentence index within the inner loop
for length in range(min(lengths_texts.counts), max_text_length):
    for count, words in enumerate(int_summaries):
        if (len(int_summaries[count]) >= min_length and len(int_summaries[count]) <= max_summary_length
            and len(int_texts[count]) > min_length and unk_counter(
            int_summaries[count]) <= unk_summary_limit and unk_counter(int_texts[count]) < unk_text_limit
            and length == len(int_texts[count])):
            sorted_summaries.append(int_summaries[count])
            sorted_texts.append(int_texts[count])
# sorted_texts and sorted_summaries are now preprocessed and sorted by text length in ascending order
# Define placeholders for the model inputs
def model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')  # batch_size x sequence length
    targets = tf.placeholder(tf.int32, [None, None], name='targets')  # batch_size x sequence length
    lr = tf.placeholder(tf.float32, name='learning_rate')  # learning rate
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')  # dropout keep probability, to reduce overfitting
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')  # lengths of the summaries in the batch
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')  # maximum summary length in the batch
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')  # lengths of the texts in the batch
    return input_data, targets, lr, keep_prob, summary_length, max_summary_length ,text_length
# Prepend <GO> to each target sequence in the batch
def process_encoding_input(target_data, vocab_to_int, batch_size):  # target_data is the summary
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])  # drop the last token of each target sequence
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    return dec_input
# Encoder: bidirectional LSTM layers
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, input_keep_prob=keep_prob)
            cell_bw = tf.nn.rnn_cell.BasicLSTMCell(rnn_size)
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, input_keep_prob=keep_prob)
            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, rnn_inputs, sequence_length,
                                                                    dtype=tf.float32)
        enc_output = tf.concat(enc_output, 2)  # concatenate the forward and backward outputs
    return enc_output, enc_state  # enc_output is the encoder's intermediate representation
# Decoder used during training
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, vocab_size, max_summary_length):
    # TrainingHelper feeds the ground-truth target tokens at each step
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input, sequence_length=summary_length,
                                                        time_major=False)
    # Wrap the cell, helper, initial state and output projection into a decoder
    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, training_helper, initial_state, output_layer)
    # dynamic_decode returns (final_outputs, final_state, final_sequence_lengths);
    # final_outputs is a namedtuple containing (rnn_output, sample_id)
    training_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, output_time_major=False,
                                                              impute_finished=True,
                                                              maximum_iterations=max_summary_length)
    return training_logits
# Decoder used at inference time; decoding needs the <GO> and <EOS> tokens
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer, max_summary_length, batch_size):
    # Tile the start token across the batch
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_token')
    # GreedyEmbeddingHelper feeds back the embedding of the previously predicted token
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, start_tokens, end_token)
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, inference_helper, initial_state, output_layer)
    inference_logits, _, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder, output_time_major=False,
                                                               impute_finished=True,
                                                               maximum_iterations=max_summary_length)
    return inference_logits
# Decoder layer: builds both the training and inference decoders
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(lstm, input_keep_prob=keep_prob)
    # Fully connected output layer projecting decoder outputs onto the vocabulary
    output_layer = tf.layers.Dense(vocab_size,
                                   kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    # Bahdanau attention over the encoder outputs
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, text_length, normalize=False)
    # Note: DynamicAttentionWrapper/DynamicAttentionWrapperState are the TF 1.1 contrib names;
    # in TF 1.2 they were renamed AttentionWrapper/AttentionWrapperState
    dec_cell = tf.contrib.seq2seq.DynamicAttentionWrapper(dec_cell, attn_mech, rnn_size)
    # Initialize the attention wrapper state from the forward encoder state
    initial_state = tf.contrib.seq2seq.DynamicAttentionWrapperState(enc_state[0])
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state,
                                                  output_layer, vocab_size, max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings, vocab_to_int['<GO>'], vocab_to_int['<EOS>'],
                                                    dec_cell, initial_state, output_layer, max_summary_length,
                                                    batch_size)
    return training_logits, inference_logits
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, vocab_size,
                  rnn_size, num_layers, vocab_to_int, batch_size):
    embeddings = word_embedding_matrix  # the output layer predicts over the whole vocabulary, hence vocab_size
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    training_logits, inference_logits = decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size,
                                                       text_length, summary_length, max_summary_length, rnn_size,
                                                       vocab_to_int, keep_prob, batch_size, num_layers)
    return training_logits, inference_logits
# Padding helper
def pad_sentence_batch(sentence_batch):
    # Pad every sentence in the batch with <PAD> up to the length of the longest sentence
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]


def get_batches(summaries, texts, batch_size):
    # Yield batches of padded summaries and texts, together with their lengths
    for batch_i in range(0, len(texts) // batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))  # fixed: text lengths go into pad_texts_lengths
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths
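# Hyperparameters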
epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75
train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()
    # Reverse the input sequences (feeding the source in reverse order can help seq2seq training)
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]), targets, keep_prob, text_length,
                                                      summary_length, max_summary_length, len(vocab_to_int),
                                                      rnn_size, num_layers, vocab_to_int, batch_size)
    training_logits = tf.identity(training_logits.rnn_output, 'logits')  # per-word logits, used to compute the loss
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')  # predicted word ids
    # sequence_mask builds a (batch_size, max_summary_length) mask with 1s up to each summary's true length
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')
    with tf.name_scope("optimization"):
        # sequence_loss computes the seq2seq loss; the mask weights out the padding positions
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = optimizer.compute_gradients(cost)
        # Clip each gradient element to [-5, 5] to help avoid exploding gradients
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built")
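# Train on a slice of the sorted data (5,000 examples starting at index 200000) to keep training manageable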
start = 200000
end = start + 5000
sorted_summaries_short = sorted_summaries[start:end]
sorted_texts_short = sorted_texts[start:end]
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 20
stop_early = 0
stop = 3
per_epoch = 3
update_check = (len(sorted_texts_short)//batch_size//per_epoch)-1
update_loss = 0
batch_loss = 0
summary_update_loss = []
checkpoint = "best_model.ckpt"
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(1,epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i,(summaries_batch, texts_batch, summaries_lengths,texts_lengths) in enumerate(get_batches(sorted_summaries_short,sorted_texts_short,batch_size)):
            start_time = time.time()
            _,loss = sess.run([train_op,cost],{input_data:texts_batch,targets:summaries_batch,lr:learning_rate,summary_length:summaries_lengths,text_length:texts_lengths,keep_prob:keep_probability})
            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time
            if batch_i % display_step == 0 and batch_i >0:
                print('Epoch{:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds:{:>4.2f}'.format(epoch_i,epochs,batch_i,
                                                                                               len(sorted_texts_short)//batch_size,
                                                                                               batch_loss/display_step,
                                                                                               batch_time*display_step))
            if batch_i % update_check == 0 and batch_i>0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                # Save the model if update_loss is a new minimum
                if update_loss <= min(summary_update_loss):
                    print('New Record')
                    stop_early = 0
                    saver = tf.train.Saver()
                    saver.save(sess, checkpoint)
                else:
                    print('No Improvement')
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        if stop_early == stop:
            print("Stopping Training")
            break
# Test the trained model
def text_to_seq(text):
    text = clean_text(text)
    return [vocab_to_int.get(word,vocab_to_int['<UNK>']) for word in text.split()]
random = np.random.randint(0,len(clean_texts))
input_sentence = clean_texts[random]
text = text_to_seq(clean_texts[random])
checkpoint = './best_model.ckpt'
loaded_graph = tf.Graph()
with tf.Session(graph = loaded_graph) as sess:
    loader = tf.train.import_meta_graph(checkpoint+'.meta')
    loader.restore(sess,checkpoint)
    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    answer_logits = sess.run(logits, {input_data: [text] * batch_size,
                                      summary_length: [np.random.randint(5, 8)],
                                      text_length: [len(text)] * batch_size,
                                      keep_prob: 1.0})[0]
pad = vocab_to_int["<PAD>"]
print('Original Text:', input_sentence)
print('\nText')
print('Word Ids: {}'.format([i for i in text]))
print('Input Words: {}'.format(" ".join([int_to_vocab[i] for i in text])))
print("\nSummary")
print('Word Ids: {}'.format([i for i in answer_logits if i != pad]))
print('Response Words: {}'.format(" ".join([int_to_vocab[i] for i in answer_logits if i != pad])))