seq2seq: basic architecture of the algorithm

First of all, thanks to Tang Yudi's course, where much more material on this topic is available.

What follows is my own understanding of seq2seq.

Simply put, seq2seq is an encode-then-decode process; the official TensorFlow site has a corresponding seq2seq explanation.

Alternatively, there is a very detailed write-up of it on GitHub.

Below is the main architecture. First, import a few essential base libraries.

import pandas as pd
import numpy as np
import tensorflow as tf
import re

Load the file and preprocess it: drop redundant fields, expand the English contractions, strip special symbols, and remove stopwords. (Stopword lists are easy to find online.)

The key preprocessing step is the word2vec word-vector conversion. You can train the vectors yourself or take pre-trained ones; I don't have a GPU environment yet, so I used pre-trained vectors (the latest release at the time was 17.06).

After converting every word in the text to an id, you still need to add the start and stop symbols, and remember to check whether the word2vec vocabulary actually covers all the words in your text.
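
For example (illustrative only; in the pipeline below, <EOS> is appended inside convert_to_ints and <GO> is prepended later in process_encoding_input):

# target summary:          nice  product
# decoder input:   <GO>    nice  product
# decoder target:          nice  product  <EOS>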

reviews = pd.read_csv("")  # path to the reviews CSV (left blank in the original)
reviews = reviews.dropna()  # drop rows with missing values
reviews = reviews.reset_index(drop=True)

contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    # ... abbreviated here; the full mapping covers the remaining common English contractions
}
def clean_text(text, remove_stopwords=True):
    text = text.lower()

    # expand contractions ("can't" -> "cannot", ...)
    text = text.split()
    new_text = []
    for word in text:
        if word in contractions:
            new_text.append(contractions[word])
        else:
            new_text.append(word)
    text = " ".join(new_text)

    # strip urls, html remnants and special characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    if remove_stopwords:
        # load the stopword list (path left blank in the original), one word per line
        with open("") as words:
            stopwords = words.read().split()
        stops = set(stopwords)
        text = text.split()
        text = [w for w in text if w not in stops]
        text = " ".join(text)

    return text

clean_summaries = []
for summary in reviews.Summary:
    clean_summaries.append(clean_text(summary, remove_stopwords=False))
print("Summaries are complete.")

clean_texts = []
for text in reviews.Text:
    clean_texts.append(clean_text(text))
print("Texts are complete.")

def count_words(count_dict, text):
    for sentence in text:
        for word in sentence.split():
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1
word_counts = {}

count_words(word_counts, clean_summaries)
count_words(word_counts, clean_texts)

print("Size of Vocabulary:", len(word_counts))


embeddings_index = {}
# load the pre-trained vectors (path left blank in the original): each line is a word followed by its vector
with open('') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

# keep a word if it is frequent enough or already has a pre-trained vector
threshold = 20  # minimum count to keep a word without a pre-trained vector (value is my assumption; not given in the post)
vocab_to_int = {}
value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

codes = ["<UNK>", "<PAD>", "<EOS>", "<GO>"]
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word
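
As a quick sanity check on coverage (a check of my own, reusing the word_counts and embeddings_index built above):

# count how many vocabulary words have no pre-trained vector
missing_words = 0
for word, count in word_counts.items():
    if word not in embeddings_index:
        missing_words += 1
missing_ratio = round(missing_words / len(word_counts), 4) * 100
print("Words missing a pre-trained vector:", missing_words)
print("Percent of vocabulary missing: {}%".format(missing_ratio))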

Once everything has been converted to ids, look at the longest sequence in the corpus and pick a suitable maximum length: sequences longer than the cut-off I simply dropped, and shorter ones are padded with <PAD>.

embedding_dim = 300  # dimensionality of the pre-trained vectors
nb_words = len(vocab_to_int)
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # words without a pre-trained vector get a random one
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

def convert_to_ints(text, word_count, unk_count, eos=False):
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                unk_count += 1  # track how many tokens fall back to <UNK>
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count
word_count = 0
unk_count = 0
int_summaries, word_count, unk_count = convert_to_ints(clean_summaries, word_count, unk_count)
int_texts, word_count, unk_count = convert_to_ints(clean_texts, word_count, unk_count, eos=True)
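
Since convert_to_ints tracks both counters, it is worth printing what fraction of all tokens fell back to <UNK> (a small check of my own):

unk_percent = round(unk_count / word_count, 4) * 100
print("Total number of words:", word_count)
print("Percent of words that are <UNK>: {}%".format(unk_percent))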

def create_lengths(text):
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)
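
Before fixing the cut-offs below, a quick look at the length distribution helps choose them (the percentile here is my own choice, not from the original):

print(lengths_texts.counts.describe())
print(np.percentile(lengths_texts.counts, 90))
print(np.percentile(lengths_summaries.counts, 90))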

def unk_counter(sentence):
    unk_count = 0
    for word in sentence:
        if word == vocab_to_int["<UNK>"]:
            unk_count += 1
    return unk_count
sorted_summaries = []
sorted_texts = []
max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0

# iterate lengths in increasing order so the kept pairs come out sorted by text length (less padding per batch)
for length in range(min(lengths_texts.counts), max_text_length):
    for count, words in enumerate(int_summaries):
        if (len(int_summaries[count]) >= min_length and
            len(int_summaries[count]) <= max_summary_length and
            len(int_texts[count]) >= min_length and
            unk_counter(int_summaries[count]) <= unk_summary_limit and
            unk_counter(int_texts[count]) <= unk_text_limit and
            length == len(int_texts[count])):
            sorted_summaries.append(int_summaries[count])
            sorted_texts.append(int_texts[count])
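
A quick check that the filter kept a reasonable amount of data:

print(len(sorted_summaries))
print(len(sorted_texts))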


With the data processing done, it's time to build the network architecture. For the encoding stage I use a bidirectional dynamic RNN.

def model_inputs():
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')
    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

def process_encoding_input(target_data, vocab_to_int, batch_size):
    # drop the last token of each target sequence and prepend <GO>
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    return dec_input
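
For example, with hypothetical ids (<GO> = 3, <EOS> = 2):

# targets:    [[12,  7,  2],       dec_input:  [[ 3, 12,  7],
#              [ 5,  9,  2]]                    [ 3,  5,  9]]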

def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    # stack num_layers bidirectional LSTM layers, feeding each layer's output into the next
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.nn.rnn_cell.DropoutWrapper(cell_fw, input_keep_prob=keep_prob)
            cell_bw = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.nn.rnn_cell.DropoutWrapper(cell_bw, input_keep_prob=keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, rnn_inputs, sequence_length, dtype=tf.float32)
            rnn_inputs = tf.concat(enc_output, 2)  # concatenated fw/bw outputs feed the next layer

    # concatenate the forward and backward outputs of the last layer
    enc_output = tf.concat(enc_output, 2)
    return enc_output, enc_state
In the decoder network, TrainingHelper and GreedyEmbeddingHelper serve the training and inference stages respectively: TrainingHelper feeds the decoder the ground-truth target tokens during training, while at inference time GreedyEmbeddingHelper feeds each step the output from step t-1. As for BahdanauAttention, the TensorFlow API describes two selectable attention variants via the normalize flag; here I set normalize=False. My understanding is that the normalized variant can speed up training convergence, while the plain Bahdanau form should be more accurate. (I'm not sure this understanding is correct; if you have hands-on experience with which of the two works better, please let me know, thanks.)

def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, vocab_size, max_summary_length):
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input, sequence_length=summary_length, time_major=False)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, training_helper, initial_state, output_layer)
    training_logits, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)
    return training_logits

def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    # every sequence in the batch starts from the same <GO> token
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings, start_tokens, end_token)
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell, inference_helper, initial_state, output_layer)
    inference_logits, _ = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    return inference_logits

def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size, num_layers):
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.nn.rnn_cell.LSTMCell(rnn_size, initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(lstm, input_keep_prob=keep_prob)
    output_layer = tf.layers.Dense(vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size, enc_output, text_length, normalize=False)
    # wrap the decoder cell with the attention mechanism
    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, rnn_size)
    # initialize the attention cell from the encoder's forward final state
    initial_state = dec_cell.zero_state(batch_size, tf.float32).clone(cell_state=enc_state[0])
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer,
                                                  vocab_size, max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings, vocab_to_int['<GO>'], vocab_to_int['<EOS>'], dec_cell,
                                                    initial_state, output_layer, max_summary_length, batch_size)
    return training_logits, inference_logits

def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length,
                  vocab_size, rnn_size, num_layers, vocab_to_int, batch_size):
    embeddings = word_embedding_matrix  # the pre-built embedding matrix acts as the lookup table
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob)

    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    training_logits, inference_logits = decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length,
                                                       summary_length, max_summary_length, rnn_size, vocab_to_int, keep_prob, batch_size,
                                                       num_layers)
    return training_logits, inference_logits

What remains is batch generation and <PAD> padding; define a Graph and training can begin. I have not written out the full training run here, since understanding the seq2seq architecture is the key point, but a minimal sketch of the loop follows the graph definition below.

def pad_sentence_batch(sentence_batch):
    # pad every sentence in the batch to the length of the longest one
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [vocab_to_int['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

def get_batches(summaries, texts, batch_size):
    for batch_i in range(0, len(texts)//batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i: start_i + batch_size]
        texts_batch = texts[start_i: start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))

        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))

        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75

train_graph = tf.Graph()
with train_graph.as_default():
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()
    # feeding the source sequence reversed is a common seq2seq trick
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, keep_prob, text_length, summary_length, max_summary_length,
                                                      len(vocab_to_int), rnn_size, num_layers, vocab_to_int, batch_size)

    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    # mask out <PAD> positions so they do not contribute to the loss
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')
    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        gradients = optimizer.compute_gradients(cost)
        # clip gradients to stabilize training
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
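
Here is a minimal sketch of what that training loop could look like (my own addition, not part of the original post; it assumes the graph, placeholders, and get_batches defined above):

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    for epoch_i in range(1, epochs + 1):
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(sorted_summaries, sorted_texts, batch_size)):
            # texts are the encoder input, summaries the decoder target
            _, loss = sess.run([train_op, cost],
                               feed_dict={input_data: texts_batch,
                                          targets: summaries_batch,
                                          lr: learning_rate,
                                          summary_length: summaries_lengths,
                                          text_length: texts_lengths,
                                          keep_prob: keep_probability})
            if batch_i % 100 == 0:
                print('Epoch {} Batch {} - Loss: {:.4f}'.format(epoch_i, batch_i, loss))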

Finally, I am still learning; if anything here is incorrect, or if you have suggestions, I hope you will point them out.

Best regards!


