首先,感谢唐宇迪课程。
更多内容请加入学习。
接下来说说我对于seq2seq的理解。
seq2seq简单来说就是一个先编码、再解码的过程,tensorflow官网也有相应的seq2seq教程和解释。
下面是主要的架构,导入一些基础库,是必不可少的。
import pandas as pd
import numpy as np
import tensorflow as tf
import re
把文件传入进来,进行预处理,比如去掉多余的项、英文的连写、一些特殊符号,还有停用词。(停用词网上有很多)
预处理中最关键的当然是word2vec的词向量转换,可以自己训练,也可以拿别人现成的,我还没有GPU环境,所以用的是别人的,目前最新的是17.06
把文本中所有的词依次转换为词向量之后,还需要加入开始符和停止符,同时记得计算下word2vec里,是否全包括了文本里的词。
# Load the reviews, drop rows with missing values, and renumber the index
# so positional access lines up after the drop.
reviews = pd.read_csv("").dropna().reset_index(drop=True)

# Common English contractions mapped onto their expanded forms, so that
# downstream tokenization only ever sees full words.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
}
def _load_stopwords(path=""):
    """Load and cache the stop-word set (one word per whitespace-separated token).

    Bug fixes vs. the original: the file was reopened on every clean_text()
    call and never closed; only the FIRST line was read; and set(str) built a
    set of single characters instead of words.
    """
    if not hasattr(_load_stopwords, "_cache"):
        with open(path) as handle:
            _load_stopwords._cache = set(handle.read().split())
    return _load_stopwords._cache


def clean_text(text, remove_stopwords=True):
    """Normalize one review: lowercase, expand contractions, strip URLs /
    HTML fragments / punctuation, and optionally remove stop-words.

    Args:
        text: raw review string.
        remove_stopwords: drop stop-words when True (use False for summaries).

    Returns:
        The cleaned string.
    """
    text = text.lower()

    # Expand contractions token by token ("can't" -> "cannot").
    expanded = []
    for word in text.split():
        expanded.append(contractions.get(word, word))
    text = " ".join(expanded)

    # Strip URLs, HTML link fragments, line breaks and punctuation.
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&', '', text)
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)

    if remove_stopwords:
        # Only touch the stop-word file when it is actually needed.
        stops = _load_stopwords()
        text = " ".join(w for w in text.split() if w not in stops)
    return text
# Clean the summaries, keeping stop-words so they stay readable as titles.
clean_summaries = [clean_text(summary, remove_stopwords=False)
                   for summary in reviews.Summary]
print("Summaries are complete.")

# Clean the full review texts, removing stop-words.
clean_texts = [clean_text(text) for text in reviews.Text]
print("Texts are complete.")
def count_words(count_dict, text):
    """Accumulate word frequencies from an iterable of sentences into count_dict."""
    for sentence in text:
        for token in sentence.split():
            count_dict[token] = count_dict.get(token, 0) + 1
# Build one vocabulary frequency table covering both corpora.
word_counts = {}
for corpus in (clean_summaries, clean_texts):
    count_words(word_counts, corpus)
print("Size of Vocabulary:", len(word_counts))
# Parse the pre-trained word vectors: each line is "<word> <v1> <v2> ...".
embeddings_index = {}
with open('') as vector_file:
    for row in vector_file:
        parts = row.split(' ')
        embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')
# Bug fix: `threshold` was used below but never defined (NameError).
# Minimum corpus frequency for a word WITHOUT a pre-trained embedding to
# still earn its own id; rarer words map to <UNK>.
threshold = 20

vocab_to_int = {}
value = 0
for word, count in word_counts.items():
    # Keep words that are frequent enough OR already have an embedding.
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

# Special tokens: unknown word, padding, end-of-sentence, decoder start.
codes = ["<UNK>", "<PAD>", "<EOS>", "<GO>"]
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

# Reverse lookup table: integer id -> word.
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word
当全部转化为词向量后,需要看看文本里的最大词向量长度,寻找最合适的,大于的我这里是去掉了,小于的用PAD填充。
embedding_dim = 300
nb_words = len(vocab_to_int)

# One row per vocabulary id: the pre-trained vector when available,
# otherwise a fresh random vector (stored back into embeddings_index so the
# word keeps the same vector if looked up again).
word_embedding_matrix = np.zeros((nb_words, embedding_dim), dtype=np.float32)
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # np.random.uniform already returns an ndarray — the extra np.array
        # wrapper was redundant; cast to float32 to match the matrix dtype.
        new_embedding = np.random.uniform(-1.0, 1.0, embedding_dim).astype(np.float32)
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding
def convert_to_ints(text, word_count, unk_count, eos=False):
    """Convert cleaned sentences into lists of vocabulary ids.

    Args:
        text: iterable of cleaned sentence strings.
        word_count: running total of words seen so far.
        unk_count: running total of out-of-vocabulary words so far.
        eos: when True, append the <EOS> id to every sentence (used for the
            source texts).

    Returns:
        (ints, word_count, unk_count) with both running totals updated.
    """
    ints = []
    for sentence in text:
        sentence_ints = []
        for word in sentence.split():
            word_count += 1
            if word in vocab_to_int:
                sentence_ints.append(vocab_to_int[word])
            else:
                sentence_ints.append(vocab_to_int["<UNK>"])
                # Bug fix: unk_count was returned but never incremented,
                # so the reported UNK ratio was always zero.
                unk_count += 1
        if eos:
            sentence_ints.append(vocab_to_int["<EOS>"])
        ints.append(sentence_ints)
    return ints, word_count, unk_count
# Running totals shared across both conversion passes.
word_count, unk_count = 0, 0
int_summaries, word_count, unk_count = convert_to_ints(
    clean_summaries, word_count, unk_count)
# Texts get a trailing <EOS> so the decoder knows where the source ends.
int_texts, word_count, unk_count = convert_to_ints(
    clean_texts, word_count, unk_count, eos=True)
def create_lengths(text):
    """Return a one-column DataFrame ('counts') with each sentence's length."""
    return pd.DataFrame([len(sentence) for sentence in text],
                        columns=['counts'])
# Per-sentence length tables, used to pick the length cut-offs below.
lengths_summaries = create_lengths(int_summaries)
lengths_texts = create_lengths(int_texts)
def unk_counter(sentence):
    """Return how many ids in the sentence are the <UNK> token."""
    return sentence.count(vocab_to_int["<UNK>"])
# Filtering thresholds chosen from the length distributions above.
max_text_length = 84
max_summary_length = 13
min_length = 2
unk_text_limit = 1
unk_summary_limit = 0


def _keep_pair(idx):
    """True when the summary/text pair at idx passes every length/UNK filter."""
    return (min_length <= len(int_summaries[idx]) <= max_summary_length and
            min_length <= len(int_texts[idx]) < max_text_length and
            unk_counter(int_summaries[idx]) <= unk_summary_limit and
            unk_counter(int_texts[idx]) <= unk_text_limit)


# Keep the surviving pairs ordered by text length (stable, ascending) so a
# batch holds similarly sized texts and needs minimal padding.  This single
# O(n log n) sort replaces the original O(max_text_length * n) rescan and
# produces the identical ordering (stable sort == ascending length sweep).
kept = sorted((i for i in range(len(int_summaries)) if _keep_pair(i)),
              key=lambda i: len(int_texts[i]))
sorted_summaries = [int_summaries[i] for i in kept]
sorted_texts = [int_texts[i] for i in kept]
数据处理完后,就要开始搭建网络架构了。编码的过程,我这里使用的是双向动态RNN。
def model_inputs():
    """Create the graph's feed placeholders.

    Returns:
        (input_data, targets, lr, keep_prob, summary_length,
         max_summary_length, text_length)
    """
    data_ph = tf.placeholder(tf.int32, [None, None], name='input')
    target_ph = tf.placeholder(tf.int32, [None, None], name='targets')
    lr_ph = tf.placeholder(tf.float32, name='learning_rate')
    dropout_ph = tf.placeholder(tf.float32, name='keep_prob')
    summary_len_ph = tf.placeholder(tf.int32, (None,), name='summary_length')
    # Longest summary in the current batch; caps decoder iterations.
    max_summary_len = tf.reduce_max(summary_len_ph, name='max_dex_len')
    text_len_ph = tf.placeholder(tf.int32, (None,), name='text_length')
    return (data_ph, target_ph, lr_ph, dropout_ph,
            summary_len_ph, max_summary_len, text_len_ph)
def process_encoding_input(target_data, vocab_to_int, batch_size):
    """Prepend <GO> to every target sequence and drop its final token,
    producing the decoder's (teacher-forced) input."""
    go_tokens = tf.fill([batch_size, 1], vocab_to_int['<GO>'])
    trimmed = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    return tf.concat([go_tokens, trimmed], 1)
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inpunts, keep_prob):
    """Stacked bidirectional LSTM encoder.

    Returns the forward/backward outputs concatenated on the feature axis
    and the final state of the last layer.

    NOTE(review): the 'rnn_inpunts' spelling is kept as-is so existing
    keyword callers are not broken.
    """
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):

            def make_cell():
                # Both directions use the same initializer and seed,
                # matching the original construction.
                cell = tf.nn.rnn_cell.LSTMCell(
                    rnn_size,
                    initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
                return tf.nn.rnn_cell.DropoutWrapper(
                    cell, input_keep_prob=keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(
                make_cell(), make_cell(), rnn_inpunts,
                sequence_length, dtype=tf.float32)
    # Merge forward and backward outputs along the feature dimension.
    enc_output = tf.concat(enc_output, 2)
    return enc_output, enc_state
在解码的网络中,TrainingHelper和GreedyEmbeddingHelper,分别代表训练阶段的和预测阶段。TrainingHelper是指在训练阶段使用的是target_label,预测阶段GreedyEmbeddingHelper使用的是T-1的输出。另外BahdanauAttention在tensorflow的API中,解释是两种可选择的Attention机制,这里将
normalize选择False,我的理解是normalize,可以加快网络的迭代速度,而Bahdanau应该是准确性更好。(不知道是否理解正确,如果谁有实战过,这两种机制哪种效果好,还希望能告诉我,谢谢。)
def training_decoding_layer(dec_embed_input, summary_length, dec_cell,
                            initial_state, output_layer, vocab_size,
                            max_summary_length):
    """Training-time decoder: teacher forcing via TrainingHelper (the decoder
    is fed the ground-truth previous token at each step)."""
    helper = tf.contrib.seq2seq.TrainingHelper(
        inputs=dec_embed_input,
        sequence_length=summary_length,
        time_major=False)
    decoder = tf.contrib.seq2seq.BasicDecoder(
        dec_cell, helper, initial_state, output_layer)
    training_logits, _ = tf.contrib.seq2seq.dynamic_decode(
        decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_summary_length)
    return training_logits
def inference_decoding_layer(embeddings, start_token, end_token, dec_cell,
                             initial_state, output_layer,
                             max_summary_length, batch_size):
    """Inference-time decoder: feeds back its own greedy predictions,
    starting every sequence from start_token and stopping at end_token."""
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32),
                           [batch_size], name='start_tokens')
    # Bug fix: the class is GreedyEmbeddingHelper — "GreendyEmbeddingHelper"
    # does not exist and raised AttributeError at graph-build time.
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
        embeddings, start_tokens, end_token)
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(
        dec_cell, inference_helper, initial_state, output_layer)
    inference_logits, _ = tf.contrib.seq2seq.dynamic_decode(
        inference_decoder,
        output_time_major=False,
        impute_finished=True,
        maximum_iterations=max_summary_length)
    return inference_logits
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state,
                   vocab_size, text_length, summary_length,
                   max_summary_length, rnn_size, vocab_to_int, keep_prob,
                   batch_size, num_layers):
    """Bahdanau-attention decoder producing (training_logits, inference_logits).

    NOTE(review): parameter 'emdeddings' renamed to 'embeddings'; the body
    referenced the undefined name 'embedding', so no working caller could
    have depended on the old keyword spelling.
    """
    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            # Bug fix: '-0,1' (comma instead of dot) passed (-0, 1, 0.1) as
            # three positional args; the intended range is (-0.1, 0.1).
            lstm = tf.nn.rnn_cell.LSTMCell(
                rnn_size,
                initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.nn.rnn_cell.DropoutWrapper(
                lstm, input_keep_prob=keep_prob)

    output_layer = tf.layers.Dense(
        vocab_size,
        kernel_initializer=tf.truncated_normal_initializer(mean=0.0, stddev=0.1))

    attn_mech = tf.contrib.seq2seq.BahdanauAttention(
        rnn_size, enc_output, text_length, normalize=False)
    # Bug fix: AttentionWrapper (the cell wrapper) is what wraps dec_cell;
    # AttentionWrapperState is only the state namedtuple.
    dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell, attn_mech, rnn_size)
    # Bug fix: build the wrapper's zero state and seed it with the encoder's
    # final state, instead of calling the state constructor directly.
    initial_state = dec_cell.zero_state(batch_size, tf.float32).clone(
        cell_state=enc_state[0])

    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(
            dec_embed_input, summary_length, dec_cell, initial_state,
            output_layer, vocab_size, max_summary_length)
    # Reuse the same decoder variables for the inference path.
    with tf.variable_scope("decode", reuse=True):
        # Bug fix: the call was wrapped in an extra pair of parentheses,
        # passing a single tuple instead of eight arguments.
        inference_logits = inference_decoding_layer(
            embeddings, vocab_to_int['<GO>'], vocab_to_int['<EOS>'],
            dec_cell, initial_state, output_layer, max_summary_length,
            batch_size)
    return training_logits, inference_logits
def seq2seq_model(input_data, target_data, keep_prob, text_length,
                  summary_length, max_summary_length, vocab_size, rnn_size,
                  num_layers, vocab_to_int, batch_size):
    """Wire the encoder and decoder together.

    Returns:
        (training_logits, inference_logits) from the decoding layer.
    """
    embeddings = word_embedding_matrix

    # Encoder: embed the source ids and run the bidirectional LSTM stack.
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(
        rnn_size, text_length, num_layers, enc_embed_input, keep_prob)

    # Decoder: shift targets right with <GO>, embed, decode with attention.
    dec_input = process_encoding_input(target_data, vocab_to_int, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    return decoding_layer(
        dec_embed_input, embeddings, enc_output, enc_state, vocab_size,
        text_length, summary_length, max_summary_length, rnn_size,
        vocab_to_int, keep_prob, batch_size, num_layers)
下面就是一些batch的生成和PAD填充,指定一个Graph,就可以开始训练了。当然最后训练网络的部分,我并没有写出来,毕竟理解seq2seq的架构才是关键。
def pad_sentence_batch(sentence_batch):
    """Right-pad every sentence with <PAD> to the batch's longest length."""
    longest = max(len(sentence) for sentence in sentence_batch)
    pad_id = vocab_to_int['<PAD>']
    return [sentence + [pad_id] * (longest - len(sentence))
            for sentence in sentence_batch]
def get_batches(summaries, texts, batch_size):
    """Yield aligned training batches.

    Yields:
        (padded summaries array, padded texts array,
         summary lengths list, text lengths list)
    The trailing partial batch is dropped.
    """
    for batch_i in range(len(texts) // batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i: start_i + batch_size]
        # Bug fix: the original sliced the stale global 'text' (a leftover
        # loop variable) instead of the 'texts' parameter.
        texts_batch = texts[start_i: start_i + batch_size]

        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))

        pad_summaries_lengths = [len(summary) for summary in pad_summaries_batch]
        pad_texts_lengths = [len(text) for text in pad_texts_batch]

        # Bug fix: the original yielded pad_texts_lengths twice, so the
        # summary lengths were never delivered to the caller.
        yield (pad_summaries_batch, pad_texts_batch,
               pad_summaries_lengths, pad_texts_lengths)
# Hyperparameters.
epochs = 100
batch_size = 64
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75

train_graph = tf.Graph()
with train_graph.as_default():
    (input_data, targets, lr, keep_prb, summary_length,
     max_summary_length, text_length) = model_inputs()

    # Feed the source reversed along its last axis — a common seq2seq trick.
    training_logits, inference_logits = seq2seq_model(
        tf.reverse(input_data, [-1]), targets, keep_prb, text_length,
        summary_length, max_summary_length, len(vocab_to_int), rnn_size,
        num_layers, vocab_to_int, batch_size)

    # Name the outputs so they can be fetched from a restored graph.
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')

    # Mask padding positions out of the loss.
    masks = tf.sequence_mask(summary_length, max_summary_length,
                             dtype=tf.float32, name='maks')

    with tf.name_scope("optimization"):
        cost = tf.contrib.seq2seq.sequence_loss(training_logits, targets, masks)
        optimizer = tf.train.AdamOptimizer(learning_rate)
        # Clip gradients into [-5, 5] to fight exploding gradients.
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5, 5.), var)
                            for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
最后,我也是在持续学习中,如果哪里有不正确的,或者建议,希望能指出。
祝近安!