Section 5.3 is about simple text processing. The course source code is here: https://github.com/tensorlayer/tensorlayer/blob/master/example/tutorial_word2vec_basic.py
But I could not get Step 7: Evaluate by analogy questions to work, mainly because I had no idea what questions-words.txt actually is. I'll give it another try when I get the chance.
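For the record: questions-words.txt is the word-analogy test set shipped with Mikolov's original word2vec release. Apart from ':'-prefixed section headers, every line holds four words "a b c d" meaning "a is to b as c is to d", e.g. "Athens Greece Baghdad Iraq", and a question counts as solved when the word nearest to emb(b) - emb(a) + emb(c) is d. A rough sketch of that check; answer_analogy and its arguments are my own illustration, not the course API:

# Hypothetical sketch: answer one analogy question "a : b :: c : ?" by
# nearest-neighbor search in a trained, row-normalized embedding matrix.
import numpy as np

def answer_analogy(a, b, c, embeddings, dictionary, reverse_dictionary):
    target = embeddings[dictionary[b]] - embeddings[dictionary[a]] + embeddings[dictionary[c]]
    sims = np.dot(embeddings, target)  # cosine similarity to every word
    for idx in np.argsort(-sims):      # best match that is not an input word
        if reverse_dictionary[idx] not in (a, b, c):
            return reverse_dictionary[idx]

# e.g. answer_analogy('athens', 'greece', 'baghdad', ...) should return 'iraq'
# on a well-trained model (text8 is lowercased, so queries are lowercase too).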
import sys
import time
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from six.moves import xrange
words = tl.files.load_matt_mahoney_text8_dataset()  # downloads the text8 corpus if not cached
data_size = len(words)
print(data_size)
print(words[0:10])
resume = False  # set True to load previously saved data and model
_UNK = "_UNK"  # placeholder token for rare, out-of-vocabulary words
vocabulary_size = 50000  # keep only the 50k most frequent words
batch_size = 128
embedding_size = 128  # dimensionality of the word vectors
skip_window = 1  # how many words to consider left and right of the center word
num_skips = 2  # how many (input, label) pairs to draw per center word
num_sampled = 64  # number of negative examples for NCE sampling
learning_rate = 1.0
n_epoch = 12
model_file_name = "model_word2vec_50k_128"
num_steps = int((data_size / batch_size) * n_epoch)
print('%d steps per epoch, %d epochs in total' % (int(data_size / batch_size), n_epoch))
print(' learning_rate: %f' % learning_rate)
print(' batch_size: %d' % batch_size)
if resume:
    print("Load existing data and dictionaries" + "!" * 10)
    all_var = tl.files.load_npy_to_any(name=model_file_name + '.npy')
    data = all_var['data']
    count = all_var['count']
    dictionary = all_var['dictionary']
    reverse_dictionary = all_var['reverse_dictionary']
else:
    data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size, True, _UNK)
print('~~~~~~~~~~~~~5 most common words (+UNK)~~~~~~~~~~~~')
print(count[:5])
print('~~~~~~~~~~~~~~~~~~~~~~Sample data~~~~~~~~~~~~~~~~~')
print(data[:10], [reverse_dictionary[i] for i in data[:10]])
print('~~~~~~~~~~~~~Two surrounding words, one on each side~~~~~~~~~~~~')
batch, labels, data_index = tl.nlp.generate_skip_gram_batch(
    data=data, batch_size=8, num_skips=2, skip_window=1, data_index=0)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
print('~~~~~~~~~~~~~Four surrounding words, two on each side~~~~~~~~~~~~')
batch, labels, data_index = tl.nlp.generate_skip_gram_batch(
    data=data, batch_size=8, num_skips=4, skip_window=2, data_index=0)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
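With skip_window=2 each center word contributes pairs from up to two words on each side, and num_skips=4 draws four of them, so batch_size has to be a multiple of num_skips. The returned data_index marks where the scan stopped, so the next call can continue from there instead of restarting at 0.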
print('~~~~~~~~~~~~~~Build Skip-Gram model~~~~~~~~~~~~~')
valid_size = 16
valid_window = 100
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
emb_net = tl.layers.Word2vecEmbeddingInputlayer(
    inputs=train_inputs,
    train_labels=train_labels,
    vocabulary_size=vocabulary_size,
    embedding_size=embedding_size,
    num_sampled=num_sampled,
    name='word2vec_layer')
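Word2vecEmbeddingInputlayer bundles the embedding matrix with a sampled NCE loss: rather than computing a full softmax over all 50,000 words, each step only contrasts the true context word against num_sampled randomly drawn negatives, which is what makes training at this vocabulary size cheap. The loss is exposed as emb_net.nce_cost below.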
cost = emb_net.nce_cost
train_params = emb_net.all_params
# train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost, var_list=train_params)
train_op = tf.train.AdagradOptimizer(learning_rate).minimize(cost, var_list=train_params)
normalized_embeddings = emb_net.normalized_embeddings
valid_embed = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embed, normalized_embeddings, transpose_b=True)
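Since every row of normalized_embeddings has unit L2 norm, this matmul is exactly the cosine similarity between each of the 16 validation words and the whole vocabulary; it can be evaluated now and then during training to print each validation word's nearest neighbors (see the loop sketch at the end).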
print('~~~~~~~~~~~~~~~~~Start training~~~~~~~~~~~~~~~~')
sess = tf.InteractiveSession()
tl.layers.initialize_global_variables(sess)
if resume:
    print("Load existing model" + "!" * 10)
    # Load from ckpt or npz file
    # saver = tf.train.Saver()
    # saver.restore(sess, model_file_name+'.ckpt')
    tl.files.load_and_assign_npz_dict(name=model_file_name + '.npz', sess=sess)
emb_net.print_params()
emb_net.print_layers()
# save vocabulary to txt
tl.nlp.save_vocab(count, name='vocab_text8.txt')
average_loss = 0
step = 0
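From here the training loop is the standard pattern: draw a skip-gram batch, run train_op, and report the running NCE loss, occasionally dumping nearest neighbors of the validation words. A minimal sketch assuming the variables above (print_freq and eval_freq are my own choices, not from the course code):

print_freq = 2000   # assumed reporting interval
eval_freq = 10000   # assumed nearest-neighbor interval
data_index = 0      # restart reading from the beginning of the corpus
while step < num_steps:
    start_time = time.time()
    batch_inputs, batch_labels, data_index = tl.nlp.generate_skip_gram_batch(
        data=data, batch_size=batch_size, num_skips=num_skips,
        skip_window=skip_window, data_index=data_index)
    _, loss_val = sess.run([train_op, cost],
                           feed_dict={train_inputs: batch_inputs, train_labels: batch_labels})
    average_loss += loss_val
    if step > 0 and step % print_freq == 0:
        print('step %d/%d, average loss %f (%.2fs/batch)' %
              (step, num_steps, average_loss / print_freq, time.time() - start_time))
        average_loss = 0
    if step > 0 and step % eval_freq == 0:
        sim = similarity.eval(session=sess)  # [valid_size, vocabulary_size] cosine similarities
        for i in xrange(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            nearest = (-sim[i, :]).argsort()[1:9]  # top 8, skipping the word itself
            print('Nearest to %s: %s' % (valid_word,
                  ', '.join(reverse_dictionary[k] for k in nearest)))
    step += 1

The upstream script also saves data, the dictionaries, and the model parameters at intervals, which is what the resume branches above load back.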