TensorLayer Learning Log 14: Chapter 5, Section 5.3

Section 5.3 covers simple text processing. The course source code is here: https://github.com/tensorlayer/tensorlayer/blob/master/example/tutorial_word2vec_basic.py

However, I could not get Step 7 (Evaluate by analogy questions) to work, mainly because I had no idea what questions-words.txt actually is. I will give it another try when I get the chance~~
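For the record, questions-words.txt is the word-analogy test set shipped with Google's original word2vec: every non-comment line holds four words, e.g. "Athens Greece Baghdad Iraq", meaning "Athens is to Greece as Baghdad is to ?", and the model is scored on how often the nearest embedding to (Greece - Athens + Baghdad) is Iraq. Below is a minimal NumPy sketch of scoring one such line, assuming final_embeddings is the L2-normalized embedding matrix and dictionary is the word-to-id map produced by the script further down (the function and variable names here are my own, not the tutorial's):

import numpy as np

def eval_analogy(line, dictionary, final_embeddings):
    # line looks like "Athens Greece Baghdad Iraq"; lines starting with ":" are section headers
    w1, w2, w3, w4 = line.lower().split()
    if any(w not in dictionary for w in (w1, w2, w3, w4)):
        return None  # skip questions containing out-of-vocabulary words
    a, b, c = (final_embeddings[dictionary[w]] for w in (w1, w2, w3))
    target = b - a + c                       # the classic "king - man + woman" arithmetic
    sims = np.dot(final_embeddings, target)  # rows are unit length, so this ranks by cosine similarity
    for idx in (-sims).argsort():            # best match that is not one of the three question words
        if idx not in (dictionary[w1], dictionary[w2], dictionary[w3]):
            return idx == dictionary[w4]

Step 7 of the tutorial essentially runs this kind of check over the whole file and reports the fraction of analogies answered correctly.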

import sys
import time
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from six.moves import xrange  


words = tl.files.load_matt_mahoney_text8_dataset()
data_size = len(words)
print(data_size)  
print(words[0:10])  

resume = False  
_UNK = "_UNK"

vocabulary_size = 50000 
batch_size = 128
embedding_size = 128  
skip_window = 1  
num_skips = 2  

num_sampled = 64 
learning_rate = 1.0
n_epoch = 12
model_file_name = "model_word2vec_50k_128"

num_steps = int((data_size / batch_size) * n_epoch)  

print('%d Steps in an Epoch, total Epochs %d' % (int(data_size / batch_size), n_epoch))
print('   learning_rate: %f' % learning_rate)
print('   batch_size: %d' % batch_size)

if resume:
    print("Load existing data and dictionaries" + "!" * 10)
    all_var = tl.files.load_npy_to_any(name=model_file_name + '.npy')
    data = all_var['data']
    count = all_var['count']
    dictionary = all_var['dictionary']
    reverse_dictionary = all_var['reverse_dictionary']
else:
    data, count, dictionary, reverse_dictionary = tl.nlp.build_words_dataset(words, vocabulary_size, True, _UNK)

print('~~~~~~~~~~~~~5 most common words (+UNK)~~~~~~~~~~~~')
print(count[:5])
print('~~~~~~~~~~~~~~~~~~~~~~Sample data~~~~~~~~~~~~~~~~~')
print(data[:10], [reverse_dictionary[i] for i in data[:10]])

print('~~~~~~~~~~~~~Context of 2 surrounding words, one on each side~~~~~~~~~~~~')
batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, \
    batch_size=8, num_skips=2, skip_window=1, data_index=0)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

print('~~~~~~~~~~~~~Context of 4 surrounding words, two on each side~~~~~~~~~~~~')

batch, labels, data_index = tl.nlp.generate_skip_gram_batch(data=data, \
    batch_size=8, num_skips=4, skip_window=2, data_index=0)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
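A quick aside on the two parameters: skip_window is how many words are taken on each side of the centre word, and num_skips is how many (centre, context) pairs are drawn per centre word, so it can be at most 2 * skip_window. A toy call on made-up word ids (purely illustrative, not from the tutorial) shows the same mechanics without the real corpus:

toy_data = list(range(20))  # stand-in word ids 0..19
toy_batch, toy_labels, _ = tl.nlp.generate_skip_gram_batch(
    data=toy_data, batch_size=8, num_skips=2, skip_window=1, data_index=0)
# each centre id appears num_skips=2 times in toy_batch, paired in toy_labels
# with an id at most skip_window=1 positions away
print(list(zip(toy_batch, toy_labels[:, 0])))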

print('~~~~~~~~~~~~~~Build Skip-Gram model~~~~~~~~~~~~~')

valid_size = 16  
valid_window = 100  
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

emb_net = tl.layers.Word2vecEmbeddingInputlayer(
    inputs=train_inputs,
    train_labels=train_labels,
    vocabulary_size=vocabulary_size,
    embedding_size=embedding_size,
    num_sampled=num_sampled,    
    name='word2vec_layer')

cost = emb_net.nce_cost
train_params = emb_net.all_params
# train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost, var_list=train_params)
train_op = tf.train.AdagradOptimizer(learning_rate).minimize(cost, var_list=train_params)

normalized_embeddings = emb_net.normalized_embeddings
valid_embed = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
similarity = tf.matmul(valid_embed, normalized_embeddings, transpose_b=True)
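Because the rows of normalized_embeddings have unit length, the matmul above directly gives the cosine similarity between each validation word and every word in the vocabulary; this is what the nearest-neighbour printout during training relies on. A tiny NumPy check of that identity, with made-up vectors:

u, v = np.array([3.0, 4.0]), np.array([1.0, 0.0])
cos = np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))   # 0.6
u_unit, v_unit = u / np.linalg.norm(u), v / np.linalg.norm(v)
assert np.isclose(np.dot(u_unit, v_unit), cos)                 # dot of unit vectors == cosine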

print('~~~~~~~~~~~~~~~~~Start training~~~~~~~~~~~~~~~~')
sess = tf.InteractiveSession()
tl.layers.initialize_global_variables(sess)
if resume:
    print("Load existing model" + "!" * 10)
    # Load from ckpt or npz file
    # saver = tf.train.Saver()
    # saver.restore(sess, model_file_name+'.ckpt')
    tl.files.load_and_assign_npz_dict(name=model_file_name + '.npz', sess=sess)

emb_net.print_params()
emb_net.print_layers()

# save vocabulary to txt
tl.nlp.save_vocab(count, name='vocab_text8.txt')

average_loss = 0
step = 0
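My log cuts off at this point. For completeness, here is a sketch of the remaining training loop, reconstructed from the structure of the course source linked above (the print and save frequencies are my own choices):

print_freq = 2000
top_k = 8  # how many nearest neighbours to print for each validation word
while step < num_steps:
    start_time = time.time()
    batch_inputs, batch_labels, data_index = tl.nlp.generate_skip_gram_batch(
        data=data, batch_size=batch_size, num_skips=num_skips,
        skip_window=skip_window, data_index=data_index)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
    _, loss_val = sess.run([train_op, cost], feed_dict=feed_dict)
    average_loss += loss_val

    if step % print_freq == 0:
        if step > 0:
            average_loss /= print_freq
        print("Average loss at step %d/%d: %f  (%.2fs)" % (step, num_steps, average_loss, time.time() - start_time))
        average_loss = 0

    if step % (print_freq * 10) == 0:
        # print the nearest words to each validation word
        sim = similarity.eval()
        for i in xrange(valid_size):
            valid_word = reverse_dictionary[valid_examples[i]]
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # skip the word itself
            print("Nearest to %s: %s" % (valid_word, ", ".join(reverse_dictionary[k] for k in nearest)))

    if step % (print_freq * 20) == 0 and step != 0:
        # save the model parameters plus the dictionaries needed to resume
        print("Save model, data and dictionaries" + "!" * 10)
        tl.files.save_npz_dict(emb_net.all_params, name=model_file_name + '.npz', sess=sess)
        tl.files.save_any_to_npy(
            save_dict={'data': data, 'count': count, 'dictionary': dictionary,
                       'reverse_dictionary': reverse_dictionary},
            name=model_file_name + '.npy')
    step += 1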