# Word2vec training in TensorFlow
import tensorflow as tf
import numpy as np
import math
import collections
import pickle as pkl
from pprint import pprint
import re
import jieba
import os.path as path
import os
class word2vec():
    """Skip-gram word2vec model trained with NCE loss (TensorFlow 0.x API).

    Either builds a fresh model from ``vocab_list``, or — when ``model_path``
    is given — restores a previously saved model (pickled parameters plus the
    TF checkpoint under ``<model_path>/tf_vars``).
    """

    def __init__(self,
                 vocab_list=None,       # vocabulary words; required unless model_path is given
                 embedding_size=200,    # dimensionality of the learned embeddings
                 win_len=3,             # context window: win_len words on each side
                 num_sampled=1000,      # negative samples drawn per batch for NCE loss
                 learning_rate=1.0,     # SGD learning rate
                 logdir='/tmp/simple_word2vec',  # directory for TF summaries
                 model_path=None        # if set, restore a saved model instead of building anew
                 ):
        # Batch size stays dynamic: placeholders are built with shape=[None].
        self.batch_size = None

        if model_path is not None:
            # Restore hyper-parameters and bookkeeping saved by the model.
            self.load_model(model_path)
        else:
            assert isinstance(vocab_list, list)
            self.vocab_list = vocab_list
            self.vocab_size = len(vocab_list)
            self.embedding_size = embedding_size
            self.win_len = win_len
            self.num_sampled = num_sampled
            self.learning_rate = learning_rate
            self.logdir = logdir

            # word -> integer id lookup table
            self.word2id = {word: i for i, word in enumerate(self.vocab_list)}

            # Training statistics.
            self.train_words_num = 0    # total words seen so far
            self.train_sents_num = 0    # total sentences seen so far
            self.train_times_num = 0    # total training steps run

            # Rolling window over the 10 most recent batch losses.
            self.train_loss_records = collections.deque(maxlen=10)
            self.train_loss_k10 = 0

        self.build_graph()
        self.init_op()
        if model_path is not None:
            # Restore the TF variables checkpointed next to the pickled params.
            tf_model_path = os.path.join(model_path, 'tf_vars')
            self.saver.restore(self.sess, tf_model_path)
def init_op(self):
self.sess = tf.Session(graph=self.graph)
self.sess.run(self.init)
self.summary_writer = tf.train.SummaryWriter(self.logdir, self.sess.graph)
def build_graph(self):
self.graph = tf.Graph()
with self.graph.as_default():
self.train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
self.train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])
self.embedding_dict = tf.Variable(
tf.random_uniform([self.vocab_size,self.embedding_size],-1.0,1.0)
)
self.nce_weight = tf.Variable(tf.truncated_normal([self.vocab_size, self.embedding_size],
stddev=1.0/math.sqrt(self.embedding_size)))
self.nce_biases = tf.Variable(tf.zeros([self.vocab_size]))
embed = tf.nn.embedding_lookup(self.embedding_dict, self.train_inputs)
self.loss = tf.reduce_mean(
tf.nn.nce_loss(
weights = self.nce_weight,
biases = self.nce_biases,
labels = self.train_labels,
inputs = embed,
num_sampled = self.num_sampled,
num_classes = self.vocab_size
)
)
tf.scalar_summary('loss',self.loss)
self.train_op = tf.train.GradientDescentOptimizer(learning_rate=0.1).minimize(self.loss)
self.test_word_id = tf.placeholder(tf.int32,shape