# TensorFlow implementation of word2vec, with explanatory comments.
# Reference 1: https://www.jianshu.com/p/556d735a7f97
# Reference 2: http://blog.csdn.net/wangyangzhizhou/article/details/77530479
# Reference 3: http://blog.csdn.net/NNNNNNNNNNNNY/article/details/70177509
# Reference 4: https://www.jianshu.com/p/fab82fa53e16
# ......
# -*- coding: utf-8 -*-
import collections
import math
import random
import codecs
import numpy as np
import tensorflow as tf
import jieba
def read_data(filename):
    """Read a UTF-8 text file and segment it into tokens with jieba.

    Args:
        filename: Path of the corpus file to read.

    Returns:
        A list of ``str`` tokens, in corpus order, as yielded by
        ``jieba.cut(data, cut_all=False)``. An empty file yields ``[]``.
    """
    with codecs.open(filename, 'r', encoding='utf-8') as f:
        data = f.read()
    # Collect the tokens directly. Joining them with "/" and splitting the
    # result again would turn any token that is itself "/" into empty
    # strings and would report a single '' token for an empty file.
    return [tf.compat.as_str(token) for token in jieba.cut(data, cut_all=False)]
def build_dataset(words, n_words):
    """Encode a token sequence as integer ids over a frequency-ranked vocabulary.

    Args:
        words: Sequence of tokens making up the corpus.
        n_words: Vocabulary size to keep, counting the 'UNK' placeholder.

    Returns:
        A tuple ``(data, count, dictionary, reversed_dictionary)``:
        data -- list with one id per token of ``words``; tokens missing from
            the vocabulary are encoded as 0.
        count -- list starting with ``['UNK', <number of tokens encoded as
            missing>]`` followed by the ``(token, frequency)`` pairs of the
            ``n_words - 1`` most common tokens.
        dictionary -- mapping token -> id.
        reversed_dictionary -- mapping id -> token.
    """
    # Slot 0 is the placeholder; its counter is patched in once it is known.
    count = [['UNK', -1]]
    count += collections.Counter(words).most_common(n_words - 1)

    # Hand out ids in frequency order: the next id is the current dict size.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        index = dictionary.get(token)
        if index is None:
            # Token fell outside the kept vocabulary.
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    # Invert token -> id into id -> token.
    reversed_dictionary = {idx: token for token, idx in dictionary.items()}
    return data, count, dictionary, reversed_dictionary
def generate_batch(batch_size, num_skips, skip_window):
    """Build one skip-gram training batch from the module-level ``data``.

    A window of ``2 * skip_window + 1`` ids slides over ``data`` starting at
    the module-level cursor ``data_index``. For every window position the
    centre id is paired with ``num_skips`` distinct, randomly chosen ids from
    the same window. The cursor is updated so that the next call continues
    where this one stopped, wrapping around at the end of ``data``.

    Args:
        batch_size: Number of (centre, context) pairs to produce; must be a
            multiple of ``num_skips``.
        num_skips: Pairs generated per window position; at most
            ``2 * skip_window``.
        skip_window: Number of ids taken on each side of the centre.

    Returns:
        ``(batch, labels)``: ``batch`` is an int32 array of shape
        ``(batch_size,)`` holding centre ids, ``labels`` an int32 array of
        shape ``(batch_size, 1)`` holding the matching context ids.

    Raises:
        AssertionError: If the size constraints above are violated.
        ValueError: If ``data`` is shorter than a single window.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    # Full window: skip_window ids on the left, the centre, skip_window right.
    span = 2 * skip_window + 1
    if len(data) < span:
        raise ValueError(
            'data holds %d ids but a window needs %d' % (len(data), span))
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # Bounded deque: appending on the right drops the oldest id on the left.
    buffer = collections.deque(maxlen=span)
    if data_index + span > len(data):
        data_index = 0
    buffer.extend(data[data_index:data_index + span])
    data_index += span
    # Every window slot except the centre is a candidate context position.
    context_positions = [pos for pos in range(span) if pos != skip_window]
    for i in range(batch_size // num_skips):
        chosen = random.sample(context_positions, num_skips)
        for j, target in enumerate(chosen):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        if data_index == len(data):
            # Wrap around. Refill the deque in place so it stays bounded;
            # rebinding it to a plain list would stop the window from
            # sliding and repeat the same centre id for the rest of the batch.
            buffer.extend(data[:span])
            data_index = span
        else:
            buffer.append(data[data_index])
            data_index += 1
    # Step back by one window so the ids at the end of this batch are not
    # skipped as centres by the next call.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    """Scatter-plot 2-D points with a text label each and save the figure.

    Args:
        low_dim_embs: Array whose row ``i`` unpacks into the ``(x, y)``
            position of ``labels[i]``; needs at least ``len(labels)`` rows.
        labels: Texts to annotate the points with, in row order.
        filename: Path the figure is written to.

    Raises:
        AssertionError: If there are more labels than embedding rows.
    """
    # Imported locally: the module-level ``plt`` is only bound inside the
    # ``__main__`` section, so relying on it raises NameError when this
    # function is called from an importing module.
    import matplotlib.pyplot as plt

    assert low_dim_embs.shape[0] >= len(labels), 'More labels than embeddings'
    plt.figure(figsize=(18, 18))  # in inches
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        # xytext is the offset (in points) of the text from the marker.
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)
if __name__ == '__main__':
    # Script entry point: read and encode the corpus, train skip-gram
    # embeddings with an NCE loss, periodically print nearest neighbours of
    # a few frequent words, then plot a t-SNE projection of the result.

    # sample.txt holds the training corpus.
    filename = "sample.txt"
    vocabulary = read_data(filename)
    vocabulary_size = 50000
    data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
    # The token list is not used again once it has been encoded into `data`.
    del vocabulary
    # Cursor into `data`, read and advanced by generate_batch.
    data_index = 0

    # Quick manual check of generate_batch (left disabled):
    # batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
    # for i in range(8):
    #     print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

    # ---- Hyper-parameters ----
    batch_size = 128
    embedding_size = 128  # dimensionality of each word vector
    skip_window = 1  # ids taken on each side of the centre word
    num_skips = 2  # (centre, context) pairs generated per window
    # Validation words are drawn from the lowest (most frequent) ids. Clamp
    # both sizes to the number of known words so that a small corpus does
    # not produce ids missing from reverse_dictionary (KeyError below).
    known_words = len(reverse_dictionary)
    valid_window = min(100, known_words)  # only pick from the head of the distribution
    valid_size = min(16, valid_window)  # number of words to evaluate similarity on
    valid_examples = np.random.choice(valid_window, valid_size, replace=False)
    num_sampled = 64  # number of negative examples to sample

    # ---- Graph definition ----
    graph = tf.Graph()
    with graph.as_default():
        # Placeholders are fed with one batch per training step.
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        # Pin the following ops to CPU #0.
        with tf.device('/cpu:0'):
            # Embedding matrix: vocabulary_size x embedding_size.
            embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            # Rows of the embedding matrix selected by the input ids.
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)
            # Parameters of the NCE output layer.
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
        # Loss: mean NCE loss over the batch.
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_sampled,
                           num_classes=vocabulary_size))
        # Plain SGD with learning rate 1.0.
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
        # L2 norm of every row: square, sum along axis 1, square root.
        # NOTE(review): newer TensorFlow 1.x releases spell this keyword
        # `keepdims` -- confirm against the installed version.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        # Normalised vectors of the validation words.
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        # Dot products of unit vectors: one row per validation word, one
        # column per vocabulary row.
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
        # Add variable initializer.
        init = tf.global_variables_initializer()
    num_steps = 40001

    # ---- Training ----
    with tf.Session(graph=graph) as session:
        # Initialise all variables.
        init.run()
        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
            feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
            # One optimisation step; also fetch the loss of this batch.
            _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
            average_loss += loss_val
            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ': ', average_loss)
                average_loss = 0
            # Note that this is expensive (~20% slowdown if computed every 500 steps)
            if step % 10000 == 0:
                # Shape: [valid_size, vocabulary_size].
                sim = similarity.eval()
                for i in range(valid_size):
                    valid_word = reverse_dictionary[valid_examples[i]]
                    top_k = 8
                    # Ids sorted by decreasing similarity; the first entry
                    # is skipped (normally the word itself).
                    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                    log_str = 'Nearest to %s:' % valid_word
                    for neighbour in nearest:
                        # Embedding rows beyond the known vocabulary have no word.
                        if neighbour < len(reverse_dictionary):
                            close_word = reverse_dictionary[neighbour]
                            log_str = '%s %s,' % (log_str, close_word)
                    print(log_str)
        final_embeddings = normalized_embeddings.eval()

    # ---- Visualisation ----
    try:
        from sklearn.manifold import TSNE
        import matplotlib.pyplot as plt
        # Select the SimHei font so Chinese labels can be rendered.
        plt.rcParams['font.sans-serif'] = ['SimHei']
        plt.rcParams['axes.unicode_minus'] = False
        # Project the embeddings down to two dimensions for plotting.
        tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
        # Plot at most 300 words, and never more than the dictionary holds
        # (a larger fixed count raises KeyError on a small corpus).
        plot_only = min(300, len(reverse_dictionary))
        low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
        labels = [reverse_dictionary[i] for i in range(plot_only)]
        plot_with_labels(low_dim_embs, labels)
    except ImportError:
        print('Please install sklearn,matplotlib,and scipy to show embeddings.')