#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
1-2. Word2Vec (Skip-gram) - Embedding Words and Plotting the Result
Paper: Distributed Representations of Words and Phrases and their Compositionality (2013)
'''
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np

tf.reset_default_graph()
# Toy corpus of short sentences
sentences = [ "i like dog", "i like cat", "i like animal",
"dog cat animal", "apple cat dog like", "dog fish milk like",
"dog cat eyes like", "i like apple", "apple i hate",
"apple i movie book music like", "cat dog hate", "cat dog like"]
word_sequence = " ".join(sentences).split()          # full token stream, in corpus order
word_list = list(set(word_sequence))                 # unique vocabulary
word_dict = {w: i for i, w in enumerate(word_list)}  # word -> index
print(word_dict)
# Word2Vec parameters
batch_size = 20            # number of (target, context) pairs per training step
embedding_size = 2         # 2-dimensional embeddings so they can be plotted directly
voc_size = len(word_list)  # vocabulary size
def random_batch(data, size):  # data is a list of [target, context] index pairs; size = batch_size
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(data)), size, replace=False)
    for i in random_index:
        random_inputs.append(np.eye(voc_size)[data[i][0]])  # one-hot target word
        random_labels.append(np.eye(voc_size)[data[i][1]])  # one-hot context word
    return random_inputs, random_labels
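# A quick note on the one-hot encoding used above: np.eye(voc_size)[k] is the
# k-th row of the identity matrix, i.e. the one-hot vector for vocabulary
# index k. Illustrative example (assumed, small vocabulary of 4):
#   np.eye(4)[2]  # -> array([0., 0., 1., 0.])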
# Build skip-gram pairs with a window size of 1
skip_grams = []
for i in range(1, len(word_sequence) - 1):
    target = word_dict[word_sequence[i]]
    context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
    for w in context:
        skip_grams.append([target, w])  # [target word, context word]
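# Sanity check, derived from the corpus above: the token stream begins
# "i like dog i like cat ...", so the first pairs generated should be
# (like, i), (like, dog), (dog, like), (dog, i) in index form.
assert skip_grams[0] == [word_dict['like'], word_dict['i']]
assert skip_grams[1] == [word_dict['like'], word_dict['dog']]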
# Model: project a one-hot input to the embedding space and back to vocab logits
inputs = tf.placeholder(tf.float32, shape=[None, voc_size])
labels = tf.placeholder(tf.float32, shape=[None, voc_size])
# W and WT are two independent weight matrices, not transposes of each other
W = tf.Variable(tf.random_uniform([voc_size, embedding_size], -1.0, 1.0))
WT = tf.Variable(tf.random_uniform([embedding_size, voc_size], -1.0, 1.0))
hidden_layer = tf.matmul(inputs, W)         # [batch_size, voc_size] x [voc_size, embedding_size] -> [batch_size, embedding_size]
output_layer = tf.matmul(hidden_layer, WT)  # [batch_size, embedding_size] x [embedding_size, voc_size] -> [batch_size, voc_size]
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output_layer, labels=labels))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cost)
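# Note: this script uses a full softmax over the vocabulary, whereas the 2013
# paper trains skip-gram with negative sampling. A minimal sketch of that
# variant using TF1's sampled NCE loss (hypothetical alternative, not wired
# into the training loop below; `ids`/`ctx` would replace the one-hot feeds):
#   ids = tf.placeholder(tf.int64, shape=[None])      # target word indices
#   ctx = tf.placeholder(tf.int64, shape=[None, 1])   # context word indices
#   nce_w = tf.Variable(tf.truncated_normal([voc_size, embedding_size]))
#   nce_b = tf.Variable(tf.zeros([voc_size]))
#   embed = tf.nn.embedding_lookup(W, ids)
#   nce_cost = tf.reduce_mean(tf.nn.nce_loss(weights=nce_w, biases=nce_b,
#                                            labels=ctx, inputs=embed,
#                                            num_sampled=2, num_classes=voc_size))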
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for epoch in range(5000):
        batch_inputs, batch_labels = random_batch(skip_grams, batch_size)
        _, loss = sess.run([optimizer, cost], feed_dict={inputs: batch_inputs, labels: batch_labels})
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
    trained_embeddings = W.eval()  # fetch the learned [voc_size, embedding_size] matrix
print(trained_embeddings)
# Plot each word at its learned 2-D embedding
for i, label in enumerate(word_list):
    x, y = trained_embeddings[i]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
plt.show()
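# Follow-up sketch (not part of the original script): rank words by cosine
# similarity in the learned embedding space. `nearest` is a hypothetical
# helper added here purely for illustration.
def nearest(query, k=3):
    norms = np.linalg.norm(trained_embeddings, axis=1, keepdims=True)
    vecs = trained_embeddings / norms        # unit-normalise each embedding
    sims = vecs.dot(vecs[word_dict[query]])  # cosine similarity to the query
    order = np.argsort(-sims)                # most similar first
    return [word_list[i] for i in order if word_list[i] != query][:k]

print(nearest('dog'))  # expect words that co-occur with 'dog' in the toy corpus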