1. skip-gram
Skip-gram uses the center word to predict its context words.
Each word is mapped to an embedding; inputs has shape [batch_size] and labels has shape [batch_size, context_len] (in the tutorial below, context_len is 1).
nce_loss performs negative sampling and the parameters are updated with stochastic gradient descent; negative samples are drawn with a probability that grows with word frequency (word2vec uses the unigram distribution raised to the 3/4 power).
One positive example is combined with several sampled negative examples, and a logistic-regression-style loss is computed over them (see the sketch below).
For details, see https://www.cnblogs.com/pinard/p/7249903.html
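A minimal numpy sketch of this idea, assuming a word2vec-style 3/4-power unigram sampler (the function names here are illustrative and not part of the tutorial code below): one positive (center, context) pair plus k sampled negatives are scored with dot products and pushed through a logistic loss.

import numpy as np

def sample_negatives(word_counts, k):
    # word_counts: numpy array of per-word counts; higher-frequency words are drawn
    # more often (word2vec raises the counts to the 3/4 power before normalizing)
    probs = word_counts ** 0.75
    probs = probs / probs.sum()
    return np.random.choice(len(word_counts), size=k, p=probs)

def negative_sampling_loss(center_vec, context_vec, negative_vecs):
    # center_vec, context_vec: [dim]; negative_vecs: [k, dim]
    sigmoid = lambda x: 1.0 / (1.0 + np.exp(-x))
    pos_loss = -np.log(sigmoid(center_vec @ context_vec))               # true pair should score high
    neg_loss = -np.sum(np.log(sigmoid(-(negative_vecs @ center_vec))))  # sampled negatives should score low
    return pos_loss + neg_loss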
2. cbow
CBOW uses the context words to predict the center word (a minimal sketch follows).
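A minimal sketch of the CBOW forward pass (illustrative names, separate from the skip-gram code below): the context-word embeddings are averaged, and the average is scored against every vocabulary word to predict the center word.

import numpy as np

def cbow_logits(embeddings, output_weights, context_ids):
    # embeddings, output_weights: [voc_size, dim]; context_ids: ids of the surrounding words
    context_mean = embeddings[context_ids].mean(axis=0)  # average the context embeddings -> [dim]
    return output_weights @ context_mean                 # [voc_size] scores for the center word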
A simple skip-gram tutorial in TensorFlow (1.x) code:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
sentences = ['i like dog','i like cat',"i like animal",
"dog cat animal", "apple cat dog like", "dog fish milk like",
"dog cat eyes like", "i like apple", "apple i hate",
"apple i movie book music like", "cat dog hate", "cat dog like"]
word_sequence = ' '.join(sentences).split()               # flat token stream
word_list = list(set(word_sequence))                      # vocabulary
print(word_list)
word_dict = dict(zip(word_list, range(len(word_list))))   # word -> id
# word2vec Parameter
batch_size = 20
embedding_size = 2 # to show 2 dim embedding graph
num_sampled = 10 # number of negative samples; must not exceed voc_size
voc_size = len(word_list)
def random_batch(data, size):
    # sample `size` (target, context) pairs without replacement
    random_inputs = []
    random_labels = []
    random_index = np.random.choice(range(len(data)), size=size, replace=False)
    for i in random_index:
        random_inputs.append(data[i][0])    # target word id
        random_labels.append([data[i][1]])  # context word id, wrapped so labels has shape [batch_size, 1]
    return random_inputs, random_labels
# build skip-gram (target, context) pairs for a window size of one
skip_grams = []
for i in range(1, len(word_sequence) - 1):
    target = word_dict[word_sequence[i]]
    context = [word_dict[word_sequence[i - 1]], word_dict[word_sequence[i + 1]]]
    for w in context:
        skip_grams.append([target, w])  # one pair per neighboring word
print('skip_grams = ', skip_grams)
# Model
inputs = tf.placeholder(dtype=tf.int64,shape=[batch_size],name='inputs')
labels = tf.placeholder(dtype=tf.int64,shape=[batch_size,1],name='labels') # tf.nn.nce_loss expects labels of shape [batch_size, 1]
embeddings = tf.get_variable(name='embeddings',shape=[voc_size,embedding_size],initializer=tf.random_uniform_initializer(minval=-1.,maxval=1.))
select_embedding = tf.nn.embedding_lookup(params=embeddings,ids=inputs)
nce_weights = tf.get_variable(name='nce_weights',shape=[voc_size,embedding_size],initializer=tf.random_uniform_initializer(minval=-1.,maxval=1.))
nce_biases = tf.get_variable(name='nce_biases',shape=[voc_size],initializer=tf.zeros_initializer())
# loss and optimizer
loss = tf.reduce_mean(tf.nn.nce_loss(nce_weights,nce_biases,labels=labels,inputs=select_embedding,num_sampled=num_sampled,num_classes=voc_size))
optimizer = tf.train.AdamOptimizer(1e-3).minimize(loss)
# training
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(5000):
        batch_inputs, batch_labels = random_batch(skip_grams, batch_size)
        _, loss_val = sess.run([optimizer, loss], feed_dict={inputs: batch_inputs, labels: batch_labels})
        if (epoch + 1) % 1000 == 0:
            print('Epoch : ', '%04d' % (epoch + 1), 'cost = ', '{:.6f}'.format(loss_val))
    trained_embeddings = embeddings.eval()  # pull the embedding matrix out as a numpy array
for i, label in enumerate(word_list):
    x, y = trained_embeddings[i]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()
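As a quick usage example (a sketch reusing trained_embeddings, word_list, and word_dict from the code above), nearest neighbours by cosine similarity:

def most_similar(query, topn=3):
    # normalize rows, then rank all words by cosine similarity to the query word
    normed = trained_embeddings / np.linalg.norm(trained_embeddings, axis=1, keepdims=True)
    sims = normed @ normed[word_dict[query]]
    order = np.argsort(-sims)
    return [word_list[i] for i in order if word_list[i] != query][:topn]

print(most_similar('dog'))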