A simple word2vec implementation

I've been studying word2vec recently and wrote a simple implementation based on my own understanding. The network has three layers: an input layer, a hidden layer (an embedding_lookup), and an output layer. The code below is written against TensorFlow 1.x:

import tensorflow as tf
import numpy as np

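# Toy corpus: the ten Chinese digit characters repeated over and over;
# each individual character is treated as one "word".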
contents = '一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十一二三四五六七八九十'
words = list(set(contents))          # the unique characters form the vocabulary
contentList = list(contents)
wordsMap = {word: i for i, word in enumerate(words)}   # word -> integer id

vocabulary_size = len(words)
embedding_size = 10
batch_size = len(contentList) - 1    # one training pair per adjacent pair of characters
print(wordsMap)
print('=========================')
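# Integer id buffers for the inputs and their next-word labels.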
x = np.zeros(batch_size, dtype=np.int32)
y = np.zeros(batch_size, dtype=np.int32)

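# Fill the buffers: input = character at position i, label = character at position i + 1.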
for i in range(len(contentList) - 1):
    x[i] = wordsMap[contentList[i]]
    y[i] = wordsMap[contentList[i+1]]

# Alternative: swap x and y to train the model to predict the previous word instead.
# for i in range(len(contentList) - 1):
#     y[i] = wordsMap[contentList[i]]
#     x[i] = wordsMap[contentList[i+1]]

print(x)
print(x.shape)
print(y)
print(y.shape)
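# Placeholders for a batch of input word ids and the ids of the words that follow them.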
train_inputs = tf.placeholder(tf.int32, shape=[None])
train_labels = tf.placeholder(tf.int32, shape=[None])

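# The embedding matrix (the input-to-hidden weights): one trainable
# 10-dimensional vector per vocabulary word, initialized uniformly in [-1, 1).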
embedDic = tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
# embedDic = tf.identity([[1, 0, 0], [0, 1, 0], [0, 0, 1]])  # e.g. fixed one-hot vectors for debugging (only valid when vocabulary_size = embedding_size = 3)
embeddings = tf.Variable(embedDic)
embed = tf.nn.embedding_lookup(embeddings, train_inputs)   # hidden layer: look up the input word vectors
outLevel = tf.layers.dense(embed, units=vocabulary_size, activation=None)   # output layer: logits over the vocabulary

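# Full softmax cross-entropy against the id of the next word. A real word2vec
# implementation replaces this with negative sampling or hierarchical softmax,
# but a full softmax is fine for a 10-word vocabulary.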
loss = tf.losses.sparse_softmax_cross_entropy(labels=train_labels, logits=outLevel)

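# No global step variable is ever created, so tf.train.get_global_step() below
# returns None and minimize() simply skips step counting.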
optimizer = tf.train.AdagradOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
print('==========================')


sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())


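# Full-batch training: every (current, next) pair is fed on each step. In this
# cyclic corpus each character has exactly one successor, so the loss can get
# close to zero.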
for i in range(100000):
    sess.run(train_op, feed_dict={train_inputs: x, train_labels: y})
    if i % 10000 == 0:
        print('step: %s, loss: %s' % (i, sess.run(loss, feed_dict={train_inputs: x, train_labels: y})))
# print(sess.run(embedDic))


def getNextWord(w):
    print('==================================================')
    print('Predicting the word that follows %s' % w)
    i1 = wordsMap[w]
    outV = sess.run(outLevel, feed_dict={train_inputs: [i1]})
    print(outV)
    # Use np.argmax instead of building a new tf.argmax op on every call,
    # which would keep growing the graph.
    i2 = np.argmax(outV, axis=1)[0]
    print('word index: %s' % i2)
    print('next word: %s' % words[i2])

getNextWord('一')
getNextWord('二')
getNextWord('三')
getNextWord('四')
getNextWord('五')
getNextWord('六')
getNextWord('七')
getNextWord('八')
getNextWord('九')
getNextWord('十')

def distance(w1, w2):
    print('===================================================')
    print('Computing the distance between word1: %s and word2: %s' % (w1, w2))
    i1 = wordsMap[w1]
    i2 = wordsMap[w2]
    # Fetch the trained embedding matrix once and compute in NumPy, instead of
    # adding new lookup ops to the graph on every call.
    emb = sess.run(embeddings)
    v1 = emb[i1]
    # print('embedding of %s: %s' % (w1, v1))
    v2 = emb[i2]
    # print('embedding of %s: %s' % (w2, v2))
    dis = np.sqrt(np.sum(np.square(v1 - v2)))
    print('Euclidean distance between word1: %s and word2: %s: %s' % (w1, w2, dis))
distance('五', '一')
distance('五', '二')
distance('一', '二')
distance('一', '三')
distance('一', '四')
distance('一', '五')
distance('一', '六')
distance('一', '七')
distance('一', '八')
distance('一', '九')
distance('一', '十')




def showDis():
    print('=====================================================')
    emb = sess.run(embeddings)   # fetch the trained embedding matrix once
    for word in list('一二三四五六七八九十'):
        idx = wordsMap[word]     # avoid shadowing the built-in id()
        print('word: %s, vector: %s' % (word, emb[idx]))

showDis()
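
Euclidean distance works here, but cosine similarity is the more common way to compare word vectors. Below is a minimal sketch of a nearest-neighbour lookup that reuses the session and embeddings trained above; the helper name mostSimilar is my own, not part of the original code:

def mostSimilar(w, topN=3):
    # Hypothetical helper: return the topN words closest to w by cosine similarity.
    emb = sess.run(embeddings)
    # Normalize every embedding to unit length so dot products become cosine similarities.
    norm = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    sims = norm @ norm[wordsMap[w]]
    # Sort by similarity, highest first; position 0 is the query word itself.
    order = np.argsort(-sims)
    return [(words[i], sims[i]) for i in order[:topN + 1] if words[i] != w][:topN]

print(mostSimilar('一'))

Because the normalized dot product ignores vector length, this ranking can differ from the Euclidean distances printed by distance() above, which is worth keeping in mind when eyeballing the results.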