Implement the Word2Vec algorithm with TensorFlow v2.0 to compute vector representations of words. This example is trained on a small chunk of Wikipedia articles.
For more information, see the paper: Mikolov, Tomas et al. "Efficient Estimation of Word Representations in Vector Space.", 2013.
from __future__ import division, print_function, absolute_import
import collections
import os
import random
import urllib.request
import zipfile
import numpy as np
import tensorflow as tf
# Training parameters
learning_rate = 0.1
batch_size = 128
num_steps = 3000000
display_step = 10000
eval_step = 200000
# Evaluation parameters
eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']
# Word2Vec parameters
embedding_size = 200 # Dimension of the embedding vector
max_vocabulary_size = 50000 # Total number of distinct words in the vocabulary
min_occurrence = 10 # Remove all words that appear fewer than n times
skip_window = 3 # How many words to consider on the left and right
num_skips = 2 # How many times to reuse an input to generate a label
num_sampled = 64 # Number of negative examples to sample
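To make the meaning of skip_window and num_skips concrete, here is a minimal sketch (not part of the original tutorial) that enumerates (input, context) skip-gram pairs for a toy sequence of word ids; the helper name toy_skip_gram_pairs and the toy sequence are purely illustrative, and the tutorial's own batch generator builds pairs from the data list constructed below.

import random

def toy_skip_gram_pairs(sequence, skip_window=3, num_skips=2):
    # For every center position, sample 'num_skips' context words from the
    # 'skip_window' words on each side (illustration only, not the tutorial's batcher).
    pairs = []
    for center in range(skip_window, len(sequence) - skip_window):
        window = list(range(center - skip_window, center + skip_window + 1))
        window.remove(center)
        for ctx in random.sample(window, num_skips):
            pairs.append((sequence[center], sequence[ctx]))
    return pairs

# Toy "sentence" of word ids 0..9; each pair is (input word, context label).
print(toy_skip_gram_pairs(list(range(10))))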
# Download a small chunk of the Wikipedia articles collection
url = 'http://mattmahoney.net/dc/text8.zip'
data_path = 'text8.zip'
if not os.path.exists(data_path):
    print("Downloading the dataset... (It may take some time)")
    filename, _ = urllib.request.urlretrieve(url, data_path)
    print("Done!")
# Unzip the dataset file; the text has already been preprocessed
with zipfile.ZipFile(data_path) as f:
    text_words = f.read(f.namelist()[0]).lower().split()
# Build the dictionary and replace rare words with the UNK token
count = [('UNK', -1)]
# Retrieve the most common words
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))
# Remove samples that occur fewer than 'min_occurrence' times
for i in range(len(count) - 1, -1, -1):
    if count[i][1] < min_occurrence:
        count.pop(i)
    else:
        # The collection is ordered by frequency, so stop at the first word
        # whose count is at least 'min_occurrence'
        break
# Compute the vocabulary size
vocabulary_size = len(count)
# Assign an id to each word
word2id = dict()
for i, (word, _) in enumerate(count):
    word2id[word] = i
data = list()
unk_count = 0
for word in text_words:
    # Retrieve a word id, or assign it index 0 ('UNK') if it is not in the dictionary
    index = word2id.get(word, 0)
    if index == 0:
        unk_count += 1
    data.append(index)
count[0] = ('UNK', unk_count)
# Build the reverse mapping from ids back to words
id2word = dict(zip(word2id.values(), word2id.keys()))
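As a quick sanity check (not part of the original code), the corpus statistics and the two mappings can be inspected as follows; the exact numbers depend on the downloaded text8 corpus, and the lookup uses a bytes key because the zip content is read without decoding.

print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])
# Round-trip one token through both mappings; 'the' should be in any
# reasonably sized English corpus, and we fall back to UNK (id 0) otherwise.
sample_id = word2id.get(b'the', 0)
print(sample_id, id2word[sample_id])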