算法原理:
算法原理可以参考该链接
超参数
# --- Training hyperparameters ---
learning_rate = 0.1        # optimizer step size (presumably SGD — optimizer not visible here)
batch_size = 128           # examples per training step
num_steps = 3000000        # total number of training steps
display_step = 10000       # print training progress every this many steps
eval_step = 200000         # run the nearest-word evaluation every this many steps

# --- Evaluation setup ---
valid_size = 20            # number of words to evaluate similarity on
valid_window = 100         # sample evaluation words from the `valid_window` most frequent ids
# Random word ids used for validation: per the note below, evaluation finds
# the words most similar to each word in eval_words.
eval_words = np.random.choice(valid_window, valid_size, replace=False)

# --- Model / vocabulary hyperparameters ---
embedding_size = 200       # dimensionality of the word embedding vectors
max_vocabulary_size = 50000  # vocabulary cap, including the 'UNK' token
min_occurrence = 10        # words rarer than this are dropped from the vocabulary
skip_window = 3            # words considered left/right of the target (presumably the skip-gram context half-width — confirm against training code)
num_skips = 2              # NOTE(review): presumably how many (target, context) pairs are drawn per window — verify against the batch generator
num_sampled = 64           # NOTE(review): presumably the number of negative samples for sampled softmax/NCE — verify against the loss definition
解释:
- 我们模型的验证是:计算与eval_words数组中的词最近似的几个词
词典生成模块
def make_vocabulary(data, max_vocab_size=None, min_count=None):
    """Build the vocabulary and integer-encode the corpus.

    Args:
        data: a flat one-dimensional list of tokens — single characters, or
            words after segmentation.  It is produced by segmenting the
            corpus sentences and concatenating the results (no segmentation
            needed when working at the character level).
        max_vocab_size: cap on the vocabulary size, including 'UNK'.
            Defaults to the module-level ``max_vocabulary_size``.
        min_count: minimum frequency for a word to be kept.
            Defaults to the module-level ``min_occurrence``.

    Returns:
        (data_id, word2count, word2id) where
            data_id    -- ``data`` mapped to integer ids (0 means 'UNK'),
            word2count -- list of (word, count) pairs with 'UNK' first,
            word2id    -- dict mapping word -> id.
    """
    if max_vocab_size is None:
        max_vocab_size = max_vocabulary_size
    if min_count is None:
        min_count = min_occurrence
    # Reserve id 0 for the out-of-vocabulary token; its count is patched below.
    word2count = [('UNK', -1)]
    # BUG FIX: count whole tokens, not characters.  The original code used
    # Counter("".join(data)), which fuses the token list into one string and
    # counts single characters, so every multi-character word fell out of the
    # vocabulary and mapped to UNK.
    word2count.extend(collections.Counter(data).most_common(max_vocab_size - 1))
    # most_common() is sorted by descending count, so trim rare words from the
    # tail and stop at the first frequent-enough word.  The loop never reaches
    # index 0, keeping 'UNK' even when every real word is below the threshold
    # (the original could pop 'UNK' and crash on word2count[0] below).
    for i in range(len(word2count) - 1, 0, -1):
        if word2count[i][1] < min_count:
            word2count.pop(i)
        else:
            break
    word2id = {word: i for i, (word, _) in enumerate(word2count)}
    data_id = list()
    unk_count = 0
    for word in data:
        index = word2id.get(word, 0)  # out-of-vocabulary tokens map to UNK (id 0)
        if index == 0:
            unk_count += 1
        data_id.append(index)
    # Patch in the real UNK count now that the corpus has been scanned.
    word2count[0] = ('UNK', unk_count)
    # BUG FIX: the original function returned nothing, discarding all results.
    return data_id, word2count, word2id