Word2Vec Model
Implementing a simple Word2Vec (skip-gram) model with TensorFlow 2.
import collections
import os
import random
import zipfile
import urllib.request
import numpy as np
import tensorflow as tf
# Training parameters
learning_rate = 0.1
batch_size = 128
num_steps = 10000
display_step = 1000
eval_step = 2000
# Evaluation parameters
eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']
# Word2Vec parameters
embedding_size = 200         # dimension of each word vector
max_vocabulary_size = 50000  # maximum number of distinct words kept
min_occurrence = 10          # discard words that occur fewer times than this
skip_window = 3              # how many words of context to consider on each side of the center word
num_skips = 2                # how many (center, context) pairs to draw per window
num_sampled = 64             # number of negative classes sampled by the NCE loss
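As a quick illustration of what skip_window means (a toy example, not part of the training script): with a window of 1, each center word is paired with the word immediately to its left and to its right, and num_skips of those pairs are sampled per window during training.

# Toy example: enumerate all (center, context) skip-gram pairs for a short sentence
sentence = "the quick brown fox".split()
toy_window = 1  # one word of context on each side (the real script uses skip_window = 3)
for pos, center in enumerate(sentence):
    for ctx in range(max(0, pos - toy_window), min(len(sentence), pos + toy_window + 1)):
        if ctx != pos:
            print((center, sentence[ctx]))
# ('the', 'quick'), ('quick', 'the'), ('quick', 'brown'), ('brown', 'quick'),
# ('brown', 'fox'), ('fox', 'brown')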
# Download the Wikipedia (text8) dataset if it is not already present, and use part of it
url = 'http://mattmahoney.net/dc/text8.zip'
data_path = 'data/text8.zip'
if not os.path.exists(data_path):
    print("Downloading the dataset... (It may take some time)")
    os.makedirs(os.path.dirname(data_path), exist_ok=True)
    filename, _ = urllib.request.urlretrieve(url, data_path)
    print("Done!")
with zipfile.ZipFile(data_path) as f:
    text_words = f.read(f.namelist()[0]).decode('utf-8').lower().split()
class WikiData:
    def __init__(self):
        self.raw_text, self.word2id, self.data = self.data_process()
        self.id2word = dict(zip(self.word2id.values(), self.word2id.keys()))
        self.vocab_size = len(self.word2id)
        self.data_idx = 0

    def data_process(self):
        url = 'http://mattmahoney.net/dc/text8.zip'
        data_path = 'data/text8.zip'
        if not os.path.exists(data_path):
            print("Downloading the dataset... (It may take some time)")
            os.makedirs(os.path.dirname(data_path), exist_ok=True)
            filename, _ = urllib.request.urlretrieve(url, data_path)
            print("Done!")
        with zipfile.ZipFile(data_path, 'r') as f:
            text_words = f.read(f.namelist()[0]).decode('utf-8').lower().split()
        # Build the vocabulary: keep the most frequent words,
        # reserving index 0 for the 'UNK' (unknown / rare word) token
        count = [('UNK', -1)]
        count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))
        # Remove words that occur fewer than min_occurrence times
        for i in range(len(count) - 1, -1, -1):
            if count[i][1] < min_occurrence:
                count.pop(i)
            else:
                break  # count is sorted by descending frequency, so we can stop here
        # Word -> index dictionary
        word2id = dict()
        for i, (word, _) in enumerate(count):
            word2id[word] = i
        # Encode the corpus as word ids and count how many words map to 'UNK'
        data = []  # the text stored as word ids
        unk_count = 0
        for word in text_words:
            idx = word2id.get(word, 0)
            if idx == 0:
                unk_count += 1
            data.append(idx)
        count[0] = ('UNK', unk_count)
        print("Words count: ", len(text_words))
        print("Unique words: ", len(set(text_words)))
        print("Vocabulary size: ", len(word2id))
        print("Most common words: ", count[:10])
        return text_words, word2id, data
    def get_batch(self, batch_size, num_skips, skip_window):
        assert batch_size % num_skips == 0
        assert num_skips <= 2 * skip_window
        batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)  # column vector
        span = 2 * skip_window + 1
        # Queue holding the center word and its context words. It has a fixed size,
        # so appending a new element automatically drops the oldest one,
        # which simulates a sliding window over the corpus.
        buffer = collections.deque(maxlen=span)
        if self.data_idx + span > len(self.data):
            self.data_idx = 0
        buffer.extend(self.data[self.data_idx: self.data_idx + span])
        # Index of the next word to push into the queue, i.e. the window slides one step forward
        self.data_idx += span
        # Each iteration produces num_skips training examples
        for i in range(batch_size // num_skips):
            context_words = [w for w in range(span) if w != skip_window]  # context positions
            word_to_use = random.sample(context_words, num_skips)  # sample num_skips of them
            # Read the words out of the window queue
            for j, word in enumerate(word_to_use):
                batch[i * num_skips + j] = buffer[skip_window]  # center word
                labels[i * num_skips + j, 0] = buffer[word]     # word to predict
            # If the tail of the sliding window reached the end of the data, restart from the beginning
            if self.data_idx == len(self.data):
                buffer.extend(self.data[0:span])
                self.data_idx = span
            else:
                buffer.append(self.data[self.data_idx])
                self.data_idx += 1
        # self.data_idx now points at the tail of the window, but the next call to this
        # function treats it as the head of the window, so step back span positions.
        # Subtracting directly could go below 0, so add len(self.data) and take the modulo.
        self.data_idx = (self.data_idx - span + len(self.data)) % len(self.data)
        return batch, labels
dataset = WikiData()
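To see what the skip-gram sampling actually produces, the following optional sanity check (not part of the original script) draws one small batch and decodes the ids back to words; dataset.data_idx is reset afterwards so training still starts from the beginning of the corpus.

# Optional sanity check: inspect a few (center, context) pairs produced by get_batch
demo_batch, demo_labels = dataset.get_batch(8, num_skips, skip_window)
for center_id, context_id in zip(demo_batch, demo_labels[:, 0]):
    print("center: %-12s context: %s" % (dataset.id2word[center_id], dataset.id2word[context_id]))
dataset.data_idx = 0  # rewind so training starts from the beginning of the corpus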
# Model variables: the embedding matrix plus the NCE output weights and biases
with tf.device('/cpu:0'):
    embedding = tf.Variable(tf.random.normal([dataset.vocab_size, embedding_size]))
    nce_weights = tf.Variable(tf.random.normal([dataset.vocab_size, embedding_size]))
    nce_biases = tf.Variable(tf.zeros([dataset.vocab_size]))

def get_embedding(x):
    # Look up the embedding vectors for a batch of word ids
    with tf.device('/cpu:0'):
        x_embed = tf.nn.embedding_lookup(embedding, x)
        return x_embed
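tf.nn.embedding_lookup simply gathers rows of the embedding matrix: a batch of word ids of shape (batch_size,) comes back as a tensor of shape (batch_size, embedding_size). A minimal illustration with hypothetical toy values, independent of the model above:

# Toy lookup table: 3 "words", 2-dimensional vectors
toy_table = tf.constant([[0.0, 0.1], [1.0, 1.1], [2.0, 2.1]])
print(tf.nn.embedding_lookup(toy_table, [2, 0]))  # rows 2 and 0 of toy_table -> shape (2, 2)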
def nce_loss(x_embed, y):
    # Noise-contrastive estimation loss: instead of a full softmax over the whole
    # vocabulary, each step only contrasts the true context word against
    # num_sampled randomly drawn negative words.
    with tf.device('/cpu:0'):
        y = tf.cast(y, tf.int64)
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=y,
                           inputs=x_embed,
                           num_sampled=num_sampled,
                           num_classes=dataset.vocab_size))
        return loss
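For intuition, the sampled objective behind tf.nn.nce_loss is close to the negative-sampling loss from the original word2vec papers (NCE additionally corrects for the noise distribution). For a center embedding $v_c$, true context word $o$ and $K=$ num_sampled sampled negatives $w_k$, it is roughly

$$\mathcal{L} \approx -\log\sigma\bigl(u_o^{\top} v_c + b_o\bigr) \;-\; \sum_{k=1}^{K} \log\sigma\bigl(-(u_{w_k}^{\top} v_c + b_{w_k})\bigr),$$

where the rows $u$ of nce_weights and the entries $b$ of nce_biases play the role of output-word parameters. This is what lets training scale to a 50,000-word vocabulary without computing a full softmax at every step.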
# Plain SGD optimizer (stateless, so it is safe to create it once up front)
optimizer = tf.optimizers.SGD(learning_rate)

def run_optimization(x, y):
    with tf.device('/cpu:0'):
        # Record the forward pass so gradients can be taken w.r.t. the variables
        with tf.GradientTape() as tape:
            x_embed = get_embedding(x)
            loss = nce_loss(x_embed, y)
        variables = [embedding, nce_weights, nce_biases]
        grads = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(grads, variables))
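The evaluation step below calls an evaluate function that is not defined anywhere in the listing. A minimal sketch of what it needs to do is given here: cosine similarity between each query embedding and every row of the embedding matrix (the exact name and normalization are an assumption, not the author's original code).

def evaluate(x_embed):
    # Assumed helper (missing from the listing): cosine similarity between each
    # query embedding and every word embedding in the vocabulary.
    with tf.device('/cpu:0'):
        x_embed = tf.cast(x_embed, tf.float32)
        x_embed_norm = x_embed / tf.sqrt(tf.reduce_sum(tf.square(x_embed), 1, keepdims=True))
        embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
        cosine_sim = tf.matmul(x_embed_norm, embedding_norm, transpose_b=True)
        return cosine_sim  # shape: (num_queries, vocab_size)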
# Word ids of the evaluation words
x_test = np.array([dataset.word2id[w] for w in eval_words])

# Training loop
for step in range(1, num_steps + 1):
    batch_x, batch_y = dataset.get_batch(batch_size, num_skips, skip_window)
    run_optimization(batch_x, batch_y)
    if step % display_step == 0 or step == 1:
        loss = nce_loss(get_embedding(batch_x), batch_y)
        print("step: %i, loss: %f" % (step, loss))
    # Evaluation: print the nearest neighbors of the evaluation words
    if step % eval_step == 0 or step == 1:
        print('Evaluation......')
        sim = evaluate(get_embedding(x_test)).numpy()
        for i in range(len(eval_words)):
            top_k = 8
            # Skip index 0 of the sorted list: the most similar word is the word itself
            # (similarity 1); argsort returns indices, here by descending similarity.
            nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            log_str = "'%s' nearest neighbors:" % eval_words[i]
            for k in range(top_k):
                log_str = "%s %s" % (log_str, dataset.id2word[nearest[k]])
            print(log_str)
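After training, the learned vectors live in the embedding variable. A small example of how one might save them and fetch a single word vector (the file name and the probe word are arbitrary choices, not from the original script):

# Persist the trained embedding matrix for later use
np.save('text8_embeddings.npy', embedding.numpy())
# Fetch the vector of a single word (one of the evaluation words, so it is in the vocabulary)
vec = get_embedding(np.array([dataset.word2id['american']])).numpy()[0]
print(vec.shape)  # (embedding_size,) == (200,)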