word2vec有两种模型,一个是CBOW模型,另一个是Skip-gram模型。那么这里主要用tensorflow实现Skip-gram模型。
Skip-gram模型是在已知中心词(目标词)的前提下预测其上下文。Skip-gram模型只用到了一个隐藏层和一个输出层,输入及输出标注均为初始化的词向量。
那么原理如下:
1、设定固定的窗口大小来遍历训练文本,构建(中心词,临近词)的关系,使用网络模型来拟合两者的关系。
2、输入为中心词初始词向量,输出标注为临近词的初始词向量,通过网络模型利用输入与输出两组词向量来构建损失函数,隐藏层不使用激活函数,输出层结果为概率分数,使用softmax归一化。
3、最终产生的词向量是训练得到的隐藏层参数,不使用最后的输出。隐藏层节点个数即最终词向量空间的维度。
这篇博客讲解得很清楚,可以参考 https://blog.csdn.net/rlnlo2pnefx9c/article/details/78747970 ,接下来主要是实现这篇博客里面的思想。
那么接下来将要使用tensorflow实现Skip-gram模型,主要使用了python深度学习这本书中的代码。
下面函数代码的功能是读取text9压缩文件的内容,然后把其保存在words中,并显示了前面10个单词,运行结果如下图。
def read_data(filename):
    """Return the whitespace-separated tokens of the first file in a zip archive.

    Args:
        filename: Path to a zip archive; only its first member is read.

    Returns:
        A list of strings produced by splitting the decoded text on whitespace.
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    # Decode the raw bytes to text before tokenising.
    return tf.compat.as_str(raw_bytes).split()
# Load the corpus tokens, then show a short preview and the total token count.
words = read_data("text9.zip")
preview = words[:10]
print(preview)
print('Data size {:d}'.format(len(words)))
下面代码主要是建立字典,即单词到索引号的映射,一个单词对应一个索引号,并保存到dictionary中。count里面记录了单词出现的次数:先放入'UNK'占位项,再把出现次数最多的前vocabulary_size-1个元素添加到count中。运行结果如下图所示:
vocabulary_size = 20  # Total vocabulary entries, including the 'UNK' slot at index 0.


def build_dataset(words):
    """Encode a token list as integer ids over a frequency-ranked vocabulary.

    Index 0 is reserved for 'UNK'; the ``vocabulary_size - 1`` most frequent
    words receive indices 1, 2, ... in descending frequency order. Every other
    word is encoded as 0.

    Args:
        words: Sequence of string tokens.

    Returns:
        A tuple ``(data, count, dictionary, reverse_dictionary)`` where
        ``data`` is the list of ids for ``words``, ``count`` is a list whose
        first entry is ``['UNK', <number of tokens encoded as 0>]`` followed by
        ``(word, frequency)`` pairs, ``dictionary`` maps word -> id and
        ``reverse_dictionary`` maps id -> word.
    """
    frequencies = collections.Counter(words)
    # A literal 'UNK' token must not compete for a vocabulary slot: it would
    # overwrite the reserved key, so two words would end up sharing one index
    # and index 0 would lose its reverse mapping. Treat it as unknown instead.
    frequencies.pop('UNK', None)

    count = [['UNK', -1]]
    count.extend(frequencies.most_common(vocabulary_size - 1))

    # Assign consecutive ids in the order the words appear in `count`.
    dictionary = {}
    for word, _ in count:
        dictionary[word] = len(dictionary)
    print('count是:', end=' ')
    print(count)
    print("dictionary是:", end=' ')
    print(dictionary)  # dictionary maps word -> index

    data = []
    unk_count = 0  # Number of tokens that were encoded as 'UNK' (index 0).
    for word in words:
        index = dictionary.get(word, 0)  # Out-of-vocabulary words fall back to 0.
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count

    # reverse_dictionary maps index -> word
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    print("reverse_dictionary是:", end=' ')
    print(reverse_dictionary)
    return data, count, dictionary, reverse_dictionary
# Encode the corpus as ids and show the head of the frequency table and of the
# encoded sequence.
encoded_corpus = build_dataset(words)
data, count, dictionary, reverse_dictionary = encoded_corpus
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10])
# The raw token list is no longer needed once it has been encoded.
del words
下面这个代码段主要就是获取上下文窗口,并且需要调整步数以及窗口值的大小。那么运行结果如下图所示:
data_index = 0  # Cursor into the global `data`; advanced by generate_batch.


def generate_batch(batch_size, num_skips, skip_window):
    """Produce one batch of (center word, context word) id pairs.

    A window of ``2 * skip_window + 1`` ids slides over the global ``data``
    starting at the global ``data_index`` (wrapping around at the end). For
    each window, ``num_skips`` distinct context positions are drawn at random
    and paired with the center id.

    Args:
        batch_size: Number of pairs to return; must be a multiple of
            ``num_skips``.
        num_skips: Pairs generated per center word; at most
            ``2 * skip_window``.
        skip_window: Number of words considered on each side of the center.

    Returns:
        ``(batch, labels)``: an int32 array of shape ``(batch_size,)`` holding
        center ids and an int32 array of shape ``(batch_size, 1)`` holding the
        matching context ids.

    Side effects:
        Updates the global ``data_index`` so that the next call continues with
        the center word immediately after the last one used here.
    """
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [ skip_window target skip_window ]
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # Every window position except the center is a candidate context word.
    context_positions = [pos for pos in range(span) if pos != skip_window]
    for i in range(batch_size // num_skips):
        # Draw distinct context positions directly instead of rejection sampling.
        chosen = random.sample(context_positions, num_skips)
        for j, target in enumerate(chosen):
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one word to the right (the deque drops the oldest id).
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # The cursor is now `span` ids past the start of the current window.
    # Backtrack so the next batch does not skip `span` center words.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels
# Show the first eight corpus words, then one sample batch for each
# (num_skips, skip_window) setting.
print('data:', [reverse_dictionary[token_id] for token_id in data[:8]])
demo_settings = [(2, 1), (4, 2)]
for num_skips, skip_window in demo_settings:
    # Restart from the beginning of the corpus so both demos see the same words.
    data_index = 0
    batch, labels = generate_batch(
        batch_size=8, num_skips=num_skips, skip_window=skip_window)
    print('\nwith num_skips = {:d} and skip_window = {:d}:'.format(num_skips, skip_window))
    center_words = [reverse_dictionary[center_id] for center_id in batch]
    print(' batch:', center_words)
    context_words = [reverse_dictionary[context_id] for context_id in labels.reshape(8)]
    print(' labels:', context_words)
那么下面主要是进行神经网络模型的训练,然后优化这个代价函数,得到隐藏层的权重值。
# Hyperparameters for the skip-gram model and its similarity check.
batch_size = 8  # Number of (center, context) pairs fed per training step.
embedding_size = 8 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
# We pick a random validation set to sample nearest neighbors.
# Here we limit the validation samples to the words that have a low numeric ID
# (ids are assigned in descending frequency order by build_dataset).
valid_size = 2 # Random set of words to evaluate similarity on.
valid_window = 6 # Only pick dev samples in the head of the distribution.
# Draw valid_size distinct ids from range(valid_window).
valid_examples = np.array(random.sample(range(valid_window), valid_size))
num_sampled = 4 # Number of negative examples to sample.
# Build the skip-gram graph; every op is placed on the CPU.
graph = tf.Graph()
with graph.as_default(), tf.device('/cpu:0'):
    # Inputs: one batch of center-word ids and the matching context-word ids,
    # plus the fixed ids used for the similarity check.
    train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Trainable parameters: the embedding table and the softmax layer.
    table_shape = [vocabulary_size, embedding_size]
    initial_embeddings = tf.random_uniform(table_shape, -1.0, 1.0)
    embeddings = tf.Variable(initial_embeddings)
    initial_weights = tf.truncated_normal(
        table_shape, stddev=1.0 / math.sqrt(embedding_size))
    softmax_weights = tf.Variable(initial_weights)
    softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Model: look up the embedding of each input word.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)

    # Softmax loss computed against a sample of the negative labels each time.
    per_example_loss = tf.nn.sampled_softmax_loss(
        weights=softmax_weights,
        biases=softmax_biases,
        inputs=embed,
        labels=train_labels,
        num_sampled=num_sampled,
        num_classes=vocabulary_size)
    loss = tf.reduce_mean(per_example_loss)

    # Optimizer.
    # Note: the optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable
    # quantities that contribute to the tensor it is passed.
    # See the docs on `tf.train.Optimizer.minimize()` for more details.
    optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)

    # Similarity between the validation words and all embeddings.
    # Rows are scaled to unit length, so the dot product is the cosine similarity.
    squared_length = tf.reduce_sum(tf.square(embeddings), 1, keepdims=True)
    norm = tf.sqrt(squared_length)
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(
        normalized_embeddings, valid_dataset)
    all_embeddings_t = tf.transpose(normalized_embeddings)
    similarity = tf.matmul(valid_embeddings, all_embeddings_t)
下面代码是启动这个会话,然后初始化变量,并打印在运行过程中损失值的变化情况。
num_steps = 10
report_every = 1  # Print the running average loss every this many steps.
similarity_every = 10  # Print nearest neighbours every this many steps.
top_k = 8  # number of nearest neighbors

# Train the model, periodically reporting the loss and the nearest neighbours
# of the validation words.
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    average_loss = 0
    for step in range(num_steps):
        batch_data, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
        _, batch_loss = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += batch_loss
        if step % report_every == 0:
            if step > 0:
                # Divide by the number of batches accumulated since the last
                # report; step 0 only ever holds a single batch.
                average_loss = average_loss / report_every
            # The average loss is an estimate of the loss over the last
            # `report_every` batches.
            print('Average loss at step %d: %f' % (step, average_loss))
            average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % similarity_every == 0:
            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                # Sort by descending similarity and skip position 0, which is
                # expected to be the word itself.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)
    # Evaluated inside the `with` block, while the session is still open.
    final_embeddings = normalized_embeddings.eval()
下面这个代码主要是用于可视化这个结果,如下图所示:
# Project the learned embeddings (skipping row 0, the 'UNK' slot) to 2-D.
num_points = 19
points_to_plot = final_embeddings[1:num_points+1, :]
# scikit-learn requires perplexity to be smaller than the number of samples,
# so clamp the usual default of 30 for this tiny vocabulary.
tsne = TSNE(perplexity=min(30, len(points_to_plot) - 1), n_components=2,
            init='pca', n_iter=5000)
two_d_embeddings = tsne.fit_transform(points_to_plot)
def plot(embeddings, labels):
    """Scatter-plot 2-D points and annotate each one with its label.

    Args:
        embeddings: Array with one (x, y) row per point; it must have at least
            as many rows as there are labels.
        labels: Text labels; the i-th label annotates the i-th row.
    """
    assert embeddings.shape[0] >= len(labels), 'More labels than embeddings'
    pylab.figure(figsize=(15, 15))  # in inches
    # Shared placement for every annotation: slightly offset from its point.
    annotation_style = dict(
        xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    for label, point in zip(labels, embeddings):
        x, y = point
        pylab.scatter(x, y)
        pylab.annotate(label, xy=(x, y), **annotation_style)
    pylab.show()
# Label the plotted rows with vocabulary ids 1..num_points (id 0, 'UNK', is skipped).
plotted_ids = range(1, num_points + 1)
words = [reverse_dictionary[word_id] for word_id in plotted_ids]
plot(two_d_embeddings, words)