After reading many articles explaining word2vec, I made a few changes and simplifications to the code myself and gained a deeper understanding. The small "text8" dataset (follow the link to download it) is used to train a skip-gram model; the point is mainly to understand the idea and the workflow.
Step 1: Load the data
The dataset is small enough to load directly into memory, and since the English text is already separated by spaces there is no need for word segmentation.
import tensorflow as tf
import numpy as np
import collections
import math
import os
import random
from six.moves import xrange
filename = 'text8'
vocabulary_size=50000
# Step 1: load the data
def read_data(filename):
    with open(filename) as file:
        words = file.read().split()  # split() with no argument splits on whitespace
    return words
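A quick optional sanity check (not part of the training flow): assuming the unzipped text8 file sits in the working directory, you can confirm that the corpus loads as one flat list of tokens.
# Optional check: size of the corpus and the first few tokens
words = read_data(filename)
print(len(words))   # text8 contains roughly 17 million tokens
print(words[:5])    # e.g. ['anarchism', 'originated', 'as', 'a', 'term']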
Step 2: Build the vocabulary
Function build_vocabulary(words, vocabulary_size)
Parameters:
- words: the full sequence of words loaded from text8
- vocabulary_size: a hyperparameter (50000 in this program) giving the size of the vocabulary (the finished vocabulary keeps the 50000 most frequent words)
Return values:
- count: word frequencies of all words, truncated to the 50000 most frequent. E.g.: [['UNK', -1], ('the', 1061396), ('of', 593677), ('and', 416629), ...]
- dictionary: a dict whose key-value pairs are word:id; the smaller the id, the more frequent the word. E.g. {'UNK': 0, 'of': 2, 'one': 4, 'the': 1, 'and': 3, ...}
- reverse_dictionary: dictionary with keys and values swapped, giving id:word. E.g.: {0: 'UNK', 1: 'the', 2: 'of', 3: 'and', ...}
- data: the input sequence words with every word replaced by its id, i.e. a sequence of ids. (A small usage example follows the function below.)
# Step 2: build the vocabulary (keep the vocabulary_size most frequent words; everything else maps to UNK)
def build_vocabulary(words, vocabulary_size):
    # extend appends the elements of a list one by one
    # Counter counts how often each word occurs
    # most_common(N) returns only the top-N list
    # c = Counter('abracadabra')
    # c.most_common()  ==> [('a', 5), ('r', 2), ('b', 2), ('c', 1), ('d', 1)]
    # c.most_common(3) ==> [('a', 5), ('r', 2), ('b', 2)]
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    # print(count)
    # build the dictionary mapping word -> id
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)  # e.g. {'UNK': 0, 'the': 1, 'of': 2, 'and': 3, ...}; smaller ids mean higher frequency
    # replace every word in the corpus with its id and count the words that are not in the dictionary
    unk_count = 0
    data = list()
    for word in words:
        if word in dictionary:
            index = dictionary[word]  # index is the id of this word
        else:
            index = 0
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count  # record how many words were mapped to UNK
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))  # swap keys and values to get id -> word
    return data, count, dictionary, reverse_dictionary
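To make the four return values concrete, here is a toy call; the sentence and the vocabulary size of 4 are made up purely for illustration.
# Toy example (hypothetical input), just to show the shape of the return values
toy_words = ['the', 'cat', 'sat', 'on', 'the', 'mat', 'the', 'cat', 'sat']
toy_data, toy_count, toy_dict, toy_rev = build_vocabulary(toy_words, 4)
print(toy_count)  # [['UNK', 2], ('the', 3), ('cat', 2), ('sat', 2)]
print(toy_dict)   # {'UNK': 0, 'the': 1, 'cat': 2, 'sat': 3}
print(toy_data)   # [1, 2, 3, 0, 1, 0, 1, 2, 3]  ('on' and 'mat' fall outside the vocabulary and become UNK -> 0)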
Step 3: Generate batches of training samples
# Step 3: generate training batches for the skip-gram model
batch_size = 128
embedding_size = 128  # dimension of the embedding vectors
skip_window = 1       # how many words to consider to the left and to the right
num_skips = 2         # how many distinct context words to sample from each window as output words
data_index = 0
num_sampled = 64      # number of negative samples for the NCE loss
valid_size = 16       # random set of 16 words to evaluate similarity on
valid_window = 100    # only pick validation samples from the head of the distribution (most frequent words)
# i.e. draw 16 integers from [0, 100) without replacement
valid_example = np.random.choice(valid_window, valid_size, replace=False)
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    label = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # one training window: [skip_window words on the left, target, skip_window words on the right]
    buffer = collections.deque(maxlen=span)  # a double-ended queue of fixed length span
    # fill the buffer with the first span ids
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    for i in range(batch_size // num_skips):
        target = skip_window             # the center word sits at index skip_window in the buffer
        target_to_avoid = [skip_window]  # indices that must not be picked as context
        # loop num_skips times: one center word is paired with num_skips context words
        for j in range(num_skips):
            while target in target_to_avoid:
                # may pick a word before or after the center word
                target = random.randint(0, span - 1)
            target_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]  # input: the center word
            label[i * num_skips + j, 0] = buffer[target]    # label: one of its context words
        buffer.append(data[data_index])  # the buffer has fixed length, so appending pushes the oldest id out
        data_index = (data_index + 1) % len(data)
    data_index = (data_index + len(data) - span) % len(data)  # step back so words at the batch boundary are not skipped
    return batch, label
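To see what the batches look like in words rather than ids, a small check like the one below can help. It assumes data and reverse_dictionary have already been produced by build_vocabulary (as at the start of Step 4), and the (center -> context) pairs shown in the comment are only an example of the kind of output.
# Optional check: print one small batch as (center word -> context word) pairs
demo_batch, demo_label = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for idx in range(8):
    print(reverse_dictionary[demo_batch[idx]], '->', reverse_dictionary[demo_label[idx, 0]])
# With skip_window = 1 each center word appears twice, once with its left and once with its right neighbor,
# e.g. originated -> anarchism, originated -> as, as -> originated, as -> a, ...
data_index = 0  # reset the global cursor so training starts from the beginning of the corpus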
Step 4: Build the graph
words = read_data(filename)
data, count, dictionary, reverse_dictionary = build_vocabulary(words, vocabulary_size)
num_steps = 50000
# Step 4: build the graph
graph = tf.Graph()
with graph.as_default():
    # define the inputs
    train_input = tf.placeholder(dtype=tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(dtype=tf.int32, shape=[batch_size, 1])
    valid_data = tf.constant(valid_example, dtype=tf.int32)
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # parameters for the NCE (negative sampling) loss
    nce_weights = tf.Variable(tf.truncated_normal(shape=[vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros(shape=[vocabulary_size]))
    # look up the vectors to train: not every word is updated in each step, only the vectors of the words in this batch
    embed = tf.nn.embedding_lookup(embeddings, train_input)
    # define the loss function and the optimizer
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
    # cosine similarity between words
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embedding = embeddings / norm
    # pick some frequent words to test cosine similarity on
    valid_embedding = tf.nn.embedding_lookup(normalized_embedding, valid_data)
    similarity = tf.matmul(valid_embedding, normalized_embedding, transpose_b=True)
    # initializer for all variables
    init = tf.global_variables_initializer()
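Why this matmul gives cosine similarity: after dividing each embedding row by its L2 norm, the dot product of two rows is exactly the cosine of the angle between the original vectors. A small standalone numpy illustration (the vectors are made up):
# Standalone numpy illustration of the similarity computation above (made-up vectors)
emb = np.array([[3.0, 4.0], [6.0, 8.0], [4.0, -3.0]])          # three toy "embeddings"
normed = emb / np.sqrt((emb ** 2).sum(axis=1, keepdims=True))  # same as embeddings / norm in the graph
print(normed @ normed.T)
# row 0 vs row 1 -> 1.0  (same direction, perfectly similar)
# row 0 vs row 2 -> 0.0  (orthogonal vectors)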
Step 5: Start the session
# Step 5: start the session
with tf.Session(graph=graph) as sess:
    # initialize all variables
    sess.run(init)
    # fetch a batch of training data and labels, then run one optimization step
    for step in xrange(num_steps):
        batch_input, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        sess.run(optimizer, feed_dict={train_input: batch_input, train_labels: batch_labels})
        # print some words with high cosine similarity (not needed for training and slow, so only do it occasionally)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_example[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s:" % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
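If you want to keep the learned vectors after training, you can evaluate the normalized embedding matrix inside the same with tf.Session(...) block, after the training loop ends. A minimal sketch (the file name is just an example):
    # Inside the session, after the training loop: pull the learned vectors out as a numpy array
    final_embeddings = normalized_embedding.eval()      # shape (vocabulary_size, embedding_size)
    np.save('text8_embeddings.npy', final_embeddings)   # example file name; any path works
    # later, the vector of a word can be looked up with e.g. final_embeddings[dictionary['king']]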