word2vec: word to vector
CBOW (continuous bag-of-words model)
predicts the target word from its surrounding context words.
Skip-gram model
predicts the context words from the target word.
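A minimal sketch of the two training directions (the toy sentence and the one-word window below are made up for illustration):

# Toy illustration: the same window yields "context -> target" pairs for CBOW
# and "target -> context" pairs for skip-gram.
sentence = ['the', 'quick', 'brown', 'fox', 'jumps']
window = 1  # one context word on each side of the target
for i, target in enumerate(sentence):
    context = [sentence[j]
               for j in range(max(0, i - window), min(len(sentence), i + window + 1))
               if j != i]
    print('CBOW     :', context, '->', target)   # the context predicts the target
    for c in context:
        print('skip-gram:', target, '->', c)     # the target predicts each context word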
To train a word2vec model we usually use noise-contrastive estimation (NCE). NCE marks the correct target word for a context h as a positive sample (D=1), draws a few noise words as negative samples (D=0), and then maximizes the objective function.
The objective reaches its maximum when the true target word is assigned a high probability and the noise words are assigned low probabilities. Evaluating it only involves the k sampled noise words rather than the full vocabulary, so training is fast.
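Written in the form used by the TensorFlow word2vec tutorial (a sketch of the idea rather than the exact tf.nn.nce_loss expression), the per-example objective to maximize is

J = log Q_θ(D=1 | w_t, h) + k · E_{w̃ ~ P_noise}[ log Q_θ(D=0 | w̃, h) ]

where Q_θ(D=1 | w, h) is the model's probability that w is the true target word for context h, P_noise is the distribution the noise words are drawn from, and only the k sampled noise words contribute to the expectation term.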
https://github.com/zmjames2000/NLP_basis/blob/master/demo6_w2v_skip_gram.py
simple_word2vec
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import collections
import math
import random
import zipfile

import numpy as np
import tensorflow as tf
from six.moves import urllib
from six.moves import xrange
import matplotlib.pyplot as plt

url = r'http://mattmahoney.net/dc/'
# Step 1: download the data.
def maybe_download(filename, expected_bytes):
    """Download the file if it is not present, and make sure it has the right size."""
    if not os.path.exists(filename):
        filename, _ = urllib.request.urlretrieve(url + filename, filename)
    statinfo = os.stat(filename)
    if statinfo.st_size == expected_bytes:
        print('Found and verified', filename)
    else:
        print(statinfo.st_size)
        raise Exception('Failed to verify ' + filename + '. Can you get to it with a browser?')
    return filename

# Read the data into a list of strings.
def read_data(filename):
    with zipfile.ZipFile(filename) as f:
        data = tf.compat.as_str(f.read(f.namelist()[0])).split()
    return data

filename = maybe_download('text8.zip', 31344016)
words = read_data(filename)  # the corpus as a list of word tokens
print('Data size', len(words))
# Step 2: build the dictionary and replace rare words with an UNK token.
# Keep only the `vocabulary_size` most frequent words; everything else becomes UNK.
vocabulary_size = 500000

def build_dataset(words, vocabulary_size):
    count = [['UNK', -1]]
    # most_common returns a top-N list, e.g. [('a', 5), ('b', 2)]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    # dictionary: key = word, value = id starting from 0; the more frequent the word, the smaller the id
    dictionary = dict()
    for word, _ in count:
        dictionary[word] = len(dictionary)
    # data: the corpus rewritten as word ids (UNK = 0)
    data = list()
    unk_count = 0
    for word in words:
        if word in dictionary:
            index = dictionary[word]
        else:
            index = 0  # dictionary['UNK']
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    # reverse_dictionary: id -> word
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary

# data:               the corpus as word ids (more frequent words get smaller ids)
# count:              the `vocabulary_size` most frequent words and their counts
# dictionary:         word -> id
# reverse_dictionary: id -> word
data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)
del words
print('Most common words (+UNK)', count[:5])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
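# A quick sanity check of build_dataset on a made-up toy list (illustration only;
# the words and the tiny vocabulary size below are not part of the real pipeline):
toy_data, toy_count, toy_dict, _ = build_dataset(['the', 'quick', 'brown', 'fox', 'the', 'the'], 4)
print(toy_count)   # word counts, with ['UNK', <number of out-of-vocabulary tokens>] first
print(toy_data)    # the toy corpus rewritten as ids; 0 = UNK, smaller id = more frequent word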
data_index = 0
# Step 3: function to generate a training batch for the skip-gram model.
# batch_size:  number of (target, context) pairs per batch
# num_skips:   how many context words to sample for each target word
# skip_window: how many words to consider on each side of the target
def generate_batch(batch_size, num_skips, skip_window):
    global data_index
    assert batch_size % num_skips == 0
    assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    span = 2 * skip_window + 1  # [left context ... target ... right context]
    buffer = collections.deque(maxlen=span)  # sliding window of `span` word ids
    # Fill the window with the first `span` words.
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Fill batch (targets) and labels (contexts).
    for i in range(batch_size // num_skips):  # one iteration per target word
        target = skip_window             # the target word sits in the middle of the buffer
        target_to_avoid = [skip_window]
        for j in range(num_skips):
            # Pick a random context position in the window that has not been used yet.
            while target in target_to_avoid:
                target = random.randint(0, span - 1)
            target_to_avoid.append(target)
            batch[i * num_skips + j] = buffer[skip_window]
            labels[i * num_skips + j, 0] = buffer[target]
        # Slide the window one word to the right.
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Back up so that the words at the end of this batch are not skipped by the next one.
    data_index = (data_index + len(data) - span) % len(data)
    return batch, labels

batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
for i in range(8):
    print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])
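# Sanity check on the demo batch above (this relies on it being the very first call,
# so generation started at data_index = 0): with skip_window=1 and num_skips=2 every
# target word appears twice in `batch`, paired once with each of its two neighbours.
for i in range(0, 8, 2):
    k = i // 2 + 1                                    # position of the target word in `data`
    assert batch[i] == batch[i + 1] == data[k]        # same target repeated num_skips times
    assert {labels[i, 0], labels[i + 1, 0]} == {data[k - 1], data[k + 1]}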
# Step 4: build and train a skip-gram model.
batch_size = 128
embedding_size = 128  # dimension of the word vectors, typically 128 or 256
skip_window = 1       # words considered on each side of the target
num_skips = 2         # context words sampled per target
valid_size = 16       # number of words used to evaluate similarity
valid_window = 100    # validation words are drawn from the 100 most frequent word ids
valid_examples = np.random.choice(valid_window, valid_size, replace=False)  # 16 ids sampled from [0, 100) without replacement
num_sample = 64       # number of negative (noise) words to sample
graph = tf.Graph()
with graph.as_default():
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)  # fixed set of validation word ids

    # Embedding matrix: vocabulary_size rows, embedding_size columns (the word-vector dimension).
    embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # embedding_lookup(params, ids) returns the rows of params selected by ids,
    # e.g. ids=[1, 7, 4] returns a tensor built from rows 1, 7 and 4.
    # Only the rows for the current batch are used, not the whole matrix.
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Weights and biases for noise-contrastive estimation (NCE).
    nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
    loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,
                                         biases=nce_biases,
                                         labels=train_labels,
                                         inputs=embed,
                                         num_sampled=num_sample,
                                         num_classes=vocabulary_size))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0).minimize(loss)

    # Cosine similarity between the validation words and all embeddings.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)  # look up the validation words
    similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)

    init = tf.global_variables_initializer()
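# Why a matmul of L2-normalised rows gives cosine similarity: a tiny numpy sketch
# (the two 2-d vectors below are made up for illustration only).
_toy = np.array([[3.0, 4.0], [1.0, 0.0]])
_toy = _toy / np.sqrt(np.sum(np.square(_toy), 1, keepdims=True))   # normalise each row to unit length
print(_toy.dot(_toy.T))   # off-diagonal entry = cos(angle between the two vectors) = 0.6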
# Step 5: begin training.
num_steps = 100001
final_embeddings = []

with tf.Session(graph=graph) as sess:
    init.run()
    print('Initialized')
    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)  # 128, 2, 1
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = sess.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                average_loss /= 2000
            print('Average loss at step', step, ':', average_loss)
            average_loss = 0
        if step % 2000 == 0:
            sim = similarity.eval()  # cosine similarity between validation words and all words
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 5
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]  # skip position 0, which is the word itself
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embeddings.eval()
# Step 6: visualize the embeddings with t-SNE.
def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):
    assert low_dim_embs.shape[0] >= len(labels), 'more labels than embeddings'
    plt.figure(figsize=(15, 15))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    plt.savefig(filename)

try:
    from sklearn.manifold import TSNE
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000, method='exact')
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels)
except ImportError:
    print('Please install sklearn, matplotlib, and scipy to visualize embeddings.')