Speeding up word2vec
1. Training code for the CBOW model
import sys
sys.path.append('..')
from common import config
# To run on the GPU, uncomment the line below (requires cupy)
# ===============================================
# config.GPU = True
# ===============================================
from common.np import *
import pickle
from common.trainer import Trainer
from common.optimizer import Adam
from cbow import CBOW
from skip_gram import SkipGram
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb

# Hyperparameters
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# Load the data
corpus, word_to_id, id_to_word = ptb.load_data('train')  # load the training data
vocab_size = len(word_to_id)

contexts, target = create_contexts_target(corpus, window_size)  # generate contexts and targets
if config.GPU:
    contexts, target = to_gpu(contexts), to_gpu(target)

# Build the model, optimizer and trainer
model = CBOW(vocab_size, hidden_size, window_size, corpus)
# model = SkipGram(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# Train
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()

# Save what is needed later, so the result can be reused
word_vecs = model.word_vecs
if config.GPU:
    word_vecs = to_cpu(word_vecs)
params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['word_to_id'] = word_to_id
params['id_to_word'] = id_to_word
pkl_file = 'cbow_params.pkl'  # or 'skipgram_params.pkl'
with open(pkl_file, 'wb') as f:
    pickle.dump(params, f, -1)
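Once cbow_params.pkl has been written, the learned word vectors can be reused without retraining. A minimal sketch of how the saved parameters might be loaded later, assuming most_similar is available in common.util as elsewhere in the same repository:

import pickle
from common.util import most_similar  # assumed helper from the same repository

pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'rb') as f:
    params = pickle.load(f)
word_vecs = params['word_vecs']
word_to_id = params['word_to_id']
id_to_word = params['id_to_word']

# query the five words whose vectors are closest (cosine similarity) to 'you'
most_similar('you', word_to_id, id_to_word, word_vecs, top=5)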
2. create_contexts_target
def create_contexts_target(corpus, window_size=1):
    '''Generate contexts and target words.

    :param corpus: corpus (list of word IDs)
    :param window_size: window size (with window size 1, one word on each side is the context)
    :return: contexts and targets as arrays
    '''
    target = corpus[window_size:-window_size]
    contexts = []

    for idx in range(window_size, len(corpus)-window_size):
        cs = []
        for t in range(-window_size, window_size + 1):
            if t == 0:
                continue
            cs.append(corpus[idx + t])
        contexts.append(cs)

    return np.array(contexts), np.array(target)
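To make the indexing concrete, here is a small worked example with a hypothetical toy corpus and window_size=1:

import numpy as np

corpus = np.array([0, 1, 2, 3, 4, 1, 5, 6])  # e.g. word IDs of "you say goodbye and i say hello ."
contexts, target = create_contexts_target(corpus, window_size=1)

print(target)    # [1 2 3 4 1 5]
print(contexts)
# [[0 2]
#  [1 3]
#  [2 4]
#  [3 1]
#  [4 5]
#  [1 6]]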
3. Implementation of the CBOW model
import sys
sys.path.append('..')
from common.np import *  # import numpy as np
from common.layers import Embedding
from ch04.negative_sampling_layer import NegativeSamplingLoss


class CBOW:
    def __init__(self, vocab_size, hidden_size, window_size, corpus):
        V, H = vocab_size, hidden_size

        # Initialize the weights
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        # Create the layers
        self.in_layers = []
        for i in range(2 * window_size):  # create 2 * window_size Embedding layers
            layer = Embedding(W_in)  # Embedding layer: input layer -> hidden layer
            self.in_layers.append(layer)
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power=0.75, sample_size=5)

        # Collect all weights and gradients into lists
        layers = self.in_layers + [self.ns_loss]
        self.params, self.grads = [], []
        for layer in layers:
            self.params += layer.params
            self.grads += layer.grads

        # Keep the distributed representations of the words as a member variable
        self.word_vecs = W_in

    def forward(self, contexts, target):
        h = 0
        for i, layer in enumerate(self.in_layers):
            h += layer.forward(contexts[:, i])
        h *= 1 / len(self.in_layers)
        loss = self.ns_loss.forward(h, target)  # negative sampling via NegativeSamplingLoss; returns the loss
        return loss

    def backward(self, dout=1):
        dout = self.ns_loss.backward(dout)
        dout *= 1 / len(self.in_layers)
        for layer in self.in_layers:
            layer.backward(dout)
        return None
forward: contexts holds the context words around each target word, and target is the word we want to predict; here it is given as the correct answer (the positive example).
First, each context word is fed through an input-side Embedding layer; the looked-up vectors are summed and divided by the number of context words, which gives the hidden layer h.
Next, h and the target word are passed to NegativeSamplingLoss to compute the loss.
Finally, the loss value is returned.
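The averaging in forward can be checked in isolation. A minimal sketch with hypothetical toy sizes (V=7, H=4, window_size=1), using plain NumPy row lookups in place of the Embedding layers:

import numpy as np

W_in = 0.01 * np.random.randn(7, 4).astype('f')   # toy input-side weights (V=7, H=4)
contexts = np.array([[0, 2], [1, 3], [4, 6]])     # batch of 3, 2 context words each

h = 0
for i in range(contexts.shape[1]):
    h = h + W_in[contexts[:, i]]    # Embedding.forward is just row selection
h *= 1 / contexts.shape[1]          # average over the context words

print(h.shape)   # (3, 4): one hidden vector per sample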
3.1 The Embedding layer (input layer to hidden layer)
class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W, = self.params  # the trailing comma unpacks the single-element list
        self.idx = idx
        out = W[idx]
        return out

    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        for i, word_id in enumerate(self.idx):
            dW[word_id] += dout[i]
        # or, equivalently:
        # np.add.at(dW, self.idx, dout)
        return None
The Embedding layer covers the input-to-hidden part: the word ID given as input simply selects the corresponding row of W_in.
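A small example with hypothetical values illustrating the forward lookup and the gradient accumulation in backward when an index repeats:

import numpy as np

W = np.arange(21).reshape(7, 3)    # toy weight matrix (V=7, H=3)
idx = np.array([1, 0, 3, 1])       # note: index 1 appears twice

print(W[idx])                      # rows 1, 0, 3 and 1 of W, shape (4, 3)

dW = np.zeros_like(W)
dout = np.ones((4, 3), dtype=W.dtype)
np.add.at(dW, idx, dout)           # duplicated indices accumulate instead of overwriting each other
print(dW[1])                       # [2 2 2]: row 1 received two gradient contributions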
3.2 Negative sampling: NegativeSamplingLoss
import sys
sys.path.append('..')
from common.np import *  # import numpy as np
from common.layers import Embedding, SigmoidWithLoss
import collections


class NegativeSamplingLoss:
    def __init__(self, W, corpus, power=0.75, sample_size=5):
        # W: output-side weights, corpus: corpus (list of word IDs),
        # power: exponent applied to the probability distribution, sample_size: number of negative samples
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)  # negative-example sampler
        # loss_layers and embed_dot_layers hold the required layers as lists.
        # Each list contains sample_size + 1 layers: one for the positive example
        # and sample_size for the negative examples.
        self.loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]  # loss layers
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]  # here W is W_out

        self.params, self.grads = [], []
        for layer in self.embed_dot_layers:
            self.params += layer.params
            self.grads += layer.grads

    # h is the hidden layer; target holds the correct (positive) word IDs
    def forward(self, h, target):
        batch_size = target.shape[0]  # number of samples in the batch
        negative_sample = self.sampler.get_negative_sample(target)  # sample the negative examples

        # Forward pass for the positive example
        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)  # labels for the positive examples are all 1
        loss = self.loss_layers[0].forward(score, correct_label)  # loss for the positive example

        # Forward pass for the negative examples
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size):
            negative_target = negative_sample[:, i]
            score = self.embed_dot_layers[1 + i].forward(h, negative_target)
            loss += self.loss_layers[1 + i].forward(score, negative_label)

        return loss

    def backward(self, dout=1):
        dh = 0
        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers):
            dscore = l0.backward(dout)
            dh += l1.backward(dscore)
        return dh
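Put differently, for each sample the total loss is -log(sigmoid(score)) for the positive example plus -log(1 - sigmoid(score)) for every negative example. A small numeric sketch with hypothetical scores (sample_size=5):

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

pos_score = np.array([2.0])                           # score of the positive example
neg_scores = np.array([-1.5, 0.3, -2.0, 0.1, -0.7])   # scores of the 5 negative examples

loss = -np.log(sigmoid(pos_score)).sum()          # label 1 -> -log(sigmoid(score))
loss += -np.log(1 - sigmoid(neg_scores)).sum()    # label 0 -> -log(1 - sigmoid(score))
print(loss)                                       # total loss contributed by this sample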
Sampling method for negative sampling: UnigramSampler
import sys
sys.path.append('..')
from common.np import *  # import numpy as np
from common.layers import Embedding, SigmoidWithLoss
import collections


class UnigramSampler:
    def __init__(self, corpus, power, sample_size):
        self.sample_size = sample_size
        self.vocab_size = None  # number of distinct words in the vocabulary
        self.word_p = None  # sampling probability for each word

        counts = collections.Counter()
        for word_id in corpus:  # count how many times each word appears
            counts[word_id] += 1

        vocab_size = len(counts)  # number of distinct words
        self.vocab_size = vocab_size

        self.word_p = np.zeros(vocab_size)
        for i in range(vocab_size):
            self.word_p[i] = counts[i]

        self.word_p = np.power(self.word_p, power)
        self.word_p /= np.sum(self.word_p)  # normalized word frequencies (after applying the power)

    def get_negative_sample(self, target):
        batch_size = target.shape[0]  # number of samples in the batch

        if not GPU:
            negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)

            for i in range(batch_size):
                p = self.word_p.copy()  # word probabilities
                target_idx = target[i]  # index of the positive example
                p[target_idx] = 0  # set the probability of the positive example to 0
                p /= p.sum()
                negative_sample[i, :] = np.random.choice(self.vocab_size, size=self.sample_size,
                                                         replace=False, p=p)
        else:
            # When computing on the GPU (cupy), prioritize speed;
            # the target word may occasionally appear among the negative samples.
            negative_sample = np.random.choice(self.vocab_size, size=(batch_size, self.sample_size),
                                               replace=True, p=self.word_p)

        return negative_sample
(For the detailed derivation, see https://blog.csdn.net/weixin_44953928/article/details/121864848?spm=1001.2014.3001.5501, section 4.2.6, "Sampling method for negative sampling".)
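A quick check of the sampler on a hypothetical toy corpus (assuming the UnigramSampler class above has been defined). Raising the counts to the power of 0.75 slightly boosts rare words relative to frequent ones:

import numpy as np

corpus = np.array([0, 1, 2, 3, 4, 1, 2, 3, 2, 2])   # toy word-ID corpus
sampler = UnigramSampler(corpus, power=0.75, sample_size=2)
print(sampler.word_p)       # sampling probabilities; word 2 (the most frequent) gets the highest value

target = np.array([1, 3, 0])
print(sampler.get_negative_sample(target))  # shape (3, 2); on the CPU path the target itself is excluded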
Loss with sigmoid: SigmoidWithLoss
from common.np import *  # import numpy as np
from common.functions import cross_entropy_error  # cross_entropy_error lives in common/functions.py of the same repository


class SigmoidWithLoss:
    def __init__(self):
        self.params, self.grads = [], []
        self.loss = None
        self.y = None  # output of the sigmoid
        self.t = None  # supervision labels

    def forward(self, x, t):
        self.t = t
        self.y = 1 / (1 + np.exp(-x))

        self.loss = cross_entropy_error(np.c_[1 - self.y, self.y], self.t)

        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]

        dx = (self.y - self.t) * dout / batch_size
        return dx
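backward relies on the standard simplification that the gradient of sigmoid followed by cross-entropy with respect to the score x is y - t. A quick numeric check of this identity for a single (hypothetical) sample:

import numpy as np

x, t = 0.5, 1.0
y = 1 / (1 + np.exp(-x))
loss = -(t * np.log(y) + (1 - t) * np.log(1 - y))

eps = 1e-4
y2 = 1 / (1 + np.exp(-(x + eps)))
loss2 = -(t * np.log(y2) + (1 - t) * np.log(1 - y2))

print((loss2 - loss) / eps)  # numerical gradient, approximately -0.3775
print(y - t)                 # analytical gradient, -0.3775...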
EmbeddingDot (hidden layer to output layer)
It first calls Embedding, which returns the rows of W_out corresponding to the target word IDs, and then multiplies h element-wise with those rows, summing each row to get a score.
class EmbeddingDot:
    def __init__(self, W):
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None

    def forward(self, h, idx):
        target_W = self.embed.forward(idx)
        out = np.sum(target_W * h, axis=1)  # element-wise product, then sum each row (a row-wise dot product)

        self.cache = (h, target_W)
        return out

    def backward(self, dout):
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)

        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        dh = dout * target_W
        return dh
The Embedding call here returns only the corresponding rows of W_out: EmbeddingDot reuses the Embedding class shown in 3.1, but the W it holds is W_out, the output-side weights.
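A small sketch with hypothetical toy values showing that the forward pass of EmbeddingDot is just a row-wise dot product between h and the selected rows of W_out:

import numpy as np

h = np.array([[1., 2., 3.],
              [4., 5., 6.]])                    # hidden vectors, batch of 2 (H=3)
W_out = np.arange(12, dtype='f').reshape(4, 3)  # toy output-side weights (V=4, H=3)
idx = np.array([0, 3])                          # target word IDs for each sample

target_W = W_out[idx]                 # the Embedding part: pick rows 0 and 3 of W_out
out = np.sum(target_W * h, axis=1)    # row-wise dot product between h and the target rows
print(out)                            # [  8. 152.]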