参考网址:https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy/
实现Word2Vec时有两种模型架构可以选择:Continuous Bag-of-Words(CBOW)和Skip-gram(SG)。
简单来说,CBOW尝试从相邻单词(上下文单词)猜测输出(目标单词),而Skip-gram则从目标单词猜测上下文单词。实际上,Word2Vec基于分布假说(distributional hypothesis),该假说认为每个单词的上下文就体现在它附近的单词中。因此,通过查看目标单词的相邻单词,我们可以尝试对目标单词进行预测。
Skip-gram:能够很好地处理少量的训练数据,而且能够很好地表示不常见的单词或短语
CBOW:比skip-gram训练快几倍,对出现频率高的单词的准确度稍微更好一些
word2vec训练出来的词向量其实是以one-hot向量作为输入训练网络之后得到的副产物,即输入层到隐藏层的权重矩阵W(代码中的w1),其中每一行对应一个词的词向量
from collections import defaultdict
import numpy as np
# 1. Data preparation: a one-sentence toy corpus, lower-cased and split on whitespace.
text = "natural language processing and machine learning is fun and exciting"
copus = [text.lower().split()]

# 2. Hyperparameters
setting = {
    # Context window radius: this many words to the left AND to the right.
    "window": 2,
    # Dimensionality of the word embeddings.
    "n": 8,
    # Number of passes over the training pairs.
    "epochs": 50,
    # Step size for the gradient updates.
    "learning_rate": 0.01,
}
class word2vec():
    """Minimal skip-gram Word2Vec trained with a full softmax and plain SGD.

    The network has one linear hidden layer: ``w1`` maps a one-hot input word
    to an ``n``-dimensional hidden vector and ``w2`` maps that vector to one
    score per vocabulary word. After training, row ``i`` of ``w1`` is used as
    the embedding of vocabulary word ``i``.
    """

    def __init__(self, config=None):
        """Read the hyperparameters.

        Args:
            config: Mapping with the keys ``'n'`` (embedding size),
                ``'window'`` (context radius), ``'epochs'`` and
                ``'learning_rate'``. When omitted, the module-level
                ``setting`` dict is used, which keeps ``word2vec()`` working.
        """
        if config is None:
            config = setting
        self.n = config['n']
        self.window = config['window']
        self.epochs = config['epochs']
        self.learning_rate = config['learning_rate']

    def generate_train_data(self, setting, copus):
        """Build the vocabulary and the (target, context) training pairs.

        Args:
            setting: Unused; kept so existing callers keep working. The
                window size is taken from ``self.window``.
            copus: Iterable of sentences, each a list of word strings.

        Returns:
            An object array of shape ``(num_words_in_corpus, 2)``. Column 0
            holds the one-hot vector of the target word, column 1 a list of
            one-hot vectors for the words within ``self.window`` positions of
            the target in the same sentence.
        """
        # Vocabulary in first-appearance order (dict keys keep insertion order).
        self.words_list = list(dict.fromkeys(word for row in copus for word in row))
        self.count_word = len(self.words_list)
        # Lookup tables in both directions.
        self.word_idx = {word: i for i, word in enumerate(self.words_list)}
        self.idx_word = {i: word for i, word in enumerate(self.words_list)}

        pairs = []
        for sentence in copus:
            sentence_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)
                # Clamp the window to the sentence bounds and skip the target itself.
                first = max(0, i - self.window)
                last = min(sentence_len, i + self.window + 1)
                w_context = [
                    self.word2onehot(sentence[j])
                    for j in range(first, last)
                    if j != i
                ]
                pairs.append((w_target, w_context))

        # The context lists have different lengths, so the data is ragged.
        # np.array() on ragged input raises ValueError on NumPy >= 1.24, hence
        # the explicit object array filled element by element.
        training_data = np.empty((len(pairs), 2), dtype=object)
        for k, (w_target, w_context) in enumerate(pairs):
            training_data[k, 0] = w_target
            training_data[k, 1] = w_context
        return training_data

    def word2onehot(self, word):
        """Return the one-hot vector (length ``count_word``) for ``word``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        word_vec = np.zeros(self.count_word)
        word_vec[self.word_idx[word]] = 1
        return word_vec

    def train(self, training_data):
        """Train ``w1`` and ``w2`` with per-sample gradient descent.

        Weights are (re)initialised uniformly in [-1, 1). For every epoch the
        summed skip-gram loss is stored in ``self.loss`` and printed.

        Args:
            training_data: Pairs as returned by ``generate_train_data``.
        """
        self.w1 = np.random.uniform(-1, 1, (self.count_word, self.n))  # (count_word, n)
        self.w2 = np.random.uniform(-1, 1, (self.n, self.count_word))  # (n, count_word)

        for i in range(self.epochs):
            self.loss = 0
            for w_t, w_c in training_data:
                # A target without context (e.g. a one-word sentence) carries
                # no training signal and would otherwise produce a scalar
                # error that breaks the shapes in backprop.
                if len(w_c) == 0:
                    continue

                # Forward pass.
                y_predict, h, u = self.forward(w_t)
                # Prediction error summed over all context words.
                EI = np.sum([np.subtract(y_predict, word) for word in w_c], axis=0)
                # Backward pass and weight update.
                self.backprop(EI, h, w_t)

                # Loss: -sum(u[context]) + C * log(sum(exp(u))), with the
                # log-sum-exp shifted by max(u) so exp() cannot overflow.
                context_ids = [int(np.argmax(word)) for word in w_c]
                u_max = np.max(u)
                log_sum_exp = u_max + np.log(np.sum(np.exp(u - u_max)))
                self.loss += -np.sum(u[context_ids]) + len(w_c) * log_sum_exp
            print('EPOCH:', i, 'LOSS:', self.loss)

    def forward(self, x):
        """Run the forward pass for one one-hot input ``x``.

        Returns:
            Tuple ``(y_c, h, u)``: softmax probabilities, hidden-layer
            activations of length ``n`` and raw output scores of length
            ``count_word``.
        """
        h = np.dot(self.w1.T, x)   # (n, count_word) . (count_word,) -> (n,)
        u = np.dot(self.w2.T, h)   # (count_word, n) . (n,) -> (count_word,)
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        """Return the softmax of ``x``, shifted by ``max(x)`` for stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / np.sum(e_x, axis=0)

    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: Summed prediction error over the context words, length ``count_word``.
            h: Hidden-layer activations from the forward pass, length ``n``.
            x: One-hot input vector of the target word, length ``count_word``.
        """
        # Both gradients are computed from the current weights before either is updated.
        dl_dw2 = np.outer(h, e)                       # gradient w.r.t. w2
        dl_dw1 = np.outer(x, np.dot(self.w2, e.T))    # gradient w.r.t. w1
        self.w1 = self.w1 - (self.learning_rate * dl_dw1)
        self.w2 = self.w2 - (self.learning_rate * dl_dw2)

    def word_vec(self, word):
        """Return the embedding of ``word``: its row of ``w1``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        return self.w1[self.word_idx[word]]
# Instantiate the model with the hyperparameters defined above.
w2v = word2vec()
# Turn the corpus into (target one-hot, context one-hots) training pairs.
training_data = w2v.generate_train_data(copus=copus, setting=setting)
print(training_data.shape)
# Fit the weights; the loss is printed once per epoch.
w2v.train(training_data=training_data)
(10, 2)
EPOCH: 0 LOSS: 84.26226184477264
EPOCH: 1 LOSS: 82.5334354504378
EPOCH: 2 LOSS: 80.97314092418368
EPOCH: 3 LOSS: 79.556153105198
EPOCH: 4 LOSS: 78.26245748940423
EPOCH: 5 LOSS: 77.07585499766886
EPOCH: 6 LOSS: 75.98302285515354
EPOCH: 7 LOSS: 74.97286011466974
EPOCH: 8 LOSS: 74.0360166890339
EPOCH: 9 LOSS: 73.16454468478804
EPOCH: 10 LOSS: 72.35163395274861
EPOCH: 11 LOSS: 71.59140745991571
EPOCH: 12 LOSS: 70.87876038047602
EPOCH: 13 LOSS: 70.209231949284
EPOCH: 14 LOSS: 69.57890239388053
EPOCH: 15 LOSS: 68.9843093969211
EPOCH: 16 LOSS: 68.42237997289847
EPOCH: 17 LOSS: 67.89037463068387
EPOCH: 18 LOSS: 67.38584139479062
EPOCH: 19 LOSS: 66.90657777131717
EPOCH: 20 LOSS: 66.45059913078315
EPOCH: 21 LOSS: 66.01611227868631
EPOCH: 22 LOSS: 65.60149322077369
EPOCH: 23 LOSS: 65.20526832014733
EPOCH: 24 LOSS: 64.82609819827393
EPOCH: 25 LOSS: 64.46276385909516
EPOCH: 26 LOSS: 64.11415461991506
EPOCH: 27 LOSS: 63.77925751838158
EPOCH: 28 LOSS: 63.457147934657165
EPOCH: 29 LOSS: 63.14698122422874
EPOCH: 30 LOSS: 62.847985201835556
EPOCH: 31 LOSS: 62.55945335252828
EPOCH: 32 LOSS: 62.28073867353736
EPOCH: 33 LOSS: 62.01124807185712
EPOCH: 34 LOSS: 61.75043725849147
EPOCH: 35 LOSS: 61.49780609223435
EPOCH: 36 LOSS: 61.25289433457873
EPOCH: 37 LOSS: 61.01527778361762
EPOCH: 38 LOSS: 60.78456475923181
EPOCH: 39 LOSS: 60.56039291494355
EPOCH: 40 LOSS: 60.342426353940596
EPOCH: 41 LOSS: 60.130353028236684
EPOCH: 42 LOSS: 59.923882400961986
EPOCH: 43 LOSS: 59.72274335253571
EPOCH: 44 LOSS: 59.526682312086336
EPOCH: 45 LOSS: 59.33546159603934
EPOCH: 46 LOSS: 59.148857936344996
EPOCH: 47 LOSS: 58.9666611814089
EPOCH: 48 LOSS: 58.788673153434914
EPOCH: 49 LOSS: 58.61470664660484
# Look up the trained embedding of "natural" (its row of the w1 weight matrix).
w2v.word_vec("natural")
array([-0.45495959, 0.39501012, -0.25014066, -0.32978533, 0.15838857,
-0.50636667, 0.15753532, 0.84494568])
# Look up the trained embedding of "language" (its row of the w1 weight matrix).
w2v.word_vec("language")
array([ 0.09572602, 0.04661155, 0.17993407, 0.35106728, 0.38565779,
1.01885008, -0.34719543, 0.05457249])