Implementing word2vec with numpy

Reference: https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy/

There are two model architectures to choose from when implementing Word2Vec: Continuous Bag-of-Words (CBOW) or Skip-gram (SG).
In short, CBOW tries to predict the output (the target word) from its neighbors (the context words), while Skip-gram predicts the context words from the target word. Word2Vec rests on the distributional hypothesis: a word's meaning is captured by the words that appear around it, so a target word can be predicted from its neighbors, and vice versa.

Skip-gram: works well with small amounts of training data and represents rare words and phrases well.
CBOW: trains several times faster than Skip-gram and is slightly more accurate for frequent words.
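
As a rough illustration of how the two architectures build training examples (this toy snippet is independent of the implementation below; the sentence and the window size of 2 are made-up example values):

tokens = "natural language processing and machine learning".split()
window = 2
for i, target in enumerate(tokens):
    context = [tokens[j] for j in range(i - window, i + window + 1)
               if j != i and 0 <= j < len(tokens)]
    print("Skip-gram pairs:", [(target, c) for c in context])  # target predicts each neighbor
    print("CBOW example:  ", (context, target))                # neighbors jointly predict target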

The word vectors that word2vec learns are really a by-product of training: each word is fed in as a one-hot vector, and its embedding is the corresponding row of the weight matrix W that multiplies that one-hot input.
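
A minimal sketch of that equivalence (the vocabulary size and weight matrix here are made-up illustration values):

import numpy as np

vocab_size, n = 5, 3
W = np.random.uniform(-1, 1, (vocab_size, n))  # input-to-hidden weights, one row per word

one_hot = np.zeros(vocab_size)
one_hot[2] = 1                                 # one-hot vector of the word with index 2

# multiplying the one-hot vector by W just selects row 2, i.e. that word's embedding
assert np.allclose(one_hot @ W, W[2])

This is exactly what the word_vec method in the implementation below does by reading a single row of self.w1.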

from collections import defaultdict
import numpy as np

#1 Data preparation
text = "natural language processing and machine learning is fun and exciting"
corpus = [[word.lower() for word in text.split()]]
#2 Hyperparameters
setting = {
    "window":2,            # window size (context words on each side, i.e. 2 to the left and 2 to the right)
    "n":8,                 # dimensionality of the word vectors
    "epochs":50,           # number of training epochs
    "learning_rate":0.01
}

class word2vec():
    def __init__(self):
        self.n = setting['n']
        self.window = setting['window']
        self.epochs = setting['epochs']
        self.learning_rate = setting['learning_rate']
        
    def generate_train_data(self,setting,corpus):
        """Build the one-hot training data from the corpus"""
        # count how often each unique word occurs
        word_counts = defaultdict(int)
        for row in corpus:
            for word in row:
                word_counts[word] += 1
        
        self.count_word = len(word_counts.keys())       # number of unique words
        self.words_list = list(word_counts.keys())      # list of unique words
        self.word_idx = dict((word,i) for i , word in enumerate(self.words_list)) # word -> index lookup for fast access
        self.idx_word = dict((i,word) for i,word in enumerate(self.words_list))   # index -> word lookup for fast access
        
        training_data = []
        
        for sentence in corpus:                          # iterate over each row (each sentence)
            len_sentence = len(sentence)
            for i, word in enumerate(sentence):          # iterate over each word (the target word)
                w_target = self.word2onehot(sentence[i]) # convert the target word to a one-hot vector
                w_context = []                           # its context vectors
                
                # collect the context vectors
                for j in range(i-self.window , i+self.window+1): # iterate over the window around the target
                    if j != i and j >= 0 and j < len_sentence:
                        w_context.append(self.word2onehot(sentence[j])) # one-hot vector for each context word
                
                # save the example to the training set
                training_data.append([w_target,w_context]) # each target vector paired with its context vectors
        # dtype=object because the number of context vectors varies per example
        return np.array(training_data, dtype=object)
    
    def word2onehot(self,word):
        """Build the one-hot vector for a word"""
        word_vec = np.zeros(self.count_word)  # all-zero vector with one slot per unique word
        
        word_index = self.word_idx[word]      # the word's unique index
        word_vec[word_index] = 1              # set that slot to 1 to get the word's one-hot vector
        return word_vec
        
    def train(self,training_data):
        """Train the model"""
        # initialize the weights
        self.w1 = np.random.uniform(-1,1,(self.count_word,self.n)) # shape = (count_word, n)
        self.w2 = np.random.uniform(-1,1,(self.n,self.count_word)) # shape = (n, count_word)

        # iterate over the epochs
        for i in range(self.epochs):
            self.loss = 0 # reset the loss for this epoch
            for w_t , w_c in training_data: # iterate over each (target, context) training example
                # forward pass
                y_predict, h, u = self.forward(w_t)
                # sum the prediction error over all context words
                EI = np.sum([np.subtract(y_predict,word) for word in w_c],axis=0)
                # backpropagate and update the weights
                self.backprop(EI, h, w_t)
                # accumulate the skip-gram loss:
                # loss = -(sum of the scores u of the actual context words) + (number of context words) * log(sum(exp(u)))
                self.loss += -np.sum([u[word.tolist().index(1)] for word in w_c]) + len(w_c) * np.log(np.sum(np.exp(u)))
            print ('EPOCH:',i, 'LOSS:', self.loss)
            
    def forward(self,x):
        """Forward pass"""
        h = np.dot(self.w1.T , x) # hidden layer: w1.T has shape (n, count_word) and x has shape (count_word,), so h has shape (n,)
        u = np.dot(self.w2.T , h) # output layer: w2.T has shape (count_word, n) and h has shape (n,), so u has shape (count_word,)
        
        y_c = self.softmax(u)     # softmax activation over the output scores
        return y_c , h ,u
    
    def softmax(self,x):
        """Softmax over the output scores"""
        e_x = np.exp(x-np.max(x))      # subtract the max for numerical stability; x has shape (count_word,)
        return e_x/np.sum(e_x,axis=0)
    
    def backprop(self,e,h,x):
        """Backpropagate the error and update the weights"""
        dl_dw2 = np.outer(h,e)                      # gradient of the loss w.r.t. w2
        dl_dw1 = np.outer(x , np.dot(self.w2,e.T))  # gradient of the loss w.r.t. w1
        
        # update the weights with one gradient-descent step
        self.w1 = self.w1 - (self.learning_rate * dl_dw1)
        self.w2 = self.w2 - (self.learning_rate * dl_dw2)
        

    def word_vec(self, word):
        """Return the learned embedding (row of w1) for a word"""
        w_index = self.word_idx[word]
        v_w = self.w1[w_index]
        return v_w
# instantiate the model and build the training data
w2v = word2vec()
training_data = w2v.generate_train_data(setting,corpus)
print(training_data.shape)

w2v.train(training_data)
(10, 2)
EPOCH: 0 LOSS: 84.26226184477264
EPOCH: 1 LOSS: 82.5334354504378
EPOCH: 2 LOSS: 80.97314092418368
EPOCH: 3 LOSS: 79.556153105198
EPOCH: 4 LOSS: 78.26245748940423
EPOCH: 5 LOSS: 77.07585499766886
EPOCH: 6 LOSS: 75.98302285515354
EPOCH: 7 LOSS: 74.97286011466974
EPOCH: 8 LOSS: 74.0360166890339
EPOCH: 9 LOSS: 73.16454468478804
EPOCH: 10 LOSS: 72.35163395274861
EPOCH: 11 LOSS: 71.59140745991571
EPOCH: 12 LOSS: 70.87876038047602
EPOCH: 13 LOSS: 70.209231949284
EPOCH: 14 LOSS: 69.57890239388053
EPOCH: 15 LOSS: 68.9843093969211
EPOCH: 16 LOSS: 68.42237997289847
EPOCH: 17 LOSS: 67.89037463068387
EPOCH: 18 LOSS: 67.38584139479062
EPOCH: 19 LOSS: 66.90657777131717
EPOCH: 20 LOSS: 66.45059913078315
EPOCH: 21 LOSS: 66.01611227868631
EPOCH: 22 LOSS: 65.60149322077369
EPOCH: 23 LOSS: 65.20526832014733
EPOCH: 24 LOSS: 64.82609819827393
EPOCH: 25 LOSS: 64.46276385909516
EPOCH: 26 LOSS: 64.11415461991506
EPOCH: 27 LOSS: 63.77925751838158
EPOCH: 28 LOSS: 63.457147934657165
EPOCH: 29 LOSS: 63.14698122422874
EPOCH: 30 LOSS: 62.847985201835556
EPOCH: 31 LOSS: 62.55945335252828
EPOCH: 32 LOSS: 62.28073867353736
EPOCH: 33 LOSS: 62.01124807185712
EPOCH: 34 LOSS: 61.75043725849147
EPOCH: 35 LOSS: 61.49780609223435
EPOCH: 36 LOSS: 61.25289433457873
EPOCH: 37 LOSS: 61.01527778361762
EPOCH: 38 LOSS: 60.78456475923181
EPOCH: 39 LOSS: 60.56039291494355
EPOCH: 40 LOSS: 60.342426353940596
EPOCH: 41 LOSS: 60.130353028236684
EPOCH: 42 LOSS: 59.923882400961986
EPOCH: 43 LOSS: 59.72274335253571
EPOCH: 44 LOSS: 59.526682312086336
EPOCH: 45 LOSS: 59.33546159603934
EPOCH: 46 LOSS: 59.148857936344996
EPOCH: 47 LOSS: 58.9666611814089
EPOCH: 48 LOSS: 58.788673153434914
EPOCH: 49 LOSS: 58.61470664660484
w2v.word_vec("natural")
array([-0.45495959,  0.39501012, -0.25014066, -0.32978533,  0.15838857,
       -0.50636667,  0.15753532,  0.84494568])
w2v.word_vec("language")
array([ 0.09572602,  0.04661155,  0.17993407,  0.35106728,  0.38565779,
        1.01885008, -0.34719543,  0.05457249])
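
The learned vectors can be compared with cosine similarity. A minimal sketch (the most_similar helper below is not part of the original code, just an illustration built on the trained w2v model and the numpy import above):

def most_similar(model, word, top_n=3):
    """Rank the vocabulary by cosine similarity to the given word."""
    v = model.word_vec(word)
    sims = {}
    for other in model.words_list:
        if other == word:
            continue
        u = model.word_vec(other)
        sims[other] = np.dot(v, u) / (np.linalg.norm(v) * np.linalg.norm(u))
    return sorted(sims.items(), key=lambda kv: kv[1], reverse=True)[:top_n]

print(most_similar(w2v, "natural"))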
