numpy 实现word2vec

最新推荐文章于 2022-05-05 21:43:34 发布

wenqiang su

最新推荐文章于 2022-05-05 21:43:34 发布

阅读量392

点赞数 1

分类专栏：深度学习

本文链接：https://blog.csdn.net/weixin_42681868/article/details/105486638

版权

深度学习专栏收录该内容

4 篇文章 0 订阅

订阅专栏

参考网址：https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy/

为了实现Word2Vec，有两种风格可以选择，Continuous Bag-of-Words(CBOW)或Skip-gram(SG)。
简单来说，CBOW尝试从相邻单词（上下文单词）猜测输出（目标单词），而Skip-Gram从目标单词猜测上下文单词。实际上，Word2Vec基于分布假说：一个单词的含义可以由经常出现在它附近的单词（即它的上下文）来刻画。因此，通过查看相邻单词，我们可以尝试对目标单词进行预测。

Skip-gram：能够很好地处理少量的训练数据，而且能够很好地表示不常见的单词或短语
CBOW：比skip-gram训练快几倍，对出现频率高的单词的准确度稍微更好一些

word2vec训练出来的词向量其实是训练过程的副产物：词语以one-hot向量的形式输入网络，训练结束后，输入层到隐藏层的权重矩阵W的第i行就是第i个词语的词向量。

from collections import defaultdict
import numpy as np

# 1. Data preparation: a one-sentence toy corpus, lower-cased and split on
#    whitespace. The corpus is a list of sentences, each a list of tokens.
text = "natural language processing and machine learning is fun and exciting"
copus = [text.lower().split()]

# 2. Hyperparameters
setting = {
    "window": 2,            # context radius: 2 words to the left and 2 to the right
    "n": 8,                 # dimensionality of the word vectors
    "epochs": 50,           # number of passes over the training data
    "learning_rate": 0.01,  # step size of the gradient-descent update
}


class word2vec():
    """Minimal skip-gram Word2Vec with a full softmax output, built on NumPy.

    Typical use::

        model = word2vec()
        data = model.generate_train_data(setting, corpus)
        model.train(data)
        vec = model.word_vec("natural")

    Attributes set by ``generate_train_data``: ``count_word``, ``words_list``,
    ``word_idx`` and ``idx_word``.  Attributes set by ``train``: ``w1``
    (shape ``(count_word, n)``), ``w2`` (shape ``(n, count_word)``) and
    ``loss`` (summed loss of the last epoch).
    """

    def __init__(self, config=None):
        """Read the hyperparameters.

        Args:
            config: Mapping with the keys ``'n'`` (embedding size),
                ``'window'`` (context radius), ``'epochs'`` and
                ``'learning_rate'``.  When omitted, the module-level
                ``setting`` dict is used, so ``word2vec()`` behaves as before.
        """
        if config is None:
            config = setting
        self.n = config['n']
        self.window = config['window']
        self.epochs = config['epochs']
        self.learning_rate = config['learning_rate']

    def generate_train_data(self, setting, copus):
        """Build the vocabulary and the (target, context) training pairs.

        Args:
            setting: Unused; kept so existing callers keep working.  The
                window size is taken from ``self.window``.
            copus: Iterable of sentences, each a sequence of word tokens.

        Returns:
            An object array of shape ``(num_tokens, 2)``.  Column 0 holds the
            one-hot vector of the target word, column 1 a list with the
            one-hot vectors of the words within ``self.window`` positions of
            the target (the list is shorter near sentence boundaries).
        """
        # Count every distinct word; insertion order defines the word indices.
        word_counts = defaultdict(int)
        for row in copus:
            for word in row:
                word_counts[word] += 1

        self.count_word = len(word_counts)          # vocabulary size
        self.words_list = list(word_counts)         # index -> word, as a list
        self.word_idx = {word: i for i, word in enumerate(self.words_list)}
        self.idx_word = {i: word for i, word in enumerate(self.words_list)}

        samples = []
        for sentence in copus:
            sentence_len = len(sentence)
            for i, word in enumerate(sentence):
                w_target = self.word2onehot(word)
                # Clamp the window to the sentence and leave out the target itself.
                first = max(0, i - self.window)
                last = min(sentence_len, i + self.window + 1)
                w_context = [
                    self.word2onehot(sentence[j])
                    for j in range(first, last)
                    if j != i
                ]
                samples.append((w_target, w_context))

        # The context lists have different lengths, so the data is ragged.
        # np.array() on ragged input raises ValueError on NumPy >= 1.24; fill
        # an object array cell by cell instead.
        training_data = np.empty((len(samples), 2), dtype=object)
        for row, (w_target, w_context) in enumerate(samples):
            training_data[row, 0] = w_target
            training_data[row, 1] = w_context
        return training_data

    def word2onehot(self, word):
        """Return the one-hot vector (length ``count_word``) of ``word``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        word_vec = np.zeros(self.count_word)
        word_vec[self.word_idx[word]] = 1
        return word_vec

    def train(self, training_data):
        """Train ``w1`` and ``w2`` with per-sample gradient descent.

        Prints the summed loss after every epoch and leaves the last value in
        ``self.loss``.

        Args:
            training_data: Sequence of ``(target_one_hot, context_one_hots)``
                pairs as produced by ``generate_train_data``.
        """
        # Weights are drawn uniformly from [-1, 1).
        self.w1 = np.random.uniform(-1, 1, (self.count_word, self.n))  # (count_word, n)
        self.w2 = np.random.uniform(-1, 1, (self.n, self.count_word))  # (n, count_word)

        for i in range(self.epochs):
            self.loss = 0
            for w_t, w_c in training_data:
                # A target without any context word (single-word sentence)
                # carries no training signal, and its scalar error term would
                # break the gradient shapes in backprop, so skip it.
                if len(w_c) == 0:
                    continue

                # Forward pass.
                y_predict, h, u = self.forward(w_t)
                # Prediction error summed over all context words.
                EI = np.sum([np.subtract(y_predict, word) for word in w_c], axis=0)
                # Backward pass and weight update.
                self.backprop(EI, h, w_t)

                # Loss: -sum_c u[c] + C * log(sum(exp(u))), with the log-sum-exp
                # shifted by max(u) so that exp cannot overflow.
                u_max = np.max(u)
                log_norm = u_max + np.log(np.sum(np.exp(u - u_max)))
                context_scores = np.sum([u[np.argmax(word)] for word in w_c])
                self.loss += -context_scores + len(w_c) * log_norm
            print('EPOCH:', i, 'LOSS:', self.loss)

    def forward(self, x):
        """Run the forward pass for the one-hot input ``x``.

        Returns:
            Tuple ``(y_c, h, u)``: softmax probabilities of shape
            ``(count_word,)``, hidden activation of shape ``(n,)`` and the raw
            output scores of shape ``(count_word,)``.
        """
        h = np.dot(self.w1.T, x)   # (n, count_word) . (count_word,) -> (n,)
        u = np.dot(self.w2.T, h)   # (count_word, n) . (n,) -> (count_word,)
        y_c = self.softmax(u)
        return y_c, h, u

    def softmax(self, x):
        """Return the softmax of ``x``, shifted by ``max(x)`` for stability."""
        e_x = np.exp(x - np.max(x))
        return e_x / np.sum(e_x, axis=0)

    def backprop(self, e, h, x):
        """Apply one gradient-descent step to ``w1`` and ``w2``.

        Args:
            e: Summed prediction error, shape ``(count_word,)``.
            h: Hidden activation from ``forward``, shape ``(n,)``.
            x: One-hot input vector, shape ``(count_word,)``.
        """
        dl_dw2 = np.outer(h, e)                    # gradient w.r.t. w2, (n, count_word)
        dl_dw1 = np.outer(x, np.dot(self.w2, e))   # gradient w.r.t. w1, (count_word, n)

        # Both gradients are computed from the old weights before either update.
        self.w1 = self.w1 - (self.learning_rate * dl_dw1)
        self.w2 = self.w2 - (self.learning_rate * dl_dw2)

    def word_vec(self, word):
        """Return the embedding of ``word``, i.e. its row of ``w1``.

        Raises:
            KeyError: If ``word`` is not in the vocabulary.
        """
        return self.w1[self.word_idx[word]]

# Instantiate the model; the hyperparameters come from the module-level `setting`.
w2v = word2vec()

# Build the vocabulary and the (target, context) training pairs, then report
# the shape of the resulting array.
training_data = w2v.generate_train_data(setting=setting, copus=copus)
print(training_data.shape)

# Fit the embeddings; the loss is printed once per epoch.
w2v.train(training_data=training_data)

(10, 2)
EPOCH: 0 LOSS: 84.26226184477264
EPOCH: 1 LOSS: 82.5334354504378
EPOCH: 2 LOSS: 80.97314092418368
EPOCH: 3 LOSS: 79.556153105198
EPOCH: 4 LOSS: 78.26245748940423
EPOCH: 5 LOSS: 77.07585499766886
EPOCH: 6 LOSS: 75.98302285515354
EPOCH: 7 LOSS: 74.97286011466974
EPOCH: 8 LOSS: 74.0360166890339
EPOCH: 9 LOSS: 73.16454468478804
EPOCH: 10 LOSS: 72.35163395274861
EPOCH: 11 LOSS: 71.59140745991571
EPOCH: 12 LOSS: 70.87876038047602
EPOCH: 13 LOSS: 70.209231949284
EPOCH: 14 LOSS: 69.57890239388053
EPOCH: 15 LOSS: 68.9843093969211
EPOCH: 16 LOSS: 68.42237997289847
EPOCH: 17 LOSS: 67.89037463068387
EPOCH: 18 LOSS: 67.38584139479062
EPOCH: 19 LOSS: 66.90657777131717
EPOCH: 20 LOSS: 66.45059913078315
EPOCH: 21 LOSS: 66.01611227868631
EPOCH: 22 LOSS: 65.60149322077369
EPOCH: 23 LOSS: 65.20526832014733
EPOCH: 24 LOSS: 64.82609819827393
EPOCH: 25 LOSS: 64.46276385909516
EPOCH: 26 LOSS: 64.11415461991506
EPOCH: 27 LOSS: 63.77925751838158
EPOCH: 28 LOSS: 63.457147934657165
EPOCH: 29 LOSS: 63.14698122422874
EPOCH: 30 LOSS: 62.847985201835556
EPOCH: 31 LOSS: 62.55945335252828
EPOCH: 32 LOSS: 62.28073867353736
EPOCH: 33 LOSS: 62.01124807185712
EPOCH: 34 LOSS: 61.75043725849147
EPOCH: 35 LOSS: 61.49780609223435
EPOCH: 36 LOSS: 61.25289433457873
EPOCH: 37 LOSS: 61.01527778361762
EPOCH: 38 LOSS: 60.78456475923181
EPOCH: 39 LOSS: 60.56039291494355
EPOCH: 40 LOSS: 60.342426353940596
EPOCH: 41 LOSS: 60.130353028236684
EPOCH: 42 LOSS: 59.923882400961986
EPOCH: 43 LOSS: 59.72274335253571
EPOCH: 44 LOSS: 59.526682312086336
EPOCH: 45 LOSS: 59.33546159603934
EPOCH: 46 LOSS: 59.148857936344996
EPOCH: 47 LOSS: 58.9666611814089
EPOCH: 48 LOSS: 58.788673153434914
EPOCH: 49 LOSS: 58.61470664660484

w2v.word_vec("natural")

array([-0.45495959,  0.39501012, -0.25014066, -0.32978533,  0.15838857,
       -0.50636667,  0.15753532,  0.84494568])

w2v.word_vec("language")

array([ 0.09572602,  0.04661155,  0.17993407,  0.35106728,  0.38565779,
        1.01885008, -0.34719543,  0.05457249])

wenqiang su

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
numpy 实现word2vec

参考网址：https://nathanrooy.github.io/posts/2018-03-22/word2vec-from-scratch-with-python-and-numpy/为了实现Word2Vec，有两种风格可以选择，Continuous Bag-of-Words(CBOW)或Skip-gram(SG)。简单来说，CBOW尝试从相邻单词（上下文单词）猜测输出（目标单词），而S...
复制链接

扫一扫