Hierarchical Softmax、CBOW词袋模型抽象化及其公式推导以及Python代码实现（包括注释）

最新推荐文章于 2022-10-17 22:57:50 发布
flying_coder
最新推荐文章于 2022-10-17 22:57:50 发布
阅读量625
点赞数
分类专栏：统计学习方法 GNN学习笔记文章标签： CBOW Hierarchical Softmax word2vec
本文链接：https://blog.csdn.net/qwezhaohaihong/article/details/107618902
版权
GNN学习笔记同时被 2 个专栏收录
35 篇文章 13 订阅
订阅专栏
统计学习方法
6 篇文章 0 订阅
订阅专栏
接下来是我的详细的推导过程
接下来是代码的实现部分：
import argparse
import math
import struct
import sys
import time
import warnings
import os

import numpy as np
#from multiprocessing import Pool,Value,Array
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
##    multiprocessing需要在linux环境下使用！！！！
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

import multiprocessing


class VocabItem:
    """One vocabulary entry: the token text, its count and its Huffman data."""

    def __init__(self, word):
        # Token text exactly as it was passed in.
        self.word = word
        # Occurrence counter; starts at zero and is incremented by the owner.
        self.count = 0
        # Path from the Huffman root to this word, and its Huffman code.
        # Both stay None until something assigns them.
        self.path = self.code = None

class Vocab:
    """Vocabulary built from a whitespace-tokenised UTF-8 training file.

    Attributes:
        vocab_items: list of VocabItem objects, sorted by descending count.
        vocab_hash: dict mapping each token string to its index in vocab_items.
        word_count: total number of tokens counted in the training file,
            including one '<bol>' and one '<eol>' per line.
        bytes: value of ``tell()`` on the training file after the last line
            has been read.
    """

    def __init__(self, fi, min_count):
        """Read the training file at path ``fi`` and build the vocabulary.

        Args:
            fi: path of the training file (opened as UTF-8 text).
            min_count: tokens counted fewer than ``min_count`` times are
                merged into the special '<unk>' token.
        """
        vocab_items = []
        vocab_hash = {}
        word_count = 0

        # Special tokens <bol> (beginning of line) and <eol> (end of line).
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))

        # The context manager closes the file even if reading fails.
        with open(fi, 'r', encoding='utf-8') as train_file:
            for line in train_file:
                for token in line.split():
                    if token not in vocab_hash:
                        # First occurrence: register a new entry.
                        vocab_hash[token] = len(vocab_items)
                        vocab_items.append(VocabItem(token))

                    vocab_items[vocab_hash[token]].count += 1
                    word_count += 1  # counts repeated occurrences as well

                # Every line contributes one <bol> and one <eol>.
                vocab_items[vocab_hash['<bol>']].count += 1
                vocab_items[vocab_hash['<eol>']].count += 1
                word_count += 2

            self.bytes = train_file.tell()

        self.vocab_items = vocab_items  # list of VocabItem objects
        self.vocab_hash = vocab_hash    # token -> index in vocab_items
        self.word_count = word_count    # total number of tokens in the file

        # Add the special token <unk>, merge words occurring fewer than
        # min_count times into it, and sort by descending frequency.
        self.__sort(min_count)

        # These prints only report the totals; nothing is verified here.
        print("Total words in training file: %d" % self.word_count)
        print("Total bytes in training file: %d" % self.bytes)
        print("Vocab size: %d" % len(self))

    def __getitem__(self, i):
        """Return the VocabItem stored at index ``i``."""
        return self.vocab_items[i]

    def __len__(self):
        """Return the number of vocabulary entries."""
        return len(self.vocab_items)

    def __iter__(self):
        """Iterate over the VocabItem objects in index order."""
        return iter(self.vocab_items)

    def __contains__(self, key):
        """Return True when the token string ``key`` is in the vocabulary."""
        return key in self.vocab_hash

    def __sort(self, min_count):
        """Merge rare tokens into '<unk>' and sort by descending count."""
        unk = VocabItem('<unk>')
        kept = [unk]

        for token in self.vocab_items:
            if token.count < min_count:
                # Rare token: only its count survives, folded into <unk>.
                unk.count += token.count
            else:
                kept.append(token)

        # list.sort is stable, so entries with equal counts keep their order.
        kept.sort(key=lambda token: token.count, reverse=True)

        # Rebuild the token -> index mapping for the new ordering.
        self.vocab_hash = {token.word: i for i, token in enumerate(kept)}
        self.vocab_items = kept

    def indices(self, tokens):
        """Map token strings to vocabulary indices.

        Tokens that are not in the vocabulary map to the index of '<unk>'.
        """
        unk_index = self.vocab_hash['<unk>']
        return [self.vocab_hash.get(token, unk_index) for token in tokens]

    def encode_huffman(self):
        """Build a Huffman tree over the counts and set path/code per item.

        The tree is stored in flat arrays rather than linked nodes. Indices
        ``0 .. vocab_size-1`` are the leaves (the words) and indices
        ``vocab_size .. 2*vocab_size-2`` are the inner nodes. After the call
        each item has:
            path: inner-node indices (offset by -vocab_size) from the root
                down towards the word.
            code: the 0/1 edge labels along that path, root first.
        """
        vocab_size = len(self)
        # Leaf counts followed by placeholders for the inner nodes. 1e15 acts
        # as "infinity" so that an inner node that has not been created yet
        # is never picked as a minimum.
        count = [t.count for t in self] + [1e15] * (vocab_size - 1)
        parent = [0] * (2 * vocab_size - 2)  # parent index of each node
        binary = [0] * (2 * vocab_size - 2)  # label of the edge to the parent

        # The items are sorted by descending count, so pos1 walks the leaves
        # from the rarest word backwards while pos2 walks the inner nodes in
        # creation order (their counts are non-decreasing).
        pos1 = vocab_size - 1
        pos2 = vocab_size

        for i in range(vocab_size - 1):
            # Pick the two smallest remaining nodes, one after the other.
            picked = []
            for _ in range(2):
                if pos1 >= 0 and count[pos1] < count[pos2]:
                    picked.append(pos1)
                    pos1 -= 1
                else:
                    picked.append(pos2)
                    pos2 += 1
            min1, min2 = picked

            # Create inner node vocab_size + i as the parent of both.
            count[vocab_size + i] = count[min1] + count[min2]
            parent[min1] = vocab_size + i
            parent[min2] = vocab_size + i
            binary[min2] = 1  # the second pick gets edge label 1

        # Assign the binary code and the path to every vocabulary word.
        root_idx = 2 * vocab_size - 2
        for i, token in enumerate(self):
            path = []  # inner nodes visited, collected leaf -> root
            code = []  # edge labels, collected leaf -> root

            node_idx = i
            while node_idx < root_idx:
                if node_idx >= vocab_size:
                    path.append(node_idx)
                code.append(binary[node_idx])
                node_idx = parent[node_idx]
            path.append(root_idx)

            # Reverse so that both run from the root towards the leaf, and
            # shift inner-node indices to start at zero.
            token.path = [j - vocab_size for j in path[::-1]]
            token.code = code[::-1]
#带有Huffman结构的语料库构建完成

class UnigramTale:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        # Unfinished for now: this is the negative-sampling part. Only the
        # two values below are computed; nothing is stored on the instance.
        exponent = 0.75
        n_tokens = len(vocab)

def sigmoid(z):
    """Return the logistic sigmoid of ``z``, clamped outside [-6, 6].

    For ``z > 6`` the result is exactly 1.0 and for ``z < -6`` exactly 0.0;
    the sigmoid is already very close to those values there, so the shortcut
    avoids evaluating the exponential.
    """
    if z > 6:
        return 1.0
    if z < -6:
        return 0.0
    # The parentheses matter: 1/1+exp(-z) would evaluate to 1 + exp(-z).
    return 1 / (1 + math.exp(-z))

def init_net(dim,vocab_size):
    """Create the two weight matrices of the network.

    Args:
        dim: dimensionality of the word embeddings.
        vocab_size: number of rows to allocate in each matrix.

    Returns:
        A tuple ``(syn0, syn1)`` of arrays of shape ``(vocab_size, dim)``.
        syn0 holds the initial word embeddings, drawn uniformly from the
        interval [-0.5/dim, 0.5/dim). syn1 is all zeros; it holds the
        parameter vectors (theta) of the nodes on the Huffman paths.
    """
    syn0 = np.random.uniform(low=-0.5 / dim, high=0.5 / dim, size=(vocab_size, dim))
    syn1 = np.zeros(shape=(vocab_size, dim))
    return (syn0, syn1)

def train_process(vocab, syn0, syn1, table, cbow, neg, dim, starting_alpha,win, num_processes, global_word_count, fi,file_size):
    """Run one pass of CBOW / hierarchical-softmax training over ``fi``.

    ``syn0`` and ``syn1`` are updated in place. Only the ``cbow`` branch
    performs updates; ``table``, ``neg`` and ``num_processes`` are accepted
    but not used here.

    Args:
        vocab: vocabulary providing ``indices()``, ``vocab_items`` (whose
            entries carry ``path`` and ``code``) and ``word_count``.
        syn0: word-embedding matrix, indexed by vocabulary index.
        syn1: parameter vectors of the Huffman inner nodes, indexed by the
            entries of each item's ``path``.
        table: unused.
        cbow: when true, perform the CBOW update for every token.
        neg: unused.
        dim: expected length of an embedding vector.
        starting_alpha: initial learning rate.
        win: maximum context window size.
        num_processes: unused.
        global_word_count: starting value of the processed-word counter used
            for the learning-rate schedule and the progress output.
        fi: open text file positioned at the start of the data; it is
            closed before this function returns, even on error.
        file_size: reading stops once ``fi.tell()`` reaches this value.
    """
    alpha = starting_alpha
    word_count = 0
    last_word_count = 0

    try:
        while fi.tell() < file_size:
            raw_line = fi.readline()
            if not raw_line:
                # readline() returns '' only at end of file.
                break
            line = raw_line.strip()
            if not line:
                # Skip blank lines instead of aborting the whole training run.
                continue

            # Indices of the words in this line, framed by <bol> / <eol>.
            sent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])

            # token is the vocabulary index of the word, not the word itself.
            for sent_pos, token in enumerate(sent):
                if word_count % 1000 == 0:
                    print("已完成的词语训练个数：%d ----------------"%word_count)
                    # Every 1000 words, fold the local progress into the
                    # global counter used by the learning-rate schedule.
                    global_word_count += (word_count - last_word_count)
                    last_word_count = word_count

                    # Linearly decay alpha with progress, with a lower bound.
                    alpha = starting_alpha * (1 - float(global_word_count) / vocab.word_count)
                    if alpha < starting_alpha * 0.0001:
                        alpha = starting_alpha * 0.0001

                    sys.stdout.write("\rAlpha:%f Progress:%d of %d (%.2f%%)" %
                                     (alpha, global_word_count, vocab.word_count,
                                      float(global_word_count) / vocab.word_count * 100))

                # Randomise the window size; win is the maximum.
                current_win = np.random.randint(low=1, high=win + 1)
                context_start = max(sent_pos - current_win, 0)
                context_end = min(sent_pos + current_win + 1, len(sent))
                # Everything in the window except the position being predicted.
                context = sent[context_start:sent_pos] + sent[sent_pos + 1:context_end]

                if cbow:
                    # Input of the model: mean of the context word vectors.
                    context_mean = np.mean(np.array([syn0[c] for c in context]), axis=0)
                    assert len(context_mean) == dim

                    # Accumulated error to pass back to the context vectors.
                    context_error = np.zeros(dim)

                    # Pair every inner node on the word's Huffman path with
                    # the code bit chosen at that node.
                    item = vocab.vocab_items[token]
                    classifiers = zip(item.path, item.code)

                    for target, label in classifiers:
                        z = np.dot(context_mean, syn1[target])
                        p = sigmoid(z)
                        # Gradient factor shared by both updates: learning
                        # rate times (known code bit - predicted probability).
                        g = alpha * (label - p)

                        # Use syn1[target] before it is modified below.
                        context_error += g * syn1[target]
                        syn1[target] += g * context_mean  # update theta

                    # CBOW hands the whole error to every context word.
                    for context_word in context:
                        syn0[context_word] += context_error
                word_count += 1

            # Note: the Huffman tree is only a training device. Each inner
            # node has a parameter vector of the same length as an embedding,
            # so the tree size determines how many parameter vectors exist,
            # not the embedding dimension.

            # Per-line progress. last_word_count must be advanced here too,
            # otherwise the same words are added to the counter again later.
            global_word_count += (word_count - last_word_count)
            last_word_count = word_count
            sys.stdout.write("\rAlpha:%f Progress:%d of %d (%.2f%%)" %
                             (alpha, global_word_count, vocab.word_count,
                              float(global_word_count) / vocab.word_count * 100))
        sys.stdout.flush()
    finally:
        fi.close()

def save(vocab, syn0, fo, binary):
    """Write the embeddings in ``syn0`` to the file at path ``fo``.

    Both formats start with a header line ``"<rows> <dim>"``.

    Args:
        vocab: iterable of items with a ``word`` attribute, in the same
            order as the rows of ``syn0``.
        syn0: sequence of embedding vectors; must contain at least one row,
            because the dimension is read from ``syn0[0]``.
        fo: output path.
        binary: when true, the header is followed by an empty line and each
            word is written as UTF-8 text, a space, the vector packed as
            native-order 4-byte floats, and a newline. When false, each word
            is written as one text line ``"<word> <v1> <v2> ..."``.
    """
    print('Saving model to', fo)
    dim = len(syn0[0])
    header = '%d %d\n' % (len(syn0), dim)

    # ``with`` guarantees the file is closed even if a write raises.
    if binary:
        with open(fo, 'wb') as out:
            out.write(header.encode(encoding='utf-8'))
            out.write(('\n').encode(encoding='utf-8'))
            for token, vector in zip(vocab, syn0):
                out.write(('%s ' % token.word).encode(encoding='utf-8'))
                out.write(b''.join(struct.pack('f', s) for s in vector))
                out.write(('\n').encode(encoding='utf-8'))
    else:
        with open(fo, 'w', encoding='utf-8') as out:
            out.write(header)
            for token, vector in zip(vocab, syn0):
                vector_str = ' '.join(str(s) for s in vector)
                out.write('%s %s\n' % (token.word, vector_str))


# def __init_process(args):
#     print("hello6")
#     global vocab, syn0, syn1, table, cbow, neg, dim, starting_alpha
#     global win, num_processes, global_word_count, fi
#     vocab, syn0_tmp, syn1_tmp, table, cbow, neg, dim, starting_alpha, win, num_processes, global_word_count = args[:-1]
#     print(global_word_count)
#     print("hello7")
#     fi = open(args[-1], 'r',encoding='utf-8')
#     print("hello8")
#     with warnings.catch_warnings():
#         warnings.simplefilter('ignore', RuntimeWarning)
#         syn0 = np.ctypeslib.as_array(syn0_tmp)
#         syn1 = np.ctypeslib.as_array(syn1_tmp)
#

def train(fi, fo, cbow, neg, dim, alpha, win, min_count, num_processes, binary):
    """Build the vocabulary, train the embeddings and save the model.

    Args:
        fi: path of the training file.
        fo: path of the output model file.
        cbow: passed through to train_process to select the CBOW update.
        neg: number of negative samples requested. Negative sampling is not
            implemented in this file, so a value > 0 only triggers a warning
            and hierarchical softmax is used instead.
        dim: dimensionality of the word embeddings.
        alpha: starting learning rate.
        win: maximum context window size.
        min_count: words seen fewer times are merged into '<unk>'.
        num_processes: passed through to train_process.
        binary: when true, save the model in binary format.
    """
    # Read the training file to initialise the vocabulary.
    vocab = Vocab(fi, min_count)
    file_size = os.path.getsize(fi)

    # Initialise the network weights.
    syn0, syn1 = init_net(dim, len(vocab))

    global_word_count = 0
    table = None
    if neg > 0:
        # No unigram table is built anywhere in this file; without the
        # Huffman codes the training loop would fail on path/code being None.
        warnings.warn('Negative sampling is not implemented; '
                      'falling back to hierarchical softmax')
    print('Initializing Huffman tree')
    vocab.encode_huffman()

    # Train in the current process.
    t0 = time.time()
    with open(fi, 'r', encoding='utf-8') as train_file:
        # train_process closes the file itself; the context manager also
        # covers the case where it raises before doing so.
        train_process(vocab, syn0, syn1, table, cbow, neg, dim, alpha, win,
                      num_processes, global_word_count, train_file, file_size)

    t1 = time.time()
    print('Completed training. Training took', (t1 - t0) / 60, 'minutes')

    # Save the model to file.
    save(vocab, syn0, fo, binary)


if __name__ == '__main__':
    # Command-line entry point: parse the options and run one training pass.
    parser = argparse.ArgumentParser()
    parser.add_argument('-train', help='Training file', dest='fi', required=True)
    parser.add_argument('-model', help='Output model file', dest='fo', required=True)
    parser.add_argument('-cbow', help='1 for CBOW, 0 for skip-gram', dest='cbow', default=1, type=int)
    # Default is 0 (hierarchical softmax): negative sampling is not
    # implemented in this file, so a positive default selected a code path
    # that cannot train.
    parser.add_argument('-negative',
                        help='Number of negative examples (>0) for negative sampling, 0 for hierarchical softmax',
                        dest='neg', default=0, type=int)
    parser.add_argument('-dim', help='Dimensionality of word embeddings', dest='dim', default=100, type=int)
    parser.add_argument('-alpha', help='Starting alpha', dest='alpha', default=0.025, type=float)
    parser.add_argument('-window', help='Max window length', dest='win', default=5, type=int)
    parser.add_argument('-min-count', help='Min count for words used to learn <unk>', dest='min_count', default=5,
                        type=int)
    parser.add_argument('-processes', help='Number of processes', dest='num_processes', default=1, type=int)
    parser.add_argument('-binary', help='1 for output model in binary format, 0 otherwise', dest='binary', default=0,
                        type=int)
    # TO DO: parser.add_argument('-epoch', help='Number of training epochs', dest='epoch', default=1, type=int)
    args = parser.parse_args()

    train(args.fi, args.fo, bool(args.cbow), args.neg, args.dim, args.alpha, args.win,
          args.min_count, args.num_processes, bool(args.binary))

#python cbow_hierachical_softmax_single_process.py -train="hello_cbow.txt" -model=cbow_save_file -cbow=1 -negative=0 -dim=100 -alpha=0.025 -window=5 -min-count=5 -processes=4 -binary=1