word2vec in Detail (CBOW, SG, hierarchical softmax, negative sampling)

  • For an explanation of what word2vec is used for, see this blog post.
  • At first I simply followed the paper "word2vec Parameter Learning Explained" to derive the formulas; my understanding deepened later on, so the later formulas are organised a little differently from the earlier ones. The key objectives are recapped below.
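The derivation figures from the original post are not reproduced here. For reference, the core objectives from that paper can be summarised as follows (standard notation: h is the hidden/projection vector, v_w and v'_w are the input and output vectors of word w):

    % Hidden layer: CBOW averages the input vectors of the C context words; skip-gram uses the input word directly
    h = \frac{1}{C}\sum_{c=1}^{C} v_{w_c} \quad \text{(CBOW)}, \qquad h = v_{w_I} \quad \text{(skip-gram)}

    % Negative-sampling loss for one (output word w_O, context) pair, with W_neg drawn from the unigram^{0.75} distribution
    E = -\log \sigma\left({v'}_{w_O}^{\top} h\right) - \sum_{w_j \in W_{neg}} \log \sigma\left(-{v'}_{w_j}^{\top} h\right)

    % Hierarchical softmax: a product of binary decisions along the Huffman path of w_O
    p(w_O \mid \text{context}) = \prod_{j=1}^{L(w_O)-1} \sigma\left( [\![ n(w_O, j+1) = \mathrm{ch}(n(w_O, j)) ]\!] \cdot {v'}_{n(w_O, j)}^{\top} h \right)

Here [[x]] is +1 if x is true and -1 otherwise, L(w) is the length of w's path, n(w, j) is the j-th node on that path, and ch(n) denotes the left child of node n. The code below implements these two output layers (UnigramTable for negative sampling, Vocab.encode_huffman for hierarchical softmax).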

  • Python code (my understanding of it is still not deep): link
import argparse
import math
import struct
import sys
import time
import warnings

import numpy as np

from multiprocessing import Pool, Value, Array

class VocabItem:
    def __init__(self, word):
        self.word = word
        self.count = 0
        self.path = None # Path (list of indices) from the root to the word (leaf) in the Huffman tree
        self.code = None # Huffman encoding (list of 0/1 bits) of the word

class Vocab:
    def __init__(self, fi, min_count):
        vocab_items = []
        vocab_hash = {}
        word_count = 0
        fi = open(fi, 'r')

        # Add special tokens <bol> (beginning of line) and <eol> (end of line)
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))

        # Scan the training text line by line
        for line in fi:
            tokens = line.split()
            # Scan every token in the line
            for token in tokens:
                # Add unseen words to the vocabulary
                if token not in vocab_hash:
                    vocab_hash[token] = len(vocab_items)
                    vocab_items.append(VocabItem(token))

                #assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'
                # Count occurrences of each word
                vocab_items[vocab_hash[token]].count += 1
                # Total number of tokens read so far
                word_count += 1

                if word_count % 10000 == 0:
                    sys.stdout.write("\rReading word %d" % word_count)
                    sys.stdout.flush()

            # Add special tokens <bol> (beginning of line) and <eol> (end of line)
            # Every line contributes one <bol> and one <eol>, so add 2 to the word count
            vocab_items[vocab_hash['<bol>']].count += 1
            vocab_items[vocab_hash['<eol>']].count += 1
            word_count += 2

        self.bytes = fi.tell() # Current file position, i.e. the size of the training file in bytes
        fi.close()
        self.vocab_items = vocab_items         # List of VocabItem objects
        self.vocab_hash = vocab_hash           # Mapping from each token to its index in vocab
        self.word_count = word_count           # Total number of words in train file

        # Add special token <unk> (unknown),
        # merge words occurring less than min_count into <unk>, and
        # sort vocab in descending order by frequency in train file
        # (handled by __sort below)
        self.__sort(min_count)

        #assert self.word_count == sum([t.count for t in self.vocab_items]), 'word_count and sum of t.count do not agree'
        print ('Total words in training file: %d' % self.word_count)
        print ('Total bytes in training file: %d' % self.bytes)
        print ('Vocab size: %d' % len(self))

    def __getitem__(self, i):
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        tmp = []
        tmp.append(VocabItem('<unk>'))
        unk_hash = 0
        
        count_unk = 0
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
            else:
                tmp.append(token)

        tmp.sort(key=lambda token : token.count, reverse=True) # Sort by frequency in descending order

        # Update vocab_hash
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i

        self.vocab_items = tmp
        self.vocab_hash = vocab_hash

        print ('Unknown vocab size:', count_unk)

    def indices(self, tokens):
        return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]

    def encode_huffman(self):
        # Building a Huffman tree
        vocab_size = len(self)
        # count[:vocab_size] holds the word frequencies, already sorted in descending order
        count = [t.count for t in self] + [1e15] * (vocab_size - 1)  # leaf counts followed by slots for the internal nodes
        # The trailing (vocab_size - 1) slots will hold the counts of the internal nodes created
        # while merging; they start at 1e15 so that a slot that has not been filled in yet can
        # never win the "smaller count" comparison below
        parent = [0] * (2 * vocab_size - 2)  # parent index of every node except the root
        binary = [0] * (2 * vocab_size - 2)  # 0/1 label of the edge from each node to its parent

        pos1 = vocab_size - 1  # scans the leaf part count[:vocab_size] backwards; since the vocab is sorted
        # in descending order, index vocab_size - 1 is the least frequent word
        pos2 = vocab_size      # scans the internal-node part count[vocab_size:] forwards

        for i in range(vocab_size - 1):
            # Find min1: the unused node with the smallest count
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min1 = pos1
                    pos1 -= 1  # consume the least frequent unused leaf; move the leaf pointer toward more frequent words
                else:
                    min1 = pos2
                    pos2 += 1
            else:
                min1 = pos2
                pos2 += 1
            # Find min2: the unused node with the second smallest count
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min2 = pos1
                    pos1 -= 1
                else:
                    min2 = pos2
                    pos2 += 1
            else:
                min2 = pos2
                pos2 += 1

            count[vocab_size + i] = count[min1] + count[min2]  # the new internal node's count is the sum of its two children
            parent[min1] = vocab_size + i  # record the parent of node min1
            parent[min2] = vocab_size + i  # record the parent of node min2
            binary[min2] = 1  # label min2's edge to the new parent with 1 (min1's edge keeps label 0)

        # At this point the Huffman tree is complete. Unlike a pointer-based C/C++ implementation,
        # the tree is stored implicitly in arrays: parent[] holds each node's parent index,
        # binary[] the 0/1 label of the edge to that parent, and count[] the node frequencies.
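        # Worked toy example (added for illustration, not in the original code): for sorted
        # counts [4, 2, 1, 1] (vocab_size = 4) the loop above produces
        #     count  = [4, 2, 1, 1, 2, 4, 8]   (the three internal-node counts appended)
        #     parent = [6, 5, 4, 4, 5, 6]      (parent index of each non-root node)
        #     binary = [1, 1, 1, 0, 0, 0]      (label of the edge to that parent)
        # which yields the Huffman codes 1, 01, 001, 000 for the four words.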

        # Assign binary code and path pointers to each vocab word, now that the tree is built.
        # Note (original author's understanding): word frequencies are used here mainly so that the
        # standard Huffman construction can be reused; any full binary tree over the vocabulary would
        # give a valid coding, but using frequencies is the systematic way to build one, and it gives
        # frequent words shorter codes, which is exactly what hierarchical softmax wants.

        root_idx = 2 * vocab_size - 2  # index of the root node
        for i, token in enumerate(self):
            path = []  # internal nodes visited on the way from this word up to the root
            code = []  # the corresponding 0/1 edge labels, collected leaf-to-root and reversed below

            node_idx = i
            while node_idx < root_idx:
                if node_idx >= vocab_size: path.append(node_idx)  # record internal nodes only
                code.append(binary[node_idx])  # bits are collected leaf-to-root and reversed below
                node_idx = parent[node_idx]
            path.append(root_idx)

            # Store path and code in root-to-leaf order
            token.path = [j - vocab_size for j in path[::-1]]  # [::-1] reverses the list; subtracting vocab_size
            # maps internal-node indices (vocab_size .. 2*vocab_size-2) to row indices 0 .. vocab_size-2 of the
            # output matrix syn1, whose rows play the role of the internal nodes in hierarchical softmax
            token.code = code[::-1]


'''
Unigram, bigram and trigram are n-gram concepts from natural language processing (NLP);
see the parent topic: n-gram.
unigram: a single word/character
bigram:  two consecutive words/characters
trigram: three consecutive words/characters
For example, the characters of 西安交通大学 give:
unigram: 西/安/交/通/大/学
bigram:  西安/安交/交通/通大/大学
trigram: 西安交/安交通/交通大/通大学
'''
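# Illustration (not part of the original script): a minimal sketch of the character
# n-gram example described in the comment above.
def char_ngrams(text, n):
    """Return the list of character n-grams of text (n=1 unigrams, n=2 bigrams, ...)."""
    return [text[i:i + n] for i in range(len(text) - n + 1)]

# char_ngrams('西安交通大学', 2) -> ['西安', '安交', '交通', '通大', '大学']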
class UnigramTable:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        vocab_size = len(vocab)
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab]) # Normalizing constant
        table_size = int(1e8) # Length of the unigram table
        table = np.zeros(table_size, dtype=np.uint32)

        print ('Filling unigram table')
        p = 0 # Cumulative probability
        i = 0
        for j, unigram in enumerate(vocab):
            p += float(math.pow(unigram.count, power))/norm
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
        self.table = table

    def sample(self, count):
        # Draw `count` word indices at random from the table (used as negative samples)
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]

def sigmoid(z):
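    # Note (added): clipping at |z| > 6 avoids overflow in math.exp and mirrors the
    # MAX_EXP = 6 range of the precomputed sigmoid table in the original word2vec.c.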
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))

def init_net(dim, vocab_size):
    # Init syn0 (input word vectors) with random numbers drawn uniformly from [-0.5/dim, 0.5/dim];
    # vocab_size is the vocabulary size
    tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
    # Expose the numpy array as a ctypes object so it can be wrapped in a shared Array
    syn0 = np.ctypeslib.as_ctypes(tmp)
    syn0 = Array(syn0._type_, syn0, lock=False)
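    # Note (added): lock=False exposes the weights as a raw shared-memory array, so the
    # worker processes update them without locking (Hogwild-style asynchronous SGD,
    # as in the original C implementation).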

    # Init syn1 with zeros
    tmp = np.zeros(shape=(vocab_size, dim))
    syn1 = np.ctypeslib.as_ctypes(tmp)
    syn1 = Array(syn1._type_, syn1, lock=False)

    return (syn0, syn1)

def train_process(pid):
    # Set fi to point to the right chunk of training file
    start = vocab.bytes // num_processes * pid
    end = vocab.bytes if pid == num_processes - 1 else vocab.bytes // num_processes * (pid + 1)
    fi.seek(start)
    #print 'Worker %d beginning training at %d, ending at %d' % (pid, start, end)

    alpha = starting_alpha

    word_count = 0
    last_word_count = 0

    while fi.tell() < end:
        line = fi.readline().strip()
        # Skip blank lines
        if not line:
            continue

        # Init sent, a list of indices of words in line
        sent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])

        for sent_pos, token in enumerate(sent):
            if word_count % 10000 == 0:
                global_word_count.value += (word_count - last_word_count)
                last_word_count = word_count

                # Recalculate alpha
                alpha = starting_alpha * (1 - float(global_word_count.value) / vocab.word_count)
                if alpha < starting_alpha * 0.0001: alpha = starting_alpha * 0.0001
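                # Note (added): this is the linear learning-rate decay of the original
                # word2vec.c, floored at 0.01% of the starting alpha.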

                # Print progress info
                sys.stdout.write("\rAlpha: %f Progress: %d of %d (%.2f%%)" %
                                 (alpha, global_word_count.value, vocab.word_count,
                                  float(global_word_count.value) / vocab.word_count * 100))
                sys.stdout.flush()

            # Randomize window size, where win is the max window size
            # Collect the indices of the words inside the (randomly shrunken) context window
            current_win = np.random.randint(low=1, high=win+1)
            context_start = max(sent_pos - current_win, 0)
            context_end = min(sent_pos + current_win + 1, len(sent))
            context = sent[context_start:sent_pos] + sent[sent_pos+1:context_end] # Turn into an iterator?
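            # Note (added): drawing the window size uniformly from [1, win] effectively
            # weights context words by their distance to the center word, as the original
            # word2vec.c does with its dynamic window.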

            # CBOW
            if cbow:
                # Compute neu1, the hidden (projection) vector: the mean of the context word vectors
                neu1 = np.mean(np.array([syn0[c] for c in context]), axis=0)
                assert len(neu1) == dim, 'neu1 and dim do not agree'

                # Init neu1e with zeros
                neu1e = np.zeros(dim)

                # Compute neu1e and update syn1
                if neg > 0:
                    # Negative sampling: the true center word is the positive example (label 1),
                    # the neg words drawn from the unigram table are negative examples (label 0)
                    classifiers = [(token, 1)] + [(target, 0) for target in table.sample(neg)]
                else:
                    # Hierarchical softmax: classify against the internal tree nodes on the
                    # center word's Huffman path, with the code bits as labels
                    classifiers = zip(vocab[token].path, vocab[token].code)
                for target, label in classifiers:
                    # Inner product of the hidden vector and the target's output vector (row of syn1)
                    z = np.dot(neu1, syn1[target])
                    p = sigmoid(z)
                    # (label - p) is the prediction error; alpha is the learning rate
                    g = alpha * (label - p)
                    neu1e += g * syn1[target] # Error to backpropagate to syn0 (computed before syn1 is updated)
                    syn1[target] += g * neu1  # Update syn1


                # Update syn0
                for context_word in context:
                    syn0[context_word] += neu1e

            # Skip-gram
            else:
                for context_word in context:
                    # Init neu1e with zeros
                    neu1e = np.zeros(dim)

                    # Compute neu1e and update syn1
                    if neg > 0:
                        classifiers = [(token, 1)] + [(target, 0) for target in table.sample(neg)]
                    else:
                        classifiers = zip(vocab[token].path, vocab[token].code)
                    for target, label in classifiers:
                        z = np.dot(syn0[context_word], syn1[target])
                        p = sigmoid(z)
                        g = alpha * (label - p)
                        neu1e += g * syn1[target]              # Error to backpropagate to syn0
                        syn1[target] += g * syn0[context_word] # Update syn1

                    # Update syn0
                    syn0[context_word] += neu1e

            word_count += 1

    # Print progress info
    global_word_count.value += (word_count - last_word_count)
    sys.stdout.write("\rAlpha: %f Progress: %d of %d (%.2f%%)" %
                     (alpha, global_word_count.value, vocab.word_count,
                      float(global_word_count.value)/vocab.word_count * 100))
    sys.stdout.flush()
    fi.close()

def save(vocab, syn0, fo, binary):
    print ('Saving model to', fo)
    dim = len(syn0[0])
    # The binary output format supported by the original script is not used here;
    # the model is always saved as text: a "<vocab_size> <dim>" header line followed
    # by one line per word ("word v1 v2 ... v_dim").
    fo = open(fo, 'w')
    fo.write('%d %d\n' % (len(syn0), dim))
    for token, vector in zip(vocab, syn0):
        word = token.word
        vector_str = ' '.join([str(s) for s in vector])
        fo.write('%s %s\n' % (word, vector_str))

    fo.close()

def __init_process(*args):
    global vocab, syn0, syn1, table, cbow, neg, dim, starting_alpha
    global win, num_processes, global_word_count, fi
    
    vocab, syn0_tmp, syn1_tmp, table, cbow, neg, dim, starting_alpha, win, num_processes, global_word_count = args[:-1]
    fi = open(args[-1], 'r')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        syn0 = np.ctypeslib.as_array(syn0_tmp)
        syn1 = np.ctypeslib.as_array(syn1_tmp)

def train(fi, fo, cbow, neg, dim, alpha, win, min_count, num_processes, binary):
    # Read train file to init vocab
    vocab = Vocab(fi, min_count)

    # Init net
    # dim: dimensionality of the word vectors; len(vocab): vocabulary size
    # Initialise the two weight matrices of word2vec, i.e. the input and output word-vector matrices
    syn0, syn1 = init_net(dim, len(vocab))

    global_word_count = Value('i', 0)
    table = None
    if neg > 0:
        print( 'Initializing unigram table')
        table = UnigramTable(vocab)
    else:
        print ('Initializing Huffman tree')
        vocab.encode_huffman()

    # Begin training using num_processes workers
    t0 = time.time()
    pool = Pool(processes=num_processes, initializer=__init_process,
                initargs=(vocab, syn0, syn1, table, cbow, neg, dim, alpha,
                          win, num_processes, global_word_count, fi))
    pool.map(train_process, range(num_processes))
    t1 = time.time()
    print ('Completed training. Training took', (t1 - t0) / 60, 'minutes')

    # Save model to file
    save(vocab, syn0, fo, binary)

if __name__ == '__main__':
    # parser = argparse.ArgumentParser()
    # parser.add_argument('-train', help='Training file', dest='fi', required=True)
    # parser.add_argument('-model', help='Output model file', dest='fo', required=True)
    # parser.add_argument('-cbow', help='1 for CBOW, 0 for skip-gram', dest='cbow', default=1, type=int)
    # parser.add_argument('-negative', help='Number of negative examples (>0) for negative sampling, 0 for hierarchical softmax', dest='neg', default=5, type=int)
    # parser.add_argument('-dim', help='Dimensionality of word embeddings', dest='dim', default=100, type=int)
    # parser.add_argument('-alpha', help='Starting alpha', dest='alpha', default=0.025, type=float)
    # parser.add_argument('-window', help='Max window length', dest='win', default=5, type=int)
    # parser.add_argument('-min-count', help='Min count for words used to learn <unk>', dest='min_count', default=5, type=int)
    # parser.add_argument('-processes', help='Number of processes', dest='num_processes', default=1, type=int)
    # parser.add_argument('-binary', help='1 for output model in binary format, 0 otherwise', dest='binary', default=0, type=int)
    # #TO DO: parser.add_argument('-epoch', help='Number of training epochs', dest='epoch', default=1, type=int)
    # args = parser.parse_args()

    # train(args.fi, args.fo, bool(args.cbow), args.neg, args.dim, args.alpha, args.win,
    #       args.min_count, args.num_processes, bool(args.binary))

    # Hard-coded call: skip-gram (cbow=0) with hierarchical softmax (neg=0), 300-dimensional
    # vectors, starting alpha 0.01, max window 1, min_count 5, 1 process, text output
    train('questions-words.txt', 'result.txt', 0, 0, 300, 0.01, 1, 5, 1, 0)
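After training, the output file (result.txt in the hard-coded call above) is a plain-text model: a "<vocab_size> <dim>" header followed by one word and its vector per line. Below is a minimal sketch for loading it and querying cosine similarity; load_vectors and most_similar are illustrative helper names, not part of the script:

import numpy as np

def load_vectors(path):
    """Load a text-format model written by save() above into a dict {word: vector}."""
    vectors = {}
    with open(path, 'r') as f:
        f.readline()  # skip the "<vocab_size> <dim>" header line
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array([float(x) for x in parts[1:]])
    return vectors

def most_similar(vectors, word, topn=5):
    """Return the topn (word, cosine similarity) pairs closest to `word`."""
    v = vectors[word]
    sims = []
    for w, u in vectors.items():
        if w == word:
            continue
        sims.append((w, float(np.dot(v, u) / (np.linalg.norm(v) * np.linalg.norm(u) + 1e-12))))
    return sorted(sims, key=lambda pair: pair[1], reverse=True)[:topn]

# Example usage, assuming training has produced result.txt:
# print(most_similar(load_vectors('result.txt'), 'king'))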