word2vec in Detail (CBOW, SG, hierarchical softmax, negative sampling)

  • For an explanation of what word2vec is used for, see this blog post.
  • At first I simply followed the paper "word2vec Parameter Learning Explained" to derive the formulas; my understanding deepened later on, so the later formulas are organised a little differently from the earlier ones. The key objectives are recapped below.
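The derivation figures from the original post are not reproduced here. For reference, the core objectives from that paper can be summarised as follows (standard notation: h is the hidden/projection vector, v_w and v'_w are the input and output vectors of word w):

    % Hidden layer: CBOW averages the input vectors of the C context words; skip-gram uses the input word directly
    h = \frac{1}{C}\sum_{c=1}^{C} v_{w_c} \quad \text{(CBOW)}, \qquad h = v_{w_I} \quad \text{(skip-gram)}

    % Negative-sampling loss for one (output word w_O, context) pair, with W_neg drawn from the unigram^{0.75} distribution
    E = -\log \sigma\left({v'}_{w_O}^{\top} h\right) - \sum_{w_j \in W_{neg}} \log \sigma\left(-{v'}_{w_j}^{\top} h\right)

    % Hierarchical softmax: a product of binary decisions along the Huffman path of w_O
    p(w_O \mid \text{context}) = \prod_{j=1}^{L(w_O)-1} \sigma\left( [\![ n(w_O, j+1) = \mathrm{ch}(n(w_O, j)) ]\!] \cdot {v'}_{n(w_O, j)}^{\top} h \right)

Here [[x]] is +1 if x is true and -1 otherwise, L(w) is the length of w's path, n(w, j) is the j-th node on that path, and ch(n) denotes the left child of node n. The code below implements these two output layers (UnigramTable for negative sampling, Vocab.encode_huffman for hierarchical softmax).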

  • Python code (my understanding of it is still not deep): link
import argparse
import math
import struct
import sys
import time
import warnings

import numpy as np

from multiprocessing import Pool, Value, Array

class VocabItem:
    def __init__(self, word):
        self.word = word
        self.count = 0
        self.path = None # Path (list of indices) from the root to the word (leaf) in the Huffman tree
        self.code = None # Huffman encoding (list of 0/1 bits) of the word

class Vocab:
    def __init__(self, fi, min_count):
        vocab_items = []
        vocab_hash = {}
        word_count = 0
        fi = open(fi, 'r')

        # Add special tokens <bol> (beginning of line) and <eol> (end of line)
        for token in ['<bol>', '<eol>']:
            vocab_hash[token] = len(vocab_items)
            vocab_items.append(VocabItem(token))

        # Scan the training text line by line
        for line in fi:
            tokens = line.split()
            # Scan every token in the line
            for token in tokens:
                # Add unseen words to the vocabulary
                if token not in vocab_hash:
                    vocab_hash[token] = len(vocab_items)
                    vocab_items.append(VocabItem(token))

                #assert vocab_items[vocab_hash[token]].word == token, 'Wrong vocab_hash index'
                # Count occurrences of each word
                vocab_items[vocab_hash[token]].count += 1
                # Total number of tokens read so far
                word_count += 1

                if word_count % 10000 == 0:
                    sys.stdout.write("\rReading word %d" % word_count)
                    sys.stdout.flush()

            # Add special tokens <bol> (beginning of line) and <eol> (end of line)
            # Every line contributes one <bol> and one <eol>, so add 2 to the word count
            vocab_items[vocab_hash['<bol>']].count += 1
            vocab_items[vocab_hash['<eol>']].count += 1
            word_count += 2

        self.bytes = fi.tell() # Current file position, i.e. the size of the training file in bytes
        fi.close()
        self.vocab_items = vocab_items         # List of VocabItem objects
        self.vocab_hash = vocab_hash           # Mapping from each token to its index in vocab
        self.word_count = word_count           # Total number of words in train file

        # Add special token <unk> (unknown),
        # merge words occurring less than min_count into <unk>, and
        # sort vocab in descending order by frequency in train file
        # (handled by __sort below)
        self.__sort(min_count)

        #assert self.word_count == sum([t.count for t in self.vocab_items]), 'word_count and sum of t.count do not agree'
        print ('Total words in training file: %d' % self.word_count)
        print ('Total bytes in training file: %d' % self.bytes)
        print ('Vocab size: %d' % len(self))

    def __getitem__(self, i):
        return self.vocab_items[i]

    def __len__(self):
        return len(self.vocab_items)

    def __iter__(self):
        return iter(self.vocab_items)

    def __contains__(self, key):
        return key in self.vocab_hash

    def __sort(self, min_count):
        tmp = []
        tmp.append(VocabItem('<unk>'))
        unk_hash = 0
        
        count_unk = 0
        for token in self.vocab_items:
            if token.count < min_count:
                count_unk += 1
                tmp[unk_hash].count += token.count
            else:
                tmp.append(token)

        tmp.sort(key=lambda token : token.count, reverse=True) # Sort by frequency in descending order

        # Update vocab_hash
        vocab_hash = {}
        for i, token in enumerate(tmp):
            vocab_hash[token.word] = i

        self.vocab_items = tmp
        self.vocab_hash = vocab_hash

        print ('Unknown vocab size:', count_unk)

    def indices(self, tokens):
        return [self.vocab_hash[token] if token in self else self.vocab_hash['<unk>'] for token in tokens]

    def encode_huffman(self):
        # Building a Huffman tree
        vocab_size = len(self)
        # count[:vocab_size] holds the word frequencies, already sorted in descending order
        count = [t.count for t in self] + [1e15] * (vocab_size - 1)  # leaf counts followed by slots for the internal nodes
        # The trailing (vocab_size - 1) slots will hold the counts of the internal nodes created
        # while merging; they start at 1e15 so that a slot that has not been filled in yet can
        # never win the "smaller count" comparison below
        parent = [0] * (2 * vocab_size - 2)  # parent index of every node except the root
        binary = [0] * (2 * vocab_size - 2)  # 0/1 label of the edge from each node to its parent

        pos1 = vocab_size - 1  # scans the leaf part count[:vocab_size] backwards; since the vocab is sorted
        # in descending order, index vocab_size - 1 is the least frequent word
        pos2 = vocab_size      # scans the internal-node part count[vocab_size:] forwards

        for i in range(vocab_size - 1):
            # Find min1: the unused node with the smallest count
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min1 = pos1
                    pos1 -= 1  # consume the least frequent unused leaf; move the leaf pointer toward more frequent words
                else:
                    min1 = pos2
                    pos2 += 1
            else:
                min1 = pos2
                pos2 += 1
            # Find min2: the unused node with the second smallest count
            if pos1 >= 0:
                if count[pos1] < count[pos2]:
                    min2 = pos1
                    pos1 -= 1
                else:
                    min2 = pos2
                    pos2 += 1
            else:
                min2 = pos2
                pos2 += 1

            count[vocab_size + i] = count[min1] + count[min2]  # the new internal node's count is the sum of its two children
            parent[min1] = vocab_size + i  # record the parent of node min1
            parent[min2] = vocab_size + i  # record the parent of node min2
            binary[min2] = 1  # label min2's edge to the new parent with 1 (min1's edge keeps label 0)

        # At this point the Huffman tree is complete. Unlike a pointer-based C/C++ implementation,
        # the tree is stored implicitly in arrays: parent[] holds each node's parent index,
        # binary[] the 0/1 label of the edge to that parent, and count[] the node frequencies.
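        # Worked toy example (added for illustration, not in the original code): for sorted
        # counts [4, 2, 1, 1] (vocab_size = 4) the loop above produces
        #     count  = [4, 2, 1, 1, 2, 4, 8]   (the three internal-node counts appended)
        #     parent = [6, 5, 4, 4, 5, 6]      (parent index of each non-root node)
        #     binary = [1, 1, 1, 0, 0, 0]      (label of the edge to that parent)
        # which yields the Huffman codes 1, 01, 001, 000 for the four words.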

        # Assign binary code and path pointers to each vocab word, now that the tree is built.
        # Note (original author's understanding): word frequencies are used here mainly so that the
        # standard Huffman construction can be reused; any full binary tree over the vocabulary would
        # give a valid coding, but using frequencies is the systematic way to build one, and it gives
        # frequent words shorter codes, which is exactly what hierarchical softmax wants.

        root_idx = 2 * vocab_size - 2  # index of the root node
        for i, token in enumerate(self):
            path = []  # internal nodes visited on the way from this word up to the root
            code = []  # the corresponding 0/1 edge labels, collected leaf-to-root and reversed below

            node_idx = i
            while node_idx < root_idx:
                if node_idx >= vocab_size: path.append(node_idx)  # record internal nodes only
                code.append(binary[node_idx])  # bits are collected leaf-to-root and reversed below
                node_idx = parent[node_idx]
            path.append(root_idx)

            # Store path and code in root-to-leaf order
            token.path = [j - vocab_size for j in path[::-1]]  # [::-1] reverses the list; subtracting vocab_size
            # maps internal-node indices (vocab_size .. 2*vocab_size-2) to row indices 0 .. vocab_size-2 of the
            # output matrix syn1, whose rows play the role of the internal nodes in hierarchical softmax
            token.code = code[::-1]


'''
Unigram, bigram and trigram are n-gram concepts from natural language processing (NLP);
see the parent topic: n-gram.
unigram: a single word/character
bigram:  two consecutive words/characters
trigram: three consecutive words/characters
For example, the characters of 西安交通大学 give:
unigram: 西/安/交/通/大/学
bigram:  西安/安交/交通/通大/大学
trigram: 西安交/安交通/交通大/通大学
'''
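# Illustration (not part of the original script): a minimal sketch of the character
# n-gram example described in the comment above.
def char_ngrams(text, n):
    """Return the list of character n-grams of text (n=1 unigrams, n=2 bigrams, ...)."""
    return [text[i:i + n] for i in range(len(text) - n + 1)]

# char_ngrams('西安交通大学', 2) -> ['西安', '安交', '交通', '通大', '大学']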
class UnigramTable:
    """
    A list of indices of tokens in the vocab following a power law distribution,
    used to draw negative samples.
    """
    def __init__(self, vocab):
        vocab_size = len(vocab)
        power = 0.75
        norm = sum([math.pow(t.count, power) for t in vocab]) # Normalizing constant
        table_size = int(1e8) # Length of the unigram table
        table = np.zeros(table_size, dtype=np.uint32)

        print ('Filling unigram table')
        p = 0 # Cumulative probability
        i = 0
        for j, unigram in enumerate(vocab):
            p += float(math.pow(unigram.count, power))/norm
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
        self.table = table

    def sample(self, count):
        # Draw `count` word indices at random from the table (used as negative samples)
        indices = np.random.randint(low=0, high=len(self.table), size=count)
        return [self.table[i] for i in indices]

def sigmoid(z):
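    # Note (added): clipping at |z| > 6 avoids overflow in math.exp and mirrors the
    # MAX_EXP = 6 range of the precomputed sigmoid table in the original word2vec.c.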
    if z > 6:
        return 1.0
    elif z < -6:
        return 0.0
    else:
        return 1 / (1 + math.exp(-z))

def init_net(dim, vocab_size):
    # Init syn0 (input word vectors) with random numbers drawn uniformly from [-0.5/dim, 0.5/dim];
    # vocab_size is the vocabulary size
    tmp = np.random.uniform(low=-0.5/dim, high=0.5/dim, size=(vocab_size, dim))
    # Expose the numpy array as a ctypes object so it can be wrapped in a shared Array
    syn0 = np.ctypeslib.as_ctypes(tmp)
    syn0 = Array(syn0._type_, syn0, lock=False)
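    # Note (added): lock=False exposes the weights as a raw shared-memory array, so the
    # worker processes update them without locking (Hogwild-style asynchronous SGD,
    # as in the original C implementation).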

    # Init syn1 with zeros
    tmp = np.zeros(shape=(vocab_size, dim))
    syn1 = np.ctypeslib.as_ctypes(tmp)
    syn1 = Array(syn1._type_, syn1, lock=False)

    return (syn0, syn1)

def train_process(pid):
    # Set fi to point to the right chunk of training file
    start = vocab.bytes // num_processes * pid
    end = vocab.bytes if pid == num_processes - 1 else vocab.bytes // num_processes * (pid + 1)
    fi.seek(start)
    #print 'Worker %d beginning training at %d, ending at %d' % (pid, start, end)

    alpha = starting_alpha

    word_count = 0
    last_word_count = 0

    while fi.tell() < end:
        line = fi.readline().strip()
        # Skip blank lines
        if not line:
            continue

        # Init sent, a list of indices of words in line
        sent = vocab.indices(['<bol>'] + line.split() + ['<eol>'])

        for sent_pos, token in enumerate(sent):
            if word_count % 10000 == 0:
                global_word_count.value += (word_count - last_word_count)
                last_word_count = word_count

                # Recalculate alpha
                alpha = starting_alpha * (1 - float(global_word_count.value) / vocab.word_count)
                if alpha < starting_alpha * 0.0001: alpha = starting_alpha * 0.0001
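                # Note (added): this is the linear learning-rate decay of the original
                # word2vec.c, floored at 0.01% of the starting alpha.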

                # Print progress info
                sys.stdout.write("\rAlpha: %f Progress: %d of %d (%.2f%%)" %
                                 (alpha, global_word_count.value, vocab.word_count,
                                  float(global_word_count.value) / vocab.word_count * 100))
                sys.stdout.flush()

            # Randomize window size, where win is the max window size
            # Collect the indices of the words inside the (randomly shrunken) context window
            current_win = np.random.randint(low=1, high=win+1)
            context_start = max(sent_pos - current_win, 0)
            context_end = min(sent_pos + current_win + 1, len(sent))
            context = sent[context_start:sent_pos] + sent[sent_pos+1:context_end] # Turn into an iterator?
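            # Note (added): drawing the window size uniformly from [1, win] effectively
            # weights context words by their distance to the center word, as the original
            # word2vec.c does with its dynamic window.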

            # CBOW
            if cbow:
                # Compute neu1, the hidden (projection) vector: the mean of the context word vectors
                neu1 = np.mean(np.array([syn0[c] for c in context]), axis=0)
                assert len(neu1) == dim, 'neu1 and dim do not agree'

                # Init neu1e with zeros
                neu1e = np.zeros(dim)

                # Compute neu1e and update syn1
                if neg > 0:
                    # Negative sampling: the true center word is the positive example (label 1),
                    # the neg words drawn from the unigram table are negative examples (label 0)
                    classifiers = [(token, 1)] + [(target, 0) for target in table.sample(neg)]
                else:
                    # Hierarchical softmax: classify against the internal tree nodes on the
                    # center word's Huffman path, with the code bits as labels
                    classifiers = zip(vocab[token].path, vocab[token].code)
                for target, label in classifiers:
                    # Inner product of the hidden vector and the target's output vector (row of syn1)
                    z = np.dot(neu1, syn1[target])
                    p = sigmoid(z)
                    # (label - p) is the prediction error; alpha is the learning rate
                    g = alpha * (label - p)
                    neu1e += g * syn1[target] # Error to backpropagate to syn0 (computed before syn1 is updated)
                    syn1[target] += g * neu1  # Update syn1


                # Update syn0
                for context_word in context:
                    syn0[context_word] += neu1e

            # Skip-gram
            else:
                for context_word in context:
                    # Init neu1e with zeros
                    neu1e = np.zeros(dim)

                    # Compute neu1e and update syn1
                    if neg > 0:
                        classifiers = [(token, 1)] + [(target, 0) for target in table.sample(neg)]
                    else:
                        classifiers = zip(vocab[token].path, vocab[token].code)
                    for target, label in classifiers:
                        z = np.dot(syn0[context_word], syn1[target])
                        p = sigmoid(z)
                        g = alpha * (label - p)
                        neu1e += g * syn1[target]              # Error to backpropagate to syn0
                        syn1[target] += g * syn0[context_word] # Update syn1

                    # Update syn0
                    syn0[context_word] += neu1e

            word_count += 1

    # Print progress info
    global_word_count.value += (word_count - last_word_count)
    sys.stdout.write("\rAlpha: %f Progress: %d of %d (%.2f%%)" %
                     (alpha, global_word_count.value, vocab.word_count,
                      float(global_word_count.value)/vocab.word_count * 100))
    sys.stdout.flush()
    fi.close()

def save(vocab, syn0, fo, binary):
    print ('Saving model to', fo)
    dim = len(syn0[0])
    # The binary output format supported by the original script is not used here;
    # the model is always saved as text: a "<vocab_size> <dim>" header line followed
    # by one line per word ("word v1 v2 ... v_dim").
    fo = open(fo, 'w')
    fo.write('%d %d\n' % (len(syn0), dim))
    for token, vector in zip(vocab, syn0):
        word = token.word
        vector_str = ' '.join([str(s) for s in vector])
        fo.write('%s %s\n' % (word, vector_str))

    fo.close()

def __init_process(*args):
    global vocab, syn0, syn1, table, cbow, neg, dim, starting_alpha
    global win, num_processes, global_word_count, fi
    
    vocab, syn0_tmp, syn1_tmp, table, cbow, neg, dim, starting_alpha, win, num_processes, global_word_count = args[:-1]
    fi = open(args[-1], 'r')
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', RuntimeWarning)
        syn0 = np.ctypeslib.as_array(syn0_tmp)
        syn1 = np.ctypeslib.as_array(syn1_tmp)

def train(fi, fo, cbow, neg, dim, alpha, win, min_count, num_processes, binary):
    # Read train file to init vocab
    vocab = Vocab(fi, min_count)

    # Init net
    # dim: dimensionality of the word vectors; len(vocab): vocabulary size
    # Initialise the two weight matrices of word2vec, i.e. the input and output word-vector matrices
    syn0, syn1 = init_net(dim, len(vocab))

    global_word_count = Value('i', 0)
    table = None
    if neg > 0:
        print( 'Initializing unigram table')
        table = UnigramTable(vocab)
    else:
        print ('Initializing Huffman tree')
        vocab.encode_huffman()

    # Begin training using num_processes workers
    t0 = time.time()
    pool = Pool(processes=num_processes, initializer=__init_process,
                initargs=(vocab, syn0, syn1, table, cbow, neg, dim, alpha,
                          win, num_processes, global_word_count, fi))
    pool.map(train_process, range(num_processes))
    t1 = time.time()
    print ('Completed training. Training took', (t1 - t0) / 60, 'minutes')

    # Save model to file
    save(vocab, syn0, fo, binary)

if __name__ == '__main__':
    # parser = argparse.ArgumentParser()
    # parser.add_argument('-train', help='Training file', dest='fi', required=True)
    # parser.add_argument('-model', help='Output model file', dest='fo', required=True)
    # parser.add_argument('-cbow', help='1 for CBOW, 0 for skip-gram', dest='cbow', default=1, type=int)
    # parser.add_argument('-negative', help='Number of negative examples (>0) for negative sampling, 0 for hierarchical softmax', dest='neg', default=5, type=int)
    # parser.add_argument('-dim', help='Dimensionality of word embeddings', dest='dim', default=100, type=int)
    # parser.add_argument('-alpha', help='Starting alpha', dest='alpha', default=0.025, type=float)
    # parser.add_argument('-window', help='Max window length', dest='win', default=5, type=int)
    # parser.add_argument('-min-count', help='Min count for words used to learn <unk>', dest='min_count', default=5, type=int)
    # parser.add_argument('-processes', help='Number of processes', dest='num_processes', default=1, type=int)
    # parser.add_argument('-binary', help='1 for output model in binary format, 0 otherwise', dest='binary', default=0, type=int)
    # #TO DO: parser.add_argument('-epoch', help='Number of training epochs', dest='epoch', default=1, type=int)
    # args = parser.parse_args()

    # train(args.fi, args.fo, bool(args.cbow), args.neg, args.dim, args.alpha, args.win,
    #       args.min_count, args.num_processes, bool(args.binary))

    # Hard-coded call: skip-gram (cbow=0) with hierarchical softmax (neg=0), 300-dimensional
    # vectors, starting alpha 0.01, max window 1, min_count 5, 1 process, text output
    train('questions-words.txt', 'result.txt', 0, 0, 300, 0.01, 1, 5, 1, 0)
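After training, the output file (result.txt in the hard-coded call above) is a plain-text model: a "<vocab_size> <dim>" header followed by one word and its vector per line. Below is a minimal sketch for loading it and querying cosine similarity; load_vectors and most_similar are illustrative helper names, not part of the script:

import numpy as np

def load_vectors(path):
    """Load a text-format model written by save() above into a dict {word: vector}."""
    vectors = {}
    with open(path, 'r') as f:
        f.readline()  # skip the "<vocab_size> <dim>" header line
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array([float(x) for x in parts[1:]])
    return vectors

def most_similar(vectors, word, topn=5):
    """Return the topn (word, cosine similarity) pairs closest to `word`."""
    v = vectors[word]
    sims = []
    for w, u in vectors.items():
        if w == word:
            continue
        sims.append((w, float(np.dot(v, u) / (np.linalg.norm(v) * np.linalg.norm(u) + 1e-12))))
    return sorted(sims, key=lambda pair: pair[1], reverse=True)[:topn]

# Example usage, assuming training has produced result.txt:
# print(most_similar(load_vectors('result.txt'), 'king'))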