词向量:哈夫曼(Huffman)编码

代码实现:

import numpy as np

#
class BinaryTreeNode(object):
    """Plain binary-tree node: a payload plus optional left/right children."""

    def __init__(self, value):
        # Payload carried by the node (the word, for the trees in this file).
        self.value = value
        # Both child links (left subtree, right subtree) start out empty.
        self.left = self.right = None
        
#
class HuffmanTreeNode(BinaryTreeNode):
    """Node type shared by the leaves and the internal nodes of a Huffman tree."""

    def __init__(self, value, possibility):
        """Create a node with an empty Huffman code.

        :param value: the word for a leaf node; the intermediate vector for
            an internal (tree) node.
        :param possibility: frequency of the word in the corpus.
        """
        super(HuffmanTreeNode, self).__init__(value)
        # Huffman code of this node; empty until codes are assigned.
        self.Huffman = ""
        self.possibility = possibility

#
class HuffmanTree():
    """Huffman tree over a vocabulary, built from per-word frequencies.

    Leaves are ``HuffmanTreeNode`` objects whose ``value`` is the word. Every
    internal node created by :meth:`merge` stores a zero vector of shape
    ``(1, vec_len)`` as its ``value`` and the sum of its two children's
    ``possibility``.
    """

    def __init__(self, word_dict, vec_len=500):
        """Build the tree for every entry of ``word_dict``.

        Huffman codes are NOT assigned here; call
        :meth:`generate_huffman_code` explicitly once the tree is built.

        :param word_dict: mapping whose values are dicts providing at least
            the keys ``'word'`` and ``'possibility'``. An empty mapping
            yields an empty tree (``root`` stays ``None``).
        :param vec_len: length of the vector stored in each internal node.
        """
        self.vec_len = vec_len  # the length of word vector
        self.root = None
        # Create one leaf node per vocabulary entry for further processing.
        node_list = [
            HuffmanTreeNode(entry['word'], entry['possibility'])
            for entry in word_dict.values()
        ]
        self.build_tree(node_list)

    def merge(self, node1, node2):
        """Create and return the parent node of ``node1`` and ``node2``.

        The parent's ``possibility`` is the sum of both children's. The child
        with the larger ``possibility`` becomes the left child; on a tie
        ``node1`` goes left.

        :param node1: first node to merge.
        :param node2: second node to merge.
        :return: the new internal ``HuffmanTreeNode``.
        """
        top_node = HuffmanTreeNode(
            np.zeros([1, self.vec_len]),
            node1.possibility + node2.possibility,
        )
        if node1.possibility >= node2.possibility:
            top_node.left, top_node.right = node1, node2  # larger on the left
        else:
            top_node.left, top_node.right = node2, node1
        return top_node

    def build_tree(self, node_list):
        """Merge the two least-probable nodes until a single root remains.

        ``node_list`` is consumed in place. When it is empty, ``self.root`` is
        set to ``None`` instead of raising ``IndexError``.

        :param node_list: list of ``HuffmanTreeNode`` objects to combine.
        """
        def weight(idx):
            return node_list[idx].possibility

        while len(node_list) > 1:
            indices = range(len(node_list))
            # ``min`` returns the first minimal item, so ties are resolved in
            # favour of the earlier list position, exactly like a linear scan
            # using strict ``<`` comparisons.
            i1 = min(indices, key=weight)  # least probable node
            i2 = min((i for i in indices if i != i1), key=weight)  # runner-up
            top_node = self.merge(node_list[i1], node_list[i2])
            # Remove the higher index first so the lower one stays valid.
            for idx in sorted((i1, i2), reverse=True):
                node_list.pop(idx)
            node_list.insert(0, top_node)  # the merged node rejoins the pool
        self.root = node_list[0] if node_list else None

    def generate_huffman_code(self, node, word_dict):
        """Assign Huffman codes to all nodes and record the leaf codes.

        Walking down from the root, a left child gets its parent's code plus
        ``"1"`` (the left child is the more probable one) and a right child
        gets the parent's code plus ``"0"``. For every leaf, the code is
        stored in ``word_dict[leaf.value]['Huffman']``.

        :param node: ignored; the traversal always starts at ``self.root``.
            The parameter is kept so existing call sites keep working.
        :param word_dict: mapping from word to a dict that receives the
            ``'Huffman'`` entry; it is updated in place.
        :return: the same ``word_dict`` object.
        """
        if self.root is None:  # empty tree: nothing to encode
            return word_dict
        stack = [self.root]
        while stack:
            node = stack.pop()
            # Follow the left branch down to a leaf, deferring right children.
            while node.left is not None or node.right is not None:
                code = node.Huffman
                node.left.Huffman = code + "1"  # the larger child gets "1"
                node.right.Huffman = code + "0"
                stack.append(node.right)
                node = node.left
            word_dict[node.value]['Huffman'] = node.Huffman
        return word_dict
#

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值