Huffman Tree(哈夫曼树)的python实现
创建Huffman Tree 的基础node
class node:
    """A single node of a Huffman tree.

    Attributes:
        name: The symbol this node stands for, or a generated label for a
            parent node created while the tree is being built.
        left: Left child node, or None.
        right: Right child node, or None.
        code: Bit string ("0"/"1" characters) assigned to the node; empty
            until codes have been assigned.
        count: Number of occurrences of the symbol (for a parent node, the
            sum of its children's counts), or None if not given.
    """

    # Python keeps only the last definition of a method, so stacked
    # __init__ "overloads" do not work; default arguments give the same
    # three call forms: node(), node(name) and node(name, count).
    def __init__(self, name=None, count=None):
        self.name = name
        self.left = None
        self.right = None
        self.code = ""
        self.count = count
每个 node 包含以下属性:
- name: 该 node 对应的 symbol
- left/right: 左右的子 node
- code: Huffman 算法实现后该 symbol 的 code
- count: 当前 symbol 出现的次数
定义Huffman算法
def nodesHuffman(nodes, names):
    """Build a Huffman tree from ``nodes`` and assign a code to every node.

    Args:
        nodes: List of ``node`` objects whose ``count`` is set. The list is
            modified in place: zero-count nodes are removed, the two
            lowest-count nodes are repeatedly merged under a new parent
            named "P0", "P1", ..., and ``setCode`` finally re-inserts the
            nodes whose names are in ``names`` after the root.
        names: Container of symbol names, forwarded to ``setCode``.

    Returns:
        A two-element list ``[root, codeDic]`` where ``codeDic`` maps node
        names to their bit-string codes. For empty input (or when every
        count is zero) the result is ``[None, {}]``.
    """
    # Symbols that never occur are removed first; keeping them would only
    # lengthen the codes of the symbols that do occur.
    nodes[:] = [item for item in nodes if item.count != 0]

    codeDic = {}
    if not nodes:
        return [None, codeDic]

    if len(nodes) == 1 and nodes[0].left is None and nodes[0].right is None:
        # A lone symbol would be the root and keep an empty code, leaving
        # nothing to emit for it; give it an explicit one-bit code.
        nodes[0].code = "0"
        codeDic[nodes[0].name] = nodes[0].code
        return [nodes[0], codeDic]

    num = 0  # index used to label the generated parents P0, P1, ...
    while len(nodes) > 1:
        # Locate the two lowest counts. min() returns the first minimum,
        # so ties are resolved in favour of the lower index.
        positions = range(len(nodes))
        smallest = min(positions, key=lambda i: nodes[i].count)
        second = min(
            (i for i in positions if i != smallest),
            key=lambda i: nodes[i].count,
        )

        # The new parent carries the combined count of its two children.
        parent = node(
            "P" + str(num),
            nodes[second].count + nodes[smallest].count,
        )
        num += 1
        parent.left = nodes[smallest]
        parent.right = nodes[second]

        # Remove the higher index first so the lower index stays valid.
        for index in sorted((smallest, second), reverse=True):
            nodes.pop(index)

        # The parent goes back into the pool of nodes still to be merged.
        nodes.insert(0, parent)

    # Walk the finished tree and record the code of every node below the root.
    setCode(nodes[0], nodes, names, codeDic)
    return [nodes[0], codeDic]
定义赋予code的函数
def setCode(node, nodes, names, codeDic):
    """Recursively assign codes to every descendant of ``node``.

    The left child receives the parent's code followed by "0", the right
    child the parent's code followed by "1". The left subtree is processed
    completely before the right child is touched.

    Args:
        node: Node whose children are to be coded; its own ``code`` is used
            as the prefix and is not changed.
        nodes: List that receives, at index 1, every visited child whose
            name is in ``names``.
        names: Container of names tested with ``in``.
        codeDic: Dictionary updated in place with ``name -> code`` for
            every visited child, including children not in ``names``.
    """
    for child, bit in ((node.left, "0"), (node.right, "1")):
        if child is None:
            continue
        # Children whose name is in ``names`` are put back into ``nodes``.
        if child.name in names:
            nodes.insert(1, child)
        child.code = node.code + bit
        codeDic[child.name] = child.code
        # Descend so that the grandchildren inherit this child's code.
        setCode(child, nodes, names, codeDic)
使用(基于.wav压缩)
import struct;
import tkinter.filedialog;
# Ask the user for the .wav file and load it completely; both the counting
# pass and the encoding pass below work on the same bytes.
fname = tkinter.filedialog.askopenfilename()
with open(fname, "rb") as file:
    s = file.read()
slength = len(s)

# The first 44 bytes are treated as the WAV header and skipped. Everything
# after it is read as little-endian signed 16-bit samples.
# NOTE(review): assumes a plain 44-byte header and 16-bit PCM data - confirm
# for the files this is used on. A trailing odd byte is ignored.
sampleCount = max(0, (slength - 44) // 2)
samples = struct.unpack('<%dh' % sampleCount, s[44:44 + 2 * sampleCount])

# Count how often each symbol (sample value) occurs.
count = {}
for temp in samples:
    count[temp] = count.get(temp, 0) + 1

# Create one leaf node per distinct symbol.
keys = list(count)
keylen = len(keys)
nodes = [node(key, count[key]) for key in keys]

# Run the Huffman algorithm. The dict's key view is passed as ``names`` so
# the membership tests inside setCode do not scan a list.
[nodeOrigin, codeDic] = nodesHuffman(nodes, count.keys())

# Encode every sample; join() avoids rebuilding the string on each append.
tempNow = ''.join(codeDic[temp] for temp in samples)

# Pad with '0' up to the next multiple of 8 bits, because the stream is
# written one unsigned byte ('B', range 0-255) at a time.
tempNow += '0' * (-len(tempNow) % 8)

# Write the compressed file.
# NOTE: only the packed bit stream is written; neither the WAV header nor
# the code table is stored in the output.
with open("test.Huffman", "wb") as file2:
    file2.write(bytes(int(tempNow[i:i + 8], 2) for i in range(0, len(tempNow), 8)))
总结
Huffman 算法比较适合 symbol 种类较多、且部分 symbol 出现频率很高(即频率分布不均匀)的情况。由于 setCode 是一个 recursive 的函数, 给 symbol 赋予 code 比较耗费时间。