霍夫曼编码的python实现
霍夫曼编码是一种经典的数据压缩算法,通过将出现频次高的符号用较少的位数进行编码,将出现频次比较低的符号用较多的位数进行编码。具体原理见哈夫曼编码及其应用——数据压缩(Huffman compression)。
霍夫曼树
霍夫曼编码可以通过霍夫曼树来实现。在构造霍夫曼树前,先统计每个字符出现的频次并按升序排列,再将每个字符初始化为一个叶节点。构造树时,取出频次最低的两个节点合并成一棵子树:左子节点的编码设为0,右子节点的编码设为1,子树根节点的频次为两个子节点的频次之和;然后将该根节点当作新的节点按频次放回有序序列,重复上述步骤,直到只剩下一个节点,它就是我们需要的霍夫曼树的根。因为还需要得到每个符号的完整编码,所以最后还要从根节点出发递归遍历一次整棵树。
代码实现
定义节点和栈
这里的“栈”实际上是一个按频次升序排列的节点列表,主要用于存放各个节点,并在每次合并时取出频次最小的两个节点。
class Node:
    """A node of the Huffman tree.

    Leaves carry a symbol; merged (internal) nodes are created with
    ``symbol=None`` and get their children assigned afterwards.
    """

    def __init__(self, symbol, freq) -> None:
        # Children and the code bits are filled in later while the tree is built.
        self.left = None   # left subtree
        self.right = None  # right subtree
        self.code = None   # list of code bits
        self.freq = freq   # number of occurrences
        self.symbol = symbol

    def update(self):
        """Label the two children: bit 0 for the left one, bit 1 for the right one."""
        self.left.code, self.right.code = [0], [1]
class Stack:
    """Container holding tree nodes in ascending order of ``freq``.

    Despite its name it is used as a simple min-priority queue: the two
    lowest-frequency nodes are removed from the front and merged nodes are
    re-inserted at their sorted position.
    """

    def __init__(self) -> None:
        self.stack = []  # all nodes, lowest frequency first

    def insert_sorted(self, node: "Node"):
        """Append *node* at the end; the caller must feed nodes already sorted by freq."""
        self.stack.append(node)

    def __len__(self):
        return len(self.stack)

    def __getitem__(self, index):
        return self.stack[index]

    def insert(self, node: "Node"):
        """Insert *node* before the first stored node with a strictly larger freq.

        Nodes with an equal freq therefore stay in front of the new node.
        """
        for position, existing in enumerate(self.stack):
            if existing.freq > node.freq:
                self.stack.insert(position, node)
                return
        self.stack.append(node)  # largest so far: goes to the end

    def update(self):
        """Remove and return the two lowest-frequency nodes as ``(s1, s2)``.

        Raises:
            IndexError: if fewer than two nodes are stored. The check is done
                up front so that a failed call does not drop the remaining node.
        """
        if len(self.stack) < 2:
            raise IndexError("need at least two nodes to pop the two smallest")
        s1 = self.stack.pop(0)
        s2 = self.stack.pop(0)
        return s1, s2
统计频次并排序
def cal_symbol_freq(data):
    """Count how often each symbol occurs in *data* and return the leaves sorted by count.

    Returns:
        A ``Stack`` holding one ``Node`` per distinct symbol, in ascending
        order of frequency (ties keep first-occurrence order).
    """
    counts = Counter(data)
    ordered = sorted(counts.items(), key=lambda pair: pair[1])
    nodes = Stack()
    for symbol, freq in ordered:
        nodes.insert_sorted(Node(symbol, freq))
    return nodes
霍夫曼编码
def updateCode(tree: Node, code):
    """Walk the tree recursively and record every leaf's code in ``word2code``.

    Each node's ``code`` list is stored child-bit first: the node's own bit is
    followed by the bits of its ancestors, so it is reversed when the final
    string is produced.

    Args:
        tree: Root of the (sub)tree to process.
        code: The parent's accumulated bit list, or None for the root.
    """
    if code is not None:
        tree.code.extend(code)
    # A node without children is a leaf; this also works when the symbol
    # itself happens to be None.
    if tree.left is None and tree.right is None:
        if tree.code is None:
            # Lone-leaf tree (only one distinct symbol): give it the 1-bit code "0".
            tree.code = [0]
        word2code[tree.symbol] = "".join(str(bit) for bit in reversed(tree.code))
        return
    updateCode(tree.left, tree.code)
    updateCode(tree.right, tree.code)
def huffman(data):
    """Build the Huffman tree for *data* and fill ``word2code`` with the symbol codes.

    Args:
        data: Iterable of hashable symbols.

    Returns:
        The root node of the Huffman tree, or None when *data* is empty.
    """
    # Drop codes left over from an earlier call so the table only describes this input.
    word2code.clear()
    symbol_freq = cal_symbol_freq(data)
    if len(symbol_freq) == 0:
        return None
    if len(symbol_freq) == 1:
        # Only one distinct symbol: no merge happens, so assign the 1-bit code directly.
        tree = symbol_freq[0]
        tree.code = [0]
        updateCode(tree, None)
        return tree
    while len(symbol_freq) > 1:
        s1, s2 = symbol_freq.update()
        root = Node(None, freq=s1.freq + s2.freq)
        root.left = s1
        root.right = s2
        root.update()
        symbol_freq.insert(root)  # the merged root competes as a new node
    tree = symbol_freq[0]
    updateCode(tree, tree.code)
    return tree
完整代码
from PIL import Image
import numpy as np
from collections import Counter
# Module-level table mapping each symbol to its Huffman code string; filled in by updateCode().
word2code = {}
class Node:
    """A node of the Huffman tree.

    Leaves carry a symbol; merged (internal) nodes are created with
    ``symbol=None`` and get their children assigned afterwards.
    """

    def __init__(self, symbol, freq) -> None:
        # Children and the code bits are filled in later while the tree is built.
        self.left = None   # left subtree
        self.right = None  # right subtree
        self.code = None   # list of code bits
        self.freq = freq   # number of occurrences
        self.symbol = symbol

    def update(self):
        """Label the two children: bit 0 for the left one, bit 1 for the right one."""
        self.left.code, self.right.code = [0], [1]
class Stack:
    """Container holding tree nodes in ascending order of ``freq``.

    Despite its name it is used as a simple min-priority queue: the two
    lowest-frequency nodes are removed from the front and merged nodes are
    re-inserted at their sorted position.
    """

    def __init__(self) -> None:
        self.stack = []  # all nodes, lowest frequency first

    def insert_sorted(self, node: "Node"):
        """Append *node* at the end; the caller must feed nodes already sorted by freq."""
        self.stack.append(node)

    def __len__(self):
        return len(self.stack)

    def __getitem__(self, index):
        return self.stack[index]

    def insert(self, node: "Node"):
        """Insert *node* before the first stored node with a strictly larger freq.

        Nodes with an equal freq therefore stay in front of the new node.
        """
        for position, existing in enumerate(self.stack):
            if existing.freq > node.freq:
                self.stack.insert(position, node)
                return
        self.stack.append(node)  # largest so far: goes to the end

    def update(self):
        """Remove and return the two lowest-frequency nodes as ``(s1, s2)``.

        Raises:
            IndexError: if fewer than two nodes are stored. The check is done
                up front so that a failed call does not drop the remaining node.
        """
        if len(self.stack) < 2:
            raise IndexError("need at least two nodes to pop the two smallest")
        s1 = self.stack.pop(0)
        s2 = self.stack.pop(0)
        return s1, s2
def cal_symbol_freq(data):
    """Count how often each symbol occurs in *data* and return the leaves sorted by count.

    Returns:
        A ``Stack`` holding one ``Node`` per distinct symbol, in ascending
        order of frequency (ties keep first-occurrence order).
    """
    counts = Counter(data)
    ordered = sorted(counts.items(), key=lambda pair: pair[1])
    nodes = Stack()
    for symbol, freq in ordered:
        nodes.insert_sorted(Node(symbol, freq))
    return nodes
def updateCode(tree: Node, code):
    """Walk the tree recursively and record every leaf's code in ``word2code``.

    Each node's ``code`` list is stored child-bit first: the node's own bit is
    followed by the bits of its ancestors, so it is reversed when the final
    string is produced.

    Args:
        tree: Root of the (sub)tree to process.
        code: The parent's accumulated bit list, or None for the root.
    """
    if code is not None:
        tree.code.extend(code)
    # A node without children is a leaf; this also works when the symbol
    # itself happens to be None.
    if tree.left is None and tree.right is None:
        if tree.code is None:
            # Lone-leaf tree (only one distinct symbol): give it the 1-bit code "0".
            tree.code = [0]
        word2code[tree.symbol] = "".join(str(bit) for bit in reversed(tree.code))
        return
    updateCode(tree.left, tree.code)
    updateCode(tree.right, tree.code)
def huffman(data):
    """Build the Huffman tree for *data* and fill ``word2code`` with the symbol codes.

    Args:
        data: Iterable of hashable symbols.

    Returns:
        The root node of the Huffman tree, or None when *data* is empty.
    """
    # Drop codes left over from an earlier call so the table only describes this input.
    word2code.clear()
    symbol_freq = cal_symbol_freq(data)
    if len(symbol_freq) == 0:
        return None
    if len(symbol_freq) == 1:
        # Only one distinct symbol: no merge happens, so assign the 1-bit code directly.
        tree = symbol_freq[0]
        tree.code = [0]
        updateCode(tree, None)
        return tree
    while len(symbol_freq) > 1:
        s1, s2 = symbol_freq.update()
        root = Node(None, freq=s1.freq + s2.freq)
        root.left = s1
        root.right = s2
        root.update()
        symbol_freq.insert(root)  # put the merged root back as a new node
    tree = symbol_freq[0]
    updateCode(tree, tree.code)
    return tree
if __name__ == "__main__":
    # Disabled image demo, kept for reference: encode the first channel of a PNG.
    # image = Image.open("test1.png")
    # image = np.array(image)
    # single_image = np.ravel(image[:, :, 0])
    # code = huffman(single_image)
    # print(word2code)

    # Worked example: seven symbols 'a'-'g' with fixed occurrence counts.
    sample_counts = [("a", 31), ("b", 16), ("d", 8), ("g", 4), ("c", 10), ("e", 11), ("f", 20)]
    s = [symbol for symbol, count in sample_counts for _ in range(count)]
    huffman(s)
    print(word2code)
验证
为方便验证正确性,参考了《数据结构-哈夫曼编码例题》所给出的例子,一共有7个字符 'a'-'g',出现频次如下:
字符 | 出现频次 |
---|---|
a | 31 |
b | 16 |
c | 10 |
d | 8 |
e | 11 |
f | 20 |
g | 4 |
程序输出的结果为
{'f': '00', 'c': '010', 'e': '011', 'g': '1000', 'd': '1001', 'b': '101', 'a': '11'}
正确结果为(图片见数据结构-哈夫曼编码例题):