霍夫曼编码 (Huffman Coding) 诞生于1952年,迄今为止依然是可变字长编码 (VLC) 中压缩率最高的二进制编码算法之一。其原理在于根据元素出现的频率来决定其编码长度:在序列中出现频率越高的元素,其编码长度越短,因而可以实现序列的最佳编码。关于其更具体的介绍,请参阅其他热门博文。以下呈现相关代码,可编码由任意可哈希元素组成的序列。
from collections import defaultdict
class Huffman_coding(object):
    '''Huffman coder for sequences of hashable elements.

    Frequencies accumulate across calls to ``fit``; the code table is rebuilt
    from scratch after every ``fit`` so it always reflects the current
    frequencies.
    '''

    def __init__(self):
        '''Create an empty coder with no frequencies and no codes.'''
        self.freqs = defaultdict(int)  # element -> number of occurrences seen so far
        self.codes = defaultdict(str)  # element -> bit string made of '0'/'1'
        self.elements = {}             # bit string -> element (inverse of codes)

    def fit(self, iterable):
        '''Add the elements of ``iterable`` to the frequency table and rebuild the codes.

        Elements must be hashable because they are used as dictionary keys.
        '''
        for e in iterable:
            self.freqs[e] += 1
        self.coding()

    def coding(self):
        '''Rebuild ``codes`` and ``elements`` from the current ``freqs``.'''
        # Start from an empty table: bits are prepended to the existing code of
        # each element below, so leftovers from an earlier build would corrupt
        # the new codes.
        self.codes.clear()

        groups = [[e] for e in self.freqs]          # each group is one subtree's leaves
        weights = [self.freqs[e] for e in self.freqs]

        if len(groups) == 1:
            # A lone symbol would otherwise end up with the empty code, which
            # encode() treats as "unknown element".
            self.codes[groups[0][0]] = '0'

        while len(weights) > 1:
            # Ascending sort puts the two lightest subtrees at index 0 and 1.
            # The ordering of equal weights produced by heap_sort decides which
            # of several equally good codes is generated.
            groups, weights = self.heap_sort(groups, weights)
            for e in groups[0]:
                self.codes[e] = '0' + self.codes[e]
            for e in groups[1]:
                self.codes[e] = '1' + self.codes[e]
            # Merge the two lightest subtrees into one and append it at the end.
            weights = weights[2:] + [weights[0] + weights[1]]
            groups = groups[2:] + [groups[0] + groups[1]]

        # Inverse table used by decode().
        self.elements = {code: e for e, code in self.codes.items()}

    def heap_sort(self, e_list, freq_list):
        '''Return copies of ``e_list`` and ``freq_list`` sorted by ascending frequency.

        Both lists are permuted identically, so ``e_list[i]`` still belongs to
        ``freq_list[i]`` in the result. The inputs are not modified. The sort
        is a heap sort and therefore not stable for equal frequencies.
        '''
        # 1-based layout: index 0 is padding so that the children of node i
        # live at 2*i and 2*i + 1.
        groups = [None] + list(e_list)
        weights = [0] + list(freq_list)
        last = len(weights) - 1

        # Build a max-heap by sifting every parent down, deepest parent first.
        for parent in range(last // 2, 0, -1):
            self._sift_down(groups, weights, parent, last)

        # Repeatedly move the current maximum to the end of the unsorted part
        # and restore the heap on the remaining prefix.
        for end in range(last, 1, -1):
            weights[1], weights[end] = weights[end], weights[1]
            groups[1], groups[end] = groups[end], groups[1]
            self._sift_down(groups, weights, 1, end - 1)

        return groups[1:], weights[1:]

    @staticmethod
    def _sift_down(groups, weights, start, end):
        '''Sink the node at ``start`` until the max-heap property holds within ``[start, end]``.

        ``groups`` and ``weights`` are 1-based parallel lists and are modified
        in place.
        '''
        held_weight = weights[start]
        held_group = groups[start]
        node = start
        child = 2 * node
        while child <= end:
            # Pick the larger of the two children (the right one only if strictly larger).
            if child < end and weights[child] < weights[child + 1]:
                child += 1
            if not held_weight < weights[child]:
                break
            # Pull the larger child up and continue one level down.
            weights[node] = weights[child]
            groups[node] = groups[child]
            node = child
            child = 2 * node
        weights[node] = held_weight
        groups[node] = held_group

    def encode(self, iterable):
        '''Return the concatenated bit string for the elements of ``iterable``.

        Raises:
            KeyError: if an element has no code, i.e. it was never passed to ``fit``.
        '''
        pieces = []
        for e in iterable:
            # .get() avoids inserting unknown elements into the defaultdict.
            code = self.codes.get(e, '')
            if code == '':
                # (e,) keeps the formatting correct when e itself is a tuple.
                raise KeyError('Found no record of element <%s> in Huffman Tree. Please fit the sequence or element into library before encoding.' % (e,))
            pieces.append(code)
        return ''.join(pieces)

    def decode(self, codes):
        '''Return the list of elements encoded by the bit string ``codes``.

        Raises:
            KeyError: if ``codes`` ends with bits that do not form a known code.
        '''
        iterable = []
        code = ''
        for bit in codes:
            code += bit
            # Membership test instead of a None sentinel so that None itself
            # can be a decoded element.
            if code in self.elements:
                iterable.append(self.elements[code])
                code = ''
        if code:
            raise KeyError('Found no record of code <%s> in Huffman Tree.' % (code,))
        return iterable
# Demo: build a Huffman table from a short string, then encode and decode it.
text = 'This is a text.'
tree = Huffman_coding()
tree.fit(text)  # feed the sample in to build / extend the Huffman tree
codes = tree.encode(text)  # bit string for the sample
recovered_text = tree.decode(codes)  # decode the bit string back into elements
# Show the frequency table, then the code table, one per line.
print(tree.freqs, tree.codes, sep='\n')