python-霍夫曼编码实现压缩和解压缩（二）

最新推荐文章于 2023-06-23 22:53:03 发布

来自比邻星

最新推荐文章于 2023-06-23 22:53:03 发布

阅读量1.5k

点赞数 5

分类专栏： Python学习笔记

本文链接：https://blog.csdn.net/sinat_40936062/article/details/106910216

版权

Python学习笔记专栏收录该内容

8 篇文章 3 订阅

订阅专栏

1.问题定位

前一篇《python-霍夫曼编码实现压缩和解压缩》的部分内容来自文中给出的博客。但是在实际运行测试过程中发现一个致命问题：对于权值相同的字符，每次迭代排序后的先后顺序并不确定，同一个字符可能被编码为0，也可能被编码为1，这往往造成成对的编码、译码错误。问题主要出在下面的代码中：

sorts = sorted(l,key = lambda x:x.value,reverse = False)

在实际测试中，该函数往往造成上述提到的问题，因此解决办法是对该函数进行改造和重写。

2.解决办法

先将字符按出现频率从大到小排序，然后将出现频率最低的两个字符（如果频率相同，将ASCII码大的排在前面）分别编码为0、1。
编码后，将这两个已编码字符的频率相加，并以其中ASCII码较大的符号代表合并后的整体，重新参与排序。
排序完成后，再将出现频率最小的两个符号分别编码为0、1；对于频率相同的情况，仍然将ASCII码较大的排在前面。
以此类推，直到所有字符都完成编码。

3.代码详细注释

3.1 定义哈夫曼树的节点类

为了使思路更加清晰、有助于算法实现，我们可以将单个节点定义为一个类，从而大大简化了二叉树的维护机制。

class node(object):
    """A single node of a Huffman tree.

    Attributes:
        value: weight stored in the node (for a parent built by
            ``build_father`` this is the sum of its children's values).
        left: left child, or None.
        right: right child, or None.
        father: parent node, or None for a root.
    """

    def __init__(self, value=None, left=None, right=None, father=None):
        self.value = value
        self.left = left
        self.right = right
        self.father = father

    @staticmethod
    def build_father(left, right):
        """Create and return a parent for ``left`` and ``right``.

        The parent's value is ``left.value + right.value``; both children get
        their ``father`` attribute pointed at the new parent.
        """
        n = node(value=left.value + right.value, left=left, right=right)
        left.father = right.father = n
        return n

    @staticmethod
    def encode(n):
        """Return the code of node ``n`` as bytes made of b'0' / b'1'.

        The code is the path from the root down to ``n``: b'0' for every step
        into a left child, b'1' otherwise.  A node without a father (a root)
        yields b''.
        """
        # Walk upwards and collect the branch labels leaf-to-root; an explicit
        # loop keeps very deep trees from hitting the recursion limit.
        bits = []
        while n.father is not None:
            bits.append(b'0' if n.father.left is n else b'1')
            n = n.father
        # The labels were gathered bottom-up, the code reads top-down.
        bits.reverse()
        return b''.join(bits)

3.2 构建哈夫曼树

由于哈夫曼编码的过程中有许多步骤重复执行，因此在节点类的基础上，借助递归的思想来完成哈夫曼树的构建。

def build_tree(l):
    """Merge the nodes in ``l`` into one Huffman tree.

    The two nodes with the smallest ``value`` are repeatedly replaced by a
    parent created with ``node.build_father`` until one node is left.

    Returns a list holding that single remaining node (the root).  When ``l``
    already has exactly one element, ``l`` itself is returned; ``l`` is never
    modified.
    """
    pending = l
    while len(pending) != 1:
        # Ascending by weight.  sorted() is stable, so nodes with equal
        # weights keep the relative order they had in the previous round.
        ordered = sorted(pending, key=lambda item: item.value)
        lightest, second = ordered[0], ordered[1]
        parent = node.build_father(lightest, second)
        # Drop the two merged nodes and queue their parent for the next round.
        pending = ordered[2:]
        pending.append(parent)
    return pending

3.3 利用构建好的哈夫曼树进行编码

在上一步中构建好的哈夫曼树的基础上进行编码：

def encode(echo):
    """Fill ``ec_dict`` with the code of every entry in ``node_dict``.

    For each key of ``node_dict`` the code returned by ``node.encode`` for the
    associated node is stored in ``ec_dict`` under the same key.

    echo: when equal to True, each key and its code are printed (debug aid).
    """
    show_table = echo == True
    for symbol, leaf in node_dict.items():
        code = node.encode(leaf)
        ec_dict[symbol] = code
        if show_table:
            print(symbol)
            print(code)

3.4 实现文件压缩、解压函数

既然我们实现的是压缩算法，那么就必须能够实现文件的压缩、解压操作才有意义。如果只能实现字符串的编码或者压缩解压，是没有很大的实用价值的。

文件压缩：

def encodefile(file):
    """Huffman-compress ``file`` and write the result as ``<name>.ys``.

    The output is placed in the same directory as ``file``; ``<name>`` is the
    file's base name up to its first dot.

    Output layout (all integers big-endian):
        2 bytes   number of distinct byte values (symbols)
        1 byte    width, in bytes, of each frequency field
        per symbol: 1 byte symbol followed by its frequency
        payload   the code bits of every input byte, packed most significant
                  bit first; the final byte is zero-padded on the right

    The function uses the module-level tables ``count_dict``, ``node_dict``
    and ``ec_dict`` (defined outside this block) and resets them first, so
    repeated calls do not mix data from different files.

    Note: when the input holds only one distinct byte value its code is
    empty, so the payload is empty and only the header describes the data.
    An empty input produces a header with zero symbols.
    """
    import os

    print("Starting encode...")
    bytes_width = 1                     # symbols are single bytes
    with open(file, "rb") as f:
        data = f.read()
    # Kept as a float so the size is printed exactly as before.
    count = len(data) / bytes_width
    print(count)

    # Start from clean shared tables; stale entries from an earlier call
    # would otherwise end up in the header and in the tree.
    count_dict.clear()
    node_dict.clear()
    ec_dict.clear()

    # Count symbol frequencies.  Keys are 1-byte ``bytes`` objects and are
    # inserted in order of first appearance in the input.
    symbols = [bytes((v,)) for v in range(256)]
    for value in data:
        key = symbols[value]
        count_dict[key] = count_dict.get(key, 0) + 1
    print("Read OK")
    print(count_dict)

    # One leaf per symbol, then build the tree and the code table.
    nodes = []
    for x in count_dict:
        node_dict[x] = node(count_dict[x])
        nodes.append(node_dict[x])
    if nodes:                           # nothing to build for an empty file
        build_tree(nodes)
    encode(False)
    print("Encode OK")

    # Smallest number of bytes able to hold the largest frequency.
    max_count = max(count_dict.values(), default=0)
    print("head:", max_count)
    bit_width = max(1, (max_count.bit_length() + 7) // 8)
    print("bit_width:", bit_width)

    # Only the base name is cut at its first dot, so dots inside directory
    # names (e.g. "./data/in.txt") no longer truncate the path.
    directory, base = os.path.split(file)
    out_path = os.path.join(directory, base.split('.')[0] + ".ys")

    with open(out_path, 'wb') as o:
        # Header: symbol count, field width, then (symbol, frequency) pairs.
        o.write(len(ec_dict).to_bytes(2, byteorder='big'))
        o.write(bit_width.to_bytes(1, byteorder='big'))
        for x in ec_dict:
            o.write(x)
            o.write(count_dict[x].to_bytes(bit_width, byteorder='big'))
        print('head OK')

        # Code lookup indexed by byte value; unused values stay None.
        codes = [ec_dict.get(s) for s in symbols]
        total = len(data)
        acc = 0                         # bits collected for the current byte
        filled = 0                      # how many bits ``acc`` holds
        last = 0                        # last progress percentage printed
        for i, value in enumerate(data):
            for bit in codes[value]:
                # Codes are ASCII digits: 49 is '1', anything else is '0'.
                acc = (acc << 1) | (1 if bit == 49 else 0)
                filled += 1
                if filled == 8:
                    o.write(bytes((acc,)))
                    acc = 0
                    filled = 0
                    tem = int(i / total * 100)
                    if tem > last:
                        print("encode:", tem, '%')
                        last = tem

        # Left-align the leftover bits; the low bits of the byte stay zero.
        if filled:
            o.write(bytes((acc << (8 - filled),)))
    print("File encode successful.")

解压文件：

def decodefile(inputfile, outputfile):
    """Decompress the Huffman archive ``inputfile`` into ``outputfile``.

    Expected input layout (all integers big-endian):
        2 bytes   number of symbols in the table
        1 byte    width, in bytes, of each frequency field
        per symbol: 1 byte symbol followed by its frequency
        payload   code bits, most significant bit of each byte first

    The tree is rebuilt from the frequency table in header order with
    ``build_tree`` / ``encode``, then the payload is decoded bit by bit.

    Decoding stops once as many symbols were written as the header
    frequencies add up to, so padding bits in the last payload byte are not
    turned into extra output.  An archive with a single symbol (whose code
    is empty) is expanded directly from its frequency; an archive with zero
    symbols yields an empty output file.

    The function uses the module-level tables ``node_dict``, ``ec_dict`` and
    ``inverse_dict`` (defined outside this block) and resets them first.
    """
    print("Starting decode...")

    # Start from clean shared tables; entries left by an earlier call would
    # otherwise be encoded again and pollute the reverse lookup.
    node_dict.clear()
    ec_dict.clear()
    inverse_dict.clear()
    nodes = []                          # leaves, in header order
    last = 0                            # last progress percentage printed

    with open(inputfile, 'rb') as f, open(outputfile, 'wb') as o:
        count = int.from_bytes(f.read(2), byteorder='big')
        bit_width = int.from_bytes(f.read(1), byteorder='big')

        # Frequency table: symbol -> number of occurrences.
        de_dict = {}
        for _ in range(count):
            key = f.read(1)
            de_dict[key] = int.from_bytes(f.read(bit_width), byteorder='big')

        # Rebuild the tree and the code table from the frequencies.
        for x in de_dict:
            node_dict[x] = node(de_dict[x])
            nodes.append(node_dict[x])
        if nodes:                       # nothing to build for an empty table
            build_tree(nodes)
        encode(False)
        for x in ec_dict:               # reverse lookup: code -> symbol
            inverse_dict[ec_dict[x]] = x

        # Number of symbols still to be written.
        remaining = sum(de_dict.values())

        if len(de_dict) == 1:
            # A lone symbol has an empty code, so the payload carries no
            # bits for it; its frequency alone describes the data.
            (symbol, times), = de_dict.items()
            o.write(symbol * times)
            remaining = 0

        header_end = f.tell()
        payload = f.read()
        eof = header_end + len(payload)
        data = b''                      # bits of the code being assembled
        # ``i`` is the file position just past the current payload byte.
        for i, raw in enumerate(payload, start=header_end + 1):
            if remaining == 0:
                break
            for shift in range(7, -1, -1):
                data += b'1' if (raw >> shift) & 1 else b'0'
                symbol = inverse_dict.get(data)
                if symbol is not None:
                    o.write(symbol)
                    data = b''
                    remaining -= 1
                    if remaining == 0:
                        # Whatever follows in this byte is padding.
                        break
            tem = int(i / eof * 100)
            if tem > last:
                print("decode:", tem, '%')
                last = tem

    print("File decode successful.")

4.代码改进

以上内容分析了代码的主要步骤都干了些什么，下一步则是根据定位到的问题，对代码进行改进。改进代码正在整理，过段时间发出来。

来自比邻星

关注

5
点赞
踩
18

收藏

觉得还不错? 一键收藏
2
评论
python-霍夫曼编码实现压缩和解压缩（二）

前一篇《python-霍夫曼编码实现压缩和解压缩》部分内容均来自文中给出的博客。但是在实际运行测试过程中有一个致命问题，就是对于权值相同的字符，每次迭代排序时编码要么是0、要么是1，这往往造成成对的编译码错误。
复制链接

扫一扫

专栏目录