# -*- coding: utf-8 -*-
"""
Created on Thu Apr 9 11:34:07 2020
@author: 周楠
"""
import math
import xlwt
p_dict={}#这个字典用来存信源字母及其概率
alphabet=' abcdefghijklmnopqrstuvwxyz'
value=[0.1956,0.063,0.0105,0.023,0.035,0.105,0.0225,0.011,0.047,0.029,0.001,0.003,0.055,0.021,0.059,0.0654,0.0175,0.001,0.054,0.052,0.072,0.0225,0.008,0.012,0.002,0.012,0.001]
string='The fundamental problem of communication is that of reproducing at one point either exactly or approximately a message selected at another point'
#上面就是最后要编译的文本
string = string.lower()#把文本大写全部换成小写
for i in range(len(alphabet)):
p_dict[alphabet[i]]=value[i]#字典格式{字母1:概率,字母2:概率....}
#树节点类的定义
class treenode:
def __init__(self,key,freq):
self.key = key #节点的名字
self.freq = freq #节点的概率
self.leftchild = None
self.midchild = None
self.rightchild = None
self.code = '' #节点的编码
#创建树节点队列函数
def create_noteQ(p_dict):
Q=[]
for i in p_dict.keys():
Q.append(treenode(i,p_dict[i]))
Q.sort(key=lambda item:item.freq,reverse = True)#这里就是用lambda隐函数实现队列按照字母的概率降序排列
return Q
#向队列中添加节点,并保证按照概率降序排序
def addQ(Q, nodeNew):
if len(Q) == 0:
return [nodeNew]
else:
Q=Q+[nodeNew]
Q.sort(key=lambda item:item.freq,reverse=True)
#每次加入节点都需要重新排列成降序
return Q
#队列类的定义,该类有两个功能函数,当然本身有一个创建初始化的函数。另两个分别是添加节点和弹出最低概率节点。
class Nodequeue:
def __init__(self,p_dict):
self.que = create_noteQ(p_dict)
self.size = len(self.que)
def addnode(self,node):
self.que = addQ(self.que, node)
self.size += 1
def popNode(self):
self.size -= 1
return self.que.pop()
#加入队列长度要+1,弹出长度-1
#接下来就可以开始生成huffman树了
#创建huffman树,最后返回的是树的根节点
def creatHuffmanTree(nodeQ,exact_division):#第一个参数是队列,第二个参数就是是否能整除,一会儿在最后讲
if exact_division == True:
node1 = nodeQ.popNode()
node2 = nodeQ.popNode()
r = treenode(None,node1.freq+node2.freq)
r.leftchild = node2
r.midchild = node1
nodeQ.addnode(r)
while nodeQ.size != 1:
node1 = nodeQ.popNode()
node2 = nodeQ.popNode()
node3 = nodeQ.popNode()
r = treenode(None, node1.freq+node2.freq+node3.freq)#这里节点的名字是None表示是一个虚点,是由其他的点生成的一个节点,最后形成编码字典的时候会跳过虚点
r.leftchild = node3
r.midchild = node2
r.rightchild = node1
nodeQ.addnode(r)
return nodeQ.popNode()
#最后返回的是队列的最后一个节点,也就是概率最大的点,就是这个huffman树的根节点
#接下来就是由树得到编码表
codeDic1 = {}#编码字典
codeDic2 = {}#解码字典,可以看出来两个字典相当于互为“反函数”的感觉,这里解码也是方便后面验证编码对不对
# 由哈夫曼树得到哈夫曼编码表,中序遍历,逢层赋值
def HuffmanCodeDic(roof, x):
global codeDic, codeList
if roof:#只要根不为空
HuffmanCodeDic(roof.leftchild, x+'0')
roof.code += x
if roof.key:
codeDic2[roof.code] = roof.key
codeDic1[roof.key] = roof.code
HuffmanCodeDic(roof.midchild, x+'1')
HuffmanCodeDic(roof.rightchild, x+'2')
#中序编列,逢层赋值,这里循环可以好好品一品,很简单也很简洁。
# 字符串编码
def TransEncode(string):
global codeDic1
transcode = ""
for i in string:
transcode += codeDic1[i]
return transcode
# 字符串解码
def TransDecode(StringCode):
global codeDic2
code = ""
ans = ""
for ch in StringCode:
code += ch
if code in codeDic2:
ans += codeDic2[code]
code = ""
return ans
#编码解码的算法很常规很简单,直接搬运了兄长的,偷了点小懒
#接下来就是使用上面写好的函数咯
if len(p_dict.keys()) % 2 == 0:
exact_division = True
else:
exact_division = False
t = Nodequeue(p_dict)
tree = creatHuffmanTree(t,exact_division)
HuffmanCodeDic(tree, '')
print(codeDic1,codeDic2)
a = TransEncode(string)
print(a)
aa = TransDecode(a)
print(aa.capitalize())
aver_length = len(a)/len(string)
print('平均编码长度:',aver_length)
#H = comentropy() 这个H就是信源熵哈,我这里调用了我之前自己写的一个算熵的函数,很简单就不复制上来了,所以我这里都注释掉了,当然如果要算编码效率,算出H就好,算出H的话下面两行就可以正常跑了
#efficienty = H/(aver_length*math.log(3,2))
#print('编码效率:',"%.2f%%" % (efficienty*100))
#将编码字典输出到表格
file=xlwt.Workbook(encoding='utf-8')
sheet = file.add_sheet('sheet1')
title=['码字','编码']
for col in range(len(title)):
sheet.write(0,col,title[col])
row=1
for k in codeDic1.keys():
data=[k,codeDic1[k]]
for index in range(len(data)):
sheet.write(row,index,data[index])
row+=1
file.save('码字编码表.xlsx')
print('保存成功')