'''
Huffman coding: build a Huffman tree and use it to compress / decompress files.

A tree is built from a list of (key, weight) pairs such as
[('a', 3), ('b', 2)], where the weight of a key is the number of times it
appears in the input.

Compressed file layout (written by Compress, read by Decompress):
  1. the tree in pre-order: an internal node is the two bytes 0xFF 0xFE,
     a leaf is the byte 0x00 followed by its one-byte key;
  2. one byte holding the number of unused padding bits in the final,
     partially filled byte (0 when the bit stream ends on a byte boundary);
  3. that final, partially filled byte (zero padded on the right);
  4. every completely filled byte of the encoded bit stream.
An empty input has no tree: its compressed form is only items 2 and 3.
'''
from copy import copy

try:  # Python 3 module name
    from queue import PriorityQueue
except ImportError:  # Python 2 module name
    from Queue import PriorityQueue

# treeWriter is only needed by HuffmanTreeWriter (debug drawing); a missing
# module must not prevent compressing / decompressing.
try:
    from treeWriter import TreeWriter
except ImportError:
    class TreeWriter(object):
        '''Placeholder used when the optional treeWriter module is missing.'''

        def __init__(self, *args, **kwargs):
            raise ImportError('the treeWriter module is required to draw trees')


# Serialization tags of the pre-order tree header.
_INTERNAL_MARK = b'\xff\xfe'
_LEAF_MARK = b'\x00'
# Number of bytes read from / buffered for a file at a time.
_CHUNK_SIZE = 1 << 16


def _iter_bytes(stream, chunk_size=_CHUNK_SIZE):
    '''Yield the content of a binary stream as one-byte byte strings.'''
    while True:
        chunk = stream.read(chunk_size)
        if not chunk:
            return
        for index in range(len(chunk)):
            # slicing (not indexing) keeps a byte string on Python 2 and 3
            yield chunk[index:index + 1]


class NodeBase(object):
    '''Common base of tree nodes; every node carries a weight.'''

    def __init__(self):
        self.weight = 0

    def elem(self):
        '''Return the weight of this node.'''
        return self.weight


class Node(NodeBase):
    '''Internal tree node with a left (bit 0) and a right (bit 1) child.'''

    def __init__(self, weight=0, left=None, right=None):
        self.weight = weight
        self.left = left
        self.right = right

    def __str__(self):
        return str(self.weight)


class Leaf(NodeBase):
    '''Tree leaf holding a key and its weight.'''

    def __init__(self, key='', weight=0):
        self.key = key
        self.weight = weight

    def __str__(self):
        return str(self.key)


def convert(c):
    '''
    Return the 8 binary digits of a byte as a list of '0' / '1' characters.

    c is a one-character string / one-byte byte string, or an int in 0..255.
    Example: convert('a') -> ord('a') == 97 == 0b01100001
             -> ['0', '1', '1', '0', '0', '0', '0', '1']
    '''
    value = c if isinstance(c, int) else ord(c)
    return list(format(value, '08b'))


class HuffmanTree(object):
    '''
    Base class for HuffmanTreeForCompress and HuffmanTreeForDecompress.
    self.root is the root node, or None for an empty tree.
    '''

    def __init__(self):
        self.root = None


class HuffmanTreeForCompress(HuffmanTree):
    '''
    Huffman tree for the compressing process.

    self.list is like [('a', 3), ('b', 4)] where 'a' is a key and 3 its
    weight (how often 'a' appears in the text).
    self.dict maps every key to its code as a list of 0 / 1 ints,
    like {'a': [0, 1, 1, 0], ...}.
    '''

    # NOTE: the parameter name shadows the builtin `list`; it is kept
    # because it is part of the constructor's signature.
    def __init__(self, list):
        HuffmanTree.__init__(self)
        self.list = list
        self.dict = {}

        self.__buildTree()
        self.__genEncode()

    def __initPriorityQueue(self, queue):
        '''
        Fill the priority queue with one leaf per key, lowest weight first.
        Return the next free sequence number.
        '''
        order = 0
        for key, weight in self.list:
            # The sequence number breaks weight ties so that node objects,
            # which define no ordering, are never compared.
            queue.put((weight, order, Leaf(key, weight)))
            order += 1
        return order

    def __buildTree(self):
        '''
        Build the tree from self.list with a priority queue (greedy
        algorithm: always merge the two nodes of least weight).
        '''
        length = len(self.list)
        if length == 0:
            # Nothing to encode; calling get() on an empty queue would block.
            self.root = None
            return
        queue = PriorityQueue(length)
        order = self.__initPriorityQueue(queue)
        if length == 1:
            # A lone leaf as root would get a zero-length code and could not
            # be read back by the decoder, which expects an internal root.
            # Hang the leaf on both sides so the key gets a one-bit code.
            leaf = queue.get()[2]
            self.root = Node(leaf.weight, leaf, leaf)
            return
        # length - 1 merges leave exactly one node in the queue
        for _ in range(length - 1):
            left = queue.get()[2]
            right = queue.get()[2]
            weight = left.weight + right.weight
            queue.put((weight, order, Node(weight, left, right)))
            order += 1
        self.root = queue.get()[2]

    def __genEncode(self):
        '''
        Compute the code of every key by a depth-first walk of the tree
        (left edge = 0, right edge = 1) and store it in self.dict.
        '''
        if self.root is None:
            return

        def genEncodeHelp(root, encode):
            if isinstance(root, Leaf):
                # encode is mutated during the walk, so store a snapshot
                self.dict[root.key] = copy(encode)
                return
            encode.append(0)
            genEncodeHelp(root.left, encode)
            encode[-1] = 1
            genEncodeHelp(root.right, encode)
            encode.pop()

        genEncodeHelp(self.root, [])


class HuffmanTreeForDecompress(HuffmanTree):
    '''
    Huffman tree rebuilt from the header of a compressed file for the
    decompressing process.
    '''

    def __init__(self, infile):
        HuffmanTree.__init__(self)
        self.__buildTree(infile)

    def __buildTree(self, infile):
        '''
        Read the pre-order serialized tree from infile (a binary stream
        positioned at the start of the header).

        If the stream starts with two zero bytes (the compressed form of an
        empty input) the tree is empty, self.root is None and those two
        bytes are consumed.
        Raises ValueError when the header is truncated or malformed.
        '''
        def buildTreeHelp(infile):
            first = infile.read(1)
            second = infile.read(1)
            if not first or not second:
                # without this check a truncated header recurses forever
                raise ValueError('truncated Huffman tree header')
            if first == _LEAF_MARK:  # is leaf, not consider unicode now
                return Leaf(second)
            node = Node()
            node.left = buildTreeHelp(infile)
            node.right = buildTreeHelp(infile)
            return node

        mark = infile.read(2)
        if mark == b'\x00\x00':
            self.root = None
            return
        if mark != _INTERNAL_MARK:
            raise ValueError('stream does not start with a Huffman tree header')
        self.root = Node()
        self.root.left = buildTreeHelp(infile)
        self.root.right = buildTreeHelp(infile)


class Decompress(object):
    '''
    Decompress a file written by Compress.

    infileName is the compressed file; outfileName defaults to
    infileName + '.de'.
    '''

    def __init__(self, infileName, outfileName=''):
        self.infile = open(infileName, 'rb')
        if outfileName == '':
            outfileName = infileName + '.de'
        try:
            self.outfile = open(outfileName, 'wb')
        except Exception:
            self.infile.close()
            raise
        self.tree = None

    def close(self):
        '''Close both files; safe to call more than once.'''
        for name in ('infile', 'outfile'):
            handle = getattr(self, name, None)
            if handle is not None:
                handle.close()

    def __del__(self):
        self.close()

    def decompress(self):
        '''Rebuild the tree from the file header, then decode the payload.'''
        self.__rebuildHuffmanTree()
        self.__decodeFile()

    def __rebuildHuffmanTree(self):
        self.infile.seek(0)
        self.tree = HuffmanTreeForDecompress(self.infile)

    def __payload(self, lastByte, validBits):
        '''
        Yield (byte value, number of meaningful bits) in decoding order:
        every full byte of the stream, then the partially filled last byte.
        '''
        while True:
            chunk = self.infile.read(_CHUNK_SIZE)
            if not chunk:
                break
            for value in bytearray(chunk):
                yield value, 8
        if validBits:
            yield lastByte, validBits

    def __decodeFile(self):
        '''
        Decode the bit stream that follows the tree header into self.outfile.
        Raises ValueError when the two bytes after the tree are missing or
        the padding count is not in 0..7.
        '''
        self.outfile.seek(0)
        root = self.tree.root
        if root is None:  # empty input: there is nothing to decode
            self.outfile.flush()
            return
        header = bytearray(self.infile.read(2))
        if len(header) < 2 or header[0] > 7:
            raise ValueError('corrupt or truncated compressed data')
        # leftBit: padding bits of the last byte; lastByte: that byte itself
        leftBit, lastByte = header
        decoded = bytearray()
        curNode = root
        # Only the first 8 - leftBit bits of the last byte are data; all of
        # them are decoded, since several symbols may share that byte.
        for value, bits in self.__payload(lastByte, (8 - leftBit) % 8):
            for shift in range(7, 7 - bits, -1):
                if (value >> shift) & 1:
                    curNode = curNode.right
                else:
                    curNode = curNode.left
                if isinstance(curNode, Leaf):
                    decoded += curNode.key
                    curNode = root
            if len(decoded) >= _CHUNK_SIZE:
                self.outfile.write(bytes(decoded))
                del decoded[:]
        self.outfile.write(bytes(decoded))
        self.outfile.flush()


class Compress(object):
    '''
    Compress a file with Huffman coding.

    infileName is the file to compress; outfileName defaults to
    infileName + '.compress'. self.dict maps every byte of the input
    (as a one-byte byte string) to its number of occurrences.
    '''

    def __init__(self, infileName, outfileName=''):
        self.infile = open(infileName, 'rb')
        if outfileName == '':
            outfileName = infileName + '.compress'
        try:
            self.outfile = open(outfileName, 'wb')
        except Exception:
            self.infile.close()
            raise
        self.dict = {}
        self.tree = None

    def close(self):
        '''Close both files; safe to call more than once.'''
        for name in ('infile', 'outfile'):
            handle = getattr(self, name, None)
            if handle is not None:
                handle.close()

    def __del__(self):
        self.close()

    def compress(self):
        '''Count byte frequencies, build the tree, write the output file.'''
        self.__caculateFrequence()
        self.__createHuffmanTree()
        self.__writeCompressedFile()

    def __caculateFrequence(self):
        '''
        First pass over the input file: count how often each byte occurs
        and store the counts in self.dict.
        '''
        self.infile.seek(0)
        counts = self.dict
        for c in _iter_bytes(self.infile):
            # a first occurrence counts as 1
            counts[c] = counts.get(c, 0) + 1

    def __createHuffmanTree(self):
        '''
        Build a huffman tree from the items of self.dict.
        '''
        # sorted() makes the resulting tree independent of dict ordering
        self.tree = HuffmanTreeForCompress(sorted(self.dict.items()))

    def __writeCompressedFile(self):
        '''
        Create the compressed file: first write the huffman tree to the
        head of outfile, then the encoded content of the input file.
        '''
        self.outfile.seek(0)
        self.__serializeTree()
        self.__encodeFile()

    def __serializeTree(self):
        '''
        Write the tree to self.outfile in pre-order: an internal node as the
        bytes 0xFF 0xFE, a leaf as the byte 0x00 followed by its key.
        Nothing is written for an empty tree.
        '''
        def serializeTreeHelp(root, mfile):
            if isinstance(root, Leaf):
                mfile.write(_LEAF_MARK)
                mfile.write(root.key)
                return
            mfile.write(_INTERNAL_MARK)
            serializeTreeHelp(root.left, mfile)
            serializeTreeHelp(root.right, mfile)

        if self.tree.root is not None:
            serializeTreeHelp(self.tree.root, self.outfile)

    def __encodeFile(self):
        '''
        Second pass over the input file: translate every byte to its code
        and write the packed bits to self.outfile.
        '''
        self.infile.seek(0)
        # remember this position, the two bytes are filled in at the end
        pos = self.outfile.tell()
        self.outfile.write(b'\x00\x00')  # padding count, partial last byte
        codes = self.tree.dict
        packed = bytearray()
        num = 0  # bits collected for the byte in progress
        i = 0    # how many bits num currently holds
        for c in _iter_bytes(self.infile):
            for x in codes[c]:
                num = (num << 1) | x
                i += 1
                if i == 8:
                    packed.append(num)
                    num = 0
                    i = 0
            if len(packed) >= _CHUNK_SIZE:
                self.outfile.write(bytes(packed))
                del packed[:]
        self.outfile.write(bytes(packed))

        # Pad the unfinished byte with zeros on the right: if the last two
        # bits are 11, six bits are left over and the byte is 1100 0000.
        leftBit = (8 - i) % 8
        num <<= leftBit

        # Just after the tree, store how many bits of the last byte are
        # padding, followed by that last byte.
        self.outfile.seek(pos)
        self.outfile.write(bytes(bytearray([leftBit, num])))
        # flush so the file can be read back before this object is closed
        self.outfile.flush()


class HuffmanTreeWriter(TreeWriter):
    '''
    Draw a huffman tree; for huffman debugging only.

    Relies on the TreeWriter base class, which is not part of this file;
    self.num and self.num2 are presumably counters it initializes
    (confirm against treeWriter).
    '''

    def writeHelp(self, root, A):
        '''
        Add the subtree rooted at root to graph A and return the id of the
        graph node created for root.
        '''
        p = str(self.num)
        self.num += 1

        if isinstance(root, Leaf):
            # TODO show the key in the label too ('\n' keys need escaping)
            A.add_node(p, label=str(root.elem()) + r'\n', shape='rect')
            return p

        # a non-leaf of a huffman tree always has both children
        A.add_node(p, label=str(root.elem()))

        q = self.writeHelp(root.left, A)
        A.add_node(q, label=str(root.left.elem()))
        A.add_edge(p, q, label='0')

        r = self.writeHelp(root.right, A)
        A.add_node(r, label=str(root.right.elem()))
        A.add_edge(p, r, label='1')

        # invisible node and edges between the two children
        l = str(self.num2)
        self.num2 -= 1
        A.add_node(l, style='invis')
        A.add_edge(p, l, style='invis')
        B = A.add_subgraph([q, l, r], rank='same')
        B.add_edge(q, l, style='invis')
        B.add_edge(l, r, style='invis')

        return p


if __name__ == '__main__':
    import sys
    if len(sys.argv) == 1:
        inputFileName = 'test.log'
    else:
        inputFileName = sys.argv[1]
    compress = Compress(inputFileName)
    compress.compress()
    compress.close()

    decompress = Decompress(inputFileName + '.compress')
    decompress.decompress()
    decompress.close()