霍夫曼编码的python实现

霍夫曼编码的python实现

在这里插入图片描述

霍夫曼编码是一种经典的数据压缩算法,通过将出现频次高的符号用较少的位数进行编码,将出现频次比较低的符号用较多的位数进行编码。具体原理见哈夫曼编码及其应用——数据压缩(Huffman compression)

霍夫曼树

霍夫曼编码可以通过霍夫曼树来实现。在构造霍夫曼树前,先统计每个字符出现的频次并按升序排列,将每个字符初始化为一个叶节点。构造树时,取出频次最低的两个节点合并成一棵子树:左子节点的编码设为0,右子节点的编码设为1,新根节点的频次为两个子节点的频次之和;再将该根节点当作新的节点按频次放回序列,重复上述步骤,直到只剩下一个节点,它就是所需霍夫曼树的根。最后,为了得到每个符号的完整编码,还需要从根节点出发对整棵树做一次递归遍历,把从根到叶路径上的0/1依次拼接起来。

代码实现

定义节点和栈

这里的“栈”实际上是一个按频次升序排列的有序列表(相当于优先队列),用于存放各个节点:每次从头部取出频次最小的两个节点,合并后的新节点再按频次插回。

class Node:
    """A vertex of the Huffman tree.

    Attributes:
        symbol: the value this node stands for; huffman() passes None
            when it creates a merged (internal) node.
        freq: occurrence count (for merged nodes, the sum of both children).
        code: list of bits assigned to this node, or None until assigned.
        left / right: child nodes, or None.
    """

    def __init__(self, symbol, freq) -> None:
        self.symbol, self.freq = symbol, freq
        # Nothing is known about the position in the tree yet.
        self.code = self.left = self.right = None

    def update(self):
        """Label the two children: bit 0 for the left one, bit 1 for the right."""
        for bit, child in enumerate((self.left, self.right)):
            child.code = [bit]
        
    
class Stack:
    """Nodes kept in ascending order of ``freq`` (lowest frequency first).

    Despite its name this container is used as a priority queue: the two
    lowest-frequency nodes are removed from the front with ``update()`` and
    merged nodes are put back in order with ``insert()``.
    """

    def __init__(self) -> None:
        self.stack = []     # all stored nodes, lowest frequency first

    def insert_sorted(self, node: "Node"):
        """Append ``node`` without searching.

        The caller must feed nodes in ascending frequency order; no check
        is made here.
        """
        self.stack.append(node)

    def __len__(self):
        return len(self.stack)

    def __getitem__(self, index):
        return self.stack[index]

    def insert(self, node: "Node"):
        """Insert ``node`` at its ordered position.

        The node goes in front of the first stored node whose frequency is
        strictly greater, so it lands after any nodes of equal frequency.
        """
        for position, existing in enumerate(self.stack):
            if existing.freq > node.freq:
                self.stack.insert(position, node)
                return
        self.stack.append(node)     # largest so far: goes to the end

    def update(self):
        """Remove and return the two lowest-frequency nodes as ``(s1, s2)``.

        Raises:
            IndexError: if fewer than two nodes are stored. The check is
                done before anything is removed, so the container is left
                unchanged when the call fails.
        """
        if len(self.stack) < 2:
            raise IndexError("update() needs at least two nodes")
        s1 = self.stack.pop(0)
        s2 = self.stack.pop(0)
        return s1, s2

统计频次并排序

def cal_symbol_freq(data):
    """Count every symbol in ``data`` and wrap the counts as leaf nodes.

    Returns a ``Stack`` holding one ``Node(symbol, count)`` per distinct
    symbol, ordered by ascending count. The sort is stable, so symbols with
    equal counts keep the order in which they were first seen.
    """
    counts = Counter(data)
    leaves = Stack()
    for symbol, count in sorted(counts.items(), key=lambda pair: pair[1]):
        # Already in ascending order, so a plain append is enough.
        leaves.insert_sorted(Node(symbol, count))
    return leaves

霍夫曼编码

def updateCode(tree: "Node", code):
    """Recursively complete the code of every node and record leaf codes.

    Each node starts with its own branch bit in ``tree.code``; the parent's
    bits are appended after it, so the list is stored leaf-bit-first and is
    reversed when the final string is written to the global ``word2code``.

    Args:
        tree: node to process; a node whose ``symbol`` is not None is
            treated as a leaf.
        code: the parent's (already completed) bit list, or None for the
            root, which has no parent bits to append.

    A root that is itself a leaf (input with a single distinct symbol) has
    no bits at all; it is given the one-bit code "0" instead of crashing.
    """
    global word2code
    if code is not None:
        tree.code.extend(code)
    if tree.symbol is not None:
        # An empty/None code only happens for a lone root leaf.
        bits = tree.code if tree.code else [0]
        word2code[tree.symbol] = "".join(str(bit) for bit in reversed(bits))
        return
    updateCode(tree.left, tree.code)
    updateCode(tree.right, tree.code)
    
def huffman(data):
    """Build the Huffman tree for ``data`` and fill the global ``word2code``.

    Args:
        data: iterable of hashable symbols.

    Returns:
        The root node of the tree, or None when ``data`` has no symbols.

    ``word2code`` is emptied first so that codes from an earlier call
    cannot leak into this result.
    """
    word2code.clear()
    symbol_freq = cal_symbol_freq(data)
    if len(symbol_freq) == 0:
        return None                 # nothing to encode
    while len(symbol_freq) > 1:
        # Merge the two least frequent nodes under a new parent ...
        s1, s2 = symbol_freq.update()
        root = Node(None, freq=s1.freq + s2.freq)
        root.left = s1
        root.right = s2
        root.update()
        # ... and put the parent back so it competes like any other node.
        symbol_freq.insert(root)
    tree = symbol_freq[0]
    if tree.left is None and tree.right is None:
        # Only one distinct symbol: no merge happened, so give it one bit.
        tree.code = [0]
    # The root has no parent bits, hence None.
    updateCode(tree, None)
    return tree

完整代码

from PIL import Image
import numpy as np
from collections import Counter

# Global symbol -> bit-string table; updateCode() writes one entry per leaf.
word2code = {}
class Node:
    """A vertex of the Huffman tree.

    Attributes:
        symbol: the value this node stands for; huffman() passes None
            when it creates a merged (internal) node.
        freq: occurrence count (for merged nodes, the sum of both children).
        code: list of bits assigned to this node, or None until assigned.
        left / right: child nodes, or None.
    """

    def __init__(self, symbol, freq) -> None:
        self.symbol, self.freq = symbol, freq
        # Nothing is known about the position in the tree yet.
        self.code = self.left = self.right = None

    def update(self):
        """Label the two children: bit 0 for the left one, bit 1 for the right."""
        for bit, child in enumerate((self.left, self.right)):
            child.code = [bit]
        
    
class Stack:
    """Nodes kept in ascending order of ``freq`` (lowest frequency first).

    Despite its name this container is used as a priority queue: the two
    lowest-frequency nodes are removed from the front with ``update()`` and
    merged nodes are put back in order with ``insert()``.
    """

    def __init__(self) -> None:
        self.stack = []     # all stored nodes, lowest frequency first

    def insert_sorted(self, node: "Node"):
        """Append ``node`` without searching.

        The caller must feed nodes in ascending frequency order; no check
        is made here.
        """
        self.stack.append(node)

    def __len__(self):
        return len(self.stack)

    def __getitem__(self, index):
        return self.stack[index]

    def insert(self, node: "Node"):
        """Insert ``node`` at its ordered position.

        The node goes in front of the first stored node whose frequency is
        strictly greater, so it lands after any nodes of equal frequency.
        """
        for position, existing in enumerate(self.stack):
            if existing.freq > node.freq:
                self.stack.insert(position, node)
                return
        self.stack.append(node)     # largest so far: goes to the end

    def update(self):
        """Remove and return the two lowest-frequency nodes as ``(s1, s2)``.

        Raises:
            IndexError: if fewer than two nodes are stored. The check is
                done before anything is removed, so the container is left
                unchanged when the call fails.
        """
        if len(self.stack) < 2:
            raise IndexError("update() needs at least two nodes")
        s1 = self.stack.pop(0)
        s2 = self.stack.pop(0)
        return s1, s2


def cal_symbol_freq(data):
    """Count every symbol in ``data`` and wrap the counts as leaf nodes.

    Returns a ``Stack`` holding one ``Node(symbol, count)`` per distinct
    symbol, ordered by ascending count. The sort is stable, so symbols with
    equal counts keep the order in which they were first seen.
    """
    counts = Counter(data)
    leaves = Stack()
    for symbol, count in sorted(counts.items(), key=lambda pair: pair[1]):
        # Already in ascending order, so a plain append is enough.
        leaves.insert_sorted(Node(symbol, count))
    return leaves

def updateCode(tree: "Node", code):
    """Recursively complete the code of every node and record leaf codes.

    Each node starts with its own branch bit in ``tree.code``; the parent's
    bits are appended after it, so the list is stored leaf-bit-first and is
    reversed when the final string is written to the global ``word2code``.

    Args:
        tree: node to process; a node whose ``symbol`` is not None is
            treated as a leaf.
        code: the parent's (already completed) bit list, or None for the
            root, which has no parent bits to append.

    A root that is itself a leaf (input with a single distinct symbol) has
    no bits at all; it is given the one-bit code "0" instead of crashing.
    """
    global word2code
    if code is not None:
        tree.code.extend(code)
    if tree.symbol is not None:
        # An empty/None code only happens for a lone root leaf.
        bits = tree.code if tree.code else [0]
        word2code[tree.symbol] = "".join(str(bit) for bit in reversed(bits))
        return
    updateCode(tree.left, tree.code)
    updateCode(tree.right, tree.code)
    

def huffman(data):
    """Build the Huffman tree for ``data`` and fill the global ``word2code``.

    Args:
        data: iterable of hashable symbols.

    Returns:
        The root node of the tree, or None when ``data`` has no symbols.

    ``word2code`` is emptied first so that codes from an earlier call
    cannot leak into this result.
    """
    word2code.clear()
    symbol_freq = cal_symbol_freq(data)
    if len(symbol_freq) == 0:
        return None                 # nothing to encode

    while len(symbol_freq) > 1:
        # Merge the two least frequent nodes under a new parent ...
        s1, s2 = symbol_freq.update()
        root = Node(None, freq=s1.freq + s2.freq)
        root.left = s1
        root.right = s2
        root.update()
        # ... and put the parent back so it competes like any other node.
        symbol_freq.insert(root)
    tree = symbol_freq[0]
    if tree.left is None and tree.right is None:
        # Only one distinct symbol: no merge happened, so give it one bit.
        tree.code = [0]
    # The root has no parent bits, hence None.
    updateCode(tree, None)
    return tree


if __name__ == "__main__":

    # Disabled image example: encode the first channel of a picture.
    # image = Image.open("test1.png")
    # image = np.array(image)
    # single_image = np.ravel(image[:, :, 0])
    # code = huffman(single_image)
    # print(word2code)

    # Text example: (symbol, occurrence count) pairs, expanded in this order
    # into one flat list of symbols.
    occurrences = (
        ('a', 31), ('b', 16), ('d', 8), ('g', 4),
        ('c', 10), ('e', 11), ('f', 20),
    )
    sample = [symbol for symbol, count in occurrences for _ in range(count)]
    huffman(sample)
    print(word2code)

验证

为方便验证正确性,参考了“数据结构-哈夫曼编码例题”所给出的例子:一共有7个字符 'a'–'g',出现频次如下

| 字符 | 出现频次 |
| --- | --- |
| a | 31 |
| b | 16 |
| c | 10 |
| d | 8 |
| e | 11 |
| f | 20 |
| g | 4 |

程序输出的结果为

{'f': '00', 'c': '010', 'e': '011', 'g': '1000', 'd': '1001', 'b': '101', 'a': '11'}

正确结果为(图片见数据结构-哈夫曼编码例题):
数据结构-哈夫曼编码例题中的结果

  • 1
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
霍夫曼编码Python实现可以分为以下几个步骤: 1. 统计字符出现的频率,并将其存储在一个字典中。 2. 根据字符频率构建霍夫曼树。可以使用优先队列(heapq模块)来实现。 3. 遍历霍夫曼树,生成每个字符的编码。可以使用递归来实现。 4. 将编码后的数据写入文件。 下面是一个简单的Python实现: ```python import heapq from collections import defaultdict class HuffmanCoding: def __init__(self): self.codes = {} self.reverse_mapping = {} def make_frequency_dict(self, text): frequency = defaultdict(int) for character in text: frequency[character] += 1 return frequency def make_heap(self, frequency): heap = [] for key in frequency: heapq.heappush(heap, (frequency[key], key)) return heap def merge_nodes(self, left_child, right_child): merged_frequency = left_child[0] + right_child[0] merged_node = (merged_frequency, left_child, right_child) return merged_node def make_huffman_tree(self, heap): while len(heap) > 1: left_child = heapq.heappop(heap) right_child = heapq.heappop(heap) merged_node = self.merge_nodes(left_child, right_child) heapq.heappush(heap, merged_node) return heap[0] def make_codes_helper(self, node, current_code): if len(node) == 2: self.codes[node[1]] = current_code self.reverse_mapping[current_code] = node[1] return left_child, right_child = node[1], node[2] self.make_codes_helper(left_child, current_code + "0") self.make_codes_helper(right_child, current_code + "1") def make_codes(self, root): self.make_codes_helper(root, "") def get_encoded_text(self, text): encoded_text = "" for character in text: encoded_text += self.codes[character] return encoded_text def pad_encoded_text(self, encoded_text): extra_padding = 8 - len(encoded_text) % 8 for i in range(extra_padding): encoded_text += "0" padded_info = "{0:08b}".format(extra_padding) padded_encoded_text = padded_info + encoded_text return padded_encoded_text def get_byte_array(self, padded_encoded_text): if len(padded_encoded_text) % 8 != 0: print("Encoded text not padded properly") exit(0) b = bytearray() for i in range(0, len(padded_encoded_text), 8): byte = padded_encoded_text[i:i+8] b.append(int(byte, 2)) return b def compress(self, 
text): frequency = self.make_frequency_dict(text) heap = self.make_heap(frequency) root = self.make_huffman_tree(heap) self.make_codes(root) encoded_text = self.get_encoded_text(text) padded_encoded_text = self.pad_encoded_text(encoded_text) byte_array = self.get_byte_array(padded_encoded_text) return byte_array def remove_padding(self, padded_encoded_text): padded_info = padded_encoded_text[:8] extra_padding = int(padded_info, 2) padded_encoded_text = padded_encoded_text[8:] encoded_text = padded_encoded_text[:-1*extra_padding] return encoded_text def decode_text(self, encoded_text): current_code = "" decoded_text = "" for bit in encoded_text: current_code += bit if current_code in self.reverse_mapping: character = self.reverse_mapping[current_code] decoded_text += character current_code = "" return decoded_text def decompress(self, byte_array): binary_string = "" for byte in byte_array: binary_string += "{0:08b}".format(byte) encoded_text = self.remove_padding(binary_string) decompressed_text = self.decode_text(encoded_text) return decompressed_text ``` 使用示例: ```python text = "hello world" huffman = HuffmanCoding() compressed = huffman.compress(text) decompressed = huffman.decompress(compressed) print("Original text:", text) print("Compressed text:", compressed) print("Decompressed text:", decompressed) ``` 输出: ``` Original text: hello world Compressed text: bytearray(b'x\x9c\xcbH\xcd\xc9\xc9\x07\x00 \x02\x8d') Decompressed text: hello world ```

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值