【数据结构与算法基础】哈夫曼树与哈夫曼编码(C++)-CSDN博客

1952年，哈夫曼将这个发明整理成了学期报告《一种构建极小多余编码的方法》(A Method for the Construction of Minimum-Redundancy Codes)一文，顺利的完成了该科目的学习~~(这要是不过那可真说不过去了)~~ 。现在这种编码方案一般就叫做哈夫曼(Huffman)编码。

哈夫曼的编码方案

哈夫曼遇到的问题，简单来说，需要解决这么几个关键问题：

编码不能存在歧义，避免编码的多义性，即不能有某个字符的编码是另一个字符编码的前缀
编码应该尽可能地短，这要求采用不等长的编码方案，并为出现频率高的字符赋予更短的编码
编码算法产生的编码方案应该唯一，避免编码与解码的不对应
编码算法效率应尽可能的高

哈夫曼定义了一种二叉树，他的构建规则如下

对于所有字符，统计其出现的频率。
定义二叉树的结点，其中叶子结点的值为各个字符，权值为频率
定义结点间的比较规则
- 频率为第一关键字小频率优先；
- 值为第二关键字，字符大于树内结点
- 最早出现位置为第三关键字，更早出现的结点优先
每次挑选两个权值最小的结点，创建新的结点作为两节点父亲，父节点的权值为两子节点的和，且较小结点为左儿子，较大者为右儿子
重复上述过程，直至集合中剩余一个结点，该节点即为该二叉树的根

依照这种规则建立起来的二叉树，我们称之为哈夫曼树

如下就是一次构建哈夫曼树的过程

哈夫曼的编码方案就是基于这样一棵二叉树进行的。我们规定，所有编码从根节点开始，每次向左走就在编码尾部追加'0'，向右走就在编码尾部追加'1'。

在上图中，四个字母的编码方案如下：

当然这个方案中四个字符的编码长度相同且有规律，纯属巧合，一般情况下编码长度是不相同的。

哈夫曼树的实现

了解了哈夫曼树的构建过程，下面一步我们就需要来想办法实现构建哈夫曼树的过程

首先呢，先来定义一下哈夫曼树的结点类型，同时，我们要重载结点类的小于号用以比较：

// Node of a Huffman tree: a leaf carries a character, an internal node joins
// two subtrees.
class Node{
public:
	char c;		// character of the node; non-character nodes use 'z' + 1 = '{'
	int value;	// weight of the node (occurrence frequency)
	int idx;	// earliest time the node appeared; breaks ties between equal weights
	Node * left;	// left child
	Node * right;	// right child
	
	// Strict ordering used to pick the smallest nodes: lower weight first,
	// and for equal weights the node that appeared earlier (smaller idx).
	// Declared const so it can also be applied to const nodes.
	bool operator < (const Node & node) const{
		if(value == node.value){
			return idx < node.idx;
		}
		return value < node.value;
	}

};

为了演示哈夫曼树的构建过程，我们拟定一个需求:

给定一个序列仅包含小写字母，统计出每个字符的频率，创建出它的哈夫曼树。

传统的实现方案

传统与优化的解决方案的差别，主要在于如何寻找集合中最小的结点。传统的方案是使用遍历的方式寻找，优化的方案是使用二叉堆来优化这个过程。

话不多说，上代码：

Node* character[26];// leaf node of each lowercase letter; NULL until the letter first appears
Node * nodes[1000];// candidate set the two smallest nodes are picked from; merged slots become NULL
int tot = 0;// number of slots of nodes[] used so far
char str[10000];// input text
// Reads one word from standard input, counts its lowercase letters and builds
// the Huffman tree by repeatedly scanning nodes[] for the two smallest nodes
// and merging them under a new parent.
int main(){
	cin.width(sizeof(str));// operator>> then stores at most sizeof(str) - 1 characters, so str cannot overflow
	cin >> str;
	int len = strlen(str);
	
	for(int i = 0,c;i < len;i++){
		if(str[i] < 'a' || str[i] > 'z'){// any other character would index outside character[26]
			continue;
		}
		c = str[i] - 'a';
		if(character[c] == NULL){// first occurrence: create the leaf and remember where it appeared
			character[c] = new Node();
			character[c]->c = str[i];
			character[c]->idx = i;
			character[c]->value = 1;
			character[c]->left = NULL;
			character[c]->right = NULL;
			nodes[tot++] = character[c];// add the leaf to the candidate set
		}else{
			character[c]->value++;
		}
	}
	int cnt = tot;// number of live nodes
	
	while(cnt-- > 1){// every merge lowers the live-node count by one, so this runs cnt - 1 times
		int first = -1;// slot of the smallest live node
		int second = -1;// slot of the second smallest live node
		for(int j = 0;j < tot;j++){
			if(nodes[j] == NULL){// slot was merged away earlier
				continue;
			}
			if(first == -1 || *nodes[j] < *nodes[first]){// compare the nodes themselves, not the pointers
				second = first;
				first = j;
			}else if(second == -1 || *nodes[j] < *nodes[second]){
				second = j;
			}
		}
		// create the parent: smaller node on the left, larger on the right
		Node * node = new Node();
		node->c = 'z' + 1;
		node->value = nodes[first]->value + nodes[second]->value;
		node->left = nodes[first];
		node->right = nodes[second];
		node->idx = len++;// internal nodes get indices after every input position

		nodes[tot++] = node;// add the new node
		nodes[first] = NULL;// remove the two merged nodes
		nodes[second] = NULL;
	}

	// The node added last is the root; without any valid input there is no tree.
	Node * root = tot > 0 ? nodes[tot - 1] : NULL;
}

使用堆优化

在前面的章节，我们介绍了二叉堆，这是个能够快速存取最值的结构，在这里正是它大显身手的地方。

如果读者此时还没有掌握二叉堆，可以先回去看一看，或者暂时跳过这段内容，不影响后续的阅读

每次我们可以从堆中取出两个最小节点(堆顶取一次，弹一次，再取一次，再弹一次)。创建一个新节点作为父节点，并将它存入堆中。

在具体实现之前，我们有必要先将二叉堆写出来：

// Binary min-heap of Node pointers, ordered by Node::operator<.
// The elements occupy heap[1..tot]; heap[1] is the top.
Node * heap[2000];// heap storage
int tot = 0;// number of nodes currently in the heap

void down(int k){
	int son = k << 1;
	while(son <= tot){
		if(son + 1 <= tot && *heap[son + 1] < *heap[son]){
			son++;
		}
		if(*heap[k]  < *heap[son]){
			break;
		}

		Node * temp = heap[k];
		heap[k] = heap[son];
		heap[son] = temp;

		k = son;
		son <<= 1;
	}
}

void up(int k){
	int fa = k >> 1;
	while(fa != 0){
		if(*heap[fa] < *heap[k]){
			break;
		}
	
		Node * temp = heap[k];
		heap[k] = heap[fa];
		heap[fa] = temp;

		k = fa;
		fa >>= 1;
	}
}

// Inserts node into the heap: place it in the next free slot, then sift it up.
void add(Node * node){
	tot += 1;
	heap[tot] = node;
	up(tot);
}

// Returns the smallest node (the heap top) without removing it,
// or NULL when the heap is empty.
Node * peak(){
	if(tot < 1){// heap[1] can still hold a stale pointer after the last pop()
		return NULL;
	}
	return heap[1];
}

// Removes the heap top: the last element takes its place and is sifted down.
// Does nothing when the heap is empty.
void pop(){
	if(tot < 1){// without this guard heap[0] would be copied and tot would become negative
		return;
	}
	heap[1] = heap[tot--];
	down(1);
}

堆实现了之后，我们使用堆再来进行构建的过程

Node* character[26];// leaf node of each lowercase letter; NULL until the letter first appears
char str[10000];// input text
// Reads one word from standard input, counts its lowercase letters and builds
// the Huffman tree using the heap to fetch the two smallest nodes.
int main(){
	cin.width(sizeof(str));// operator>> then stores at most sizeof(str) - 1 characters, so str cannot overflow
	cin >> str;
	int len = strlen(str);
	
	for(int i = 0,c;i < len;i++){
		if(str[i] < 'a' || str[i] > 'z'){// any other character would index outside character[26]
			continue;
		}
		c = str[i] - 'a';
		if(character[c] == NULL){// first occurrence: create the leaf and remember where it appeared
			character[c] = new Node();
			character[c]->c = str[i];
			character[c]->idx = i;
			character[c]->value = 1;
			character[c]->left = NULL;
			character[c]->right = NULL;
			add(character[c]);
		}else{
			character[c]->value++;
		}
	}
	
	while(tot > 1){// each round removes two nodes and adds one, so the heap shrinks by one
		Node * first = peak();// smallest node
		pop();
		Node * second = peak();// second smallest node
		pop();
		// create the parent: smaller node on the left, larger on the right
		Node * node = new Node();
		node->c = 'z' + 1;
		node->value = first->value + second->value;
		node->left = first;
		node->right = second;
		node->idx = len++;// internal nodes get indices after every input position

		add(node);
	}

	Node * root = peak();// root of the Huffman tree
}

编码与译码

获取编码

编码方案在上文描述算法的时候我们有提到过。要获取所有的字符编码，我们需要遍历整棵哈夫曼树，对于所有存放字符的叶子结点，应该获取并记录其编码。

那么首先，我们应该有一个存放编码的地方:

char * code[26];// code string of each lowercase letter, indexed by letter - 'a'; NULL for letters without a code

接着，来遍历这棵树。当然，遍历整棵树获取编码的方式有很多，这里博主仅提供一种可供参考的方案：

// Walks the subtree rooted at node depth-first and stores the code of every
// leaf in code[].
//   s    - scratch buffer holding the path from the root as '0'/'1' characters;
//          it must have room for the longest path plus the terminating '\0'
//   dep  - depth of node, i.e. how many characters of s are already in use
//   node - current node; NULL (an empty tree) is ignored
// Going left appends '0', going right appends '1'. Every stored code is a
// strdup() copy of the path.
void encode(char * s,int dep,Node * node){
	if(node == NULL){// empty tree: nothing to encode
		return;
	}
	if(node->left == NULL && node->right == NULL){// leaf: the path walked so far is its code
		if(dep == 0){// the whole tree is one leaf; give it the one-bit code "0" instead of an empty code
			s[dep++] = '0';
		}
		s[dep] = '\0';
		code[node->c - 'a'] = strdup(s);// keep a private copy of the current path
		return;
	}
	
	s[dep] = '0';
	encode(s,dep + 1,node->left);// left subtree: append '0'
	s[dep] = '1';
	encode(s,dep + 1,node->right);// right subtree: append '1'
}

完成编码之后，在主函数中调用一下：

Node* root;
	...// obtain the root node (tree construction omitted in this excerpt)
	char s[20];// scratch buffer in which encode() builds the current path
	encode(s,0,root);
	
	for(int i = 0;i < 26;i++){// print the code of every letter that received one
		if(code[i] != NULL){
			cout << (char)(i + 'a') << ":" << code[i] << endl;
		}
	}

进行译码

一般来说，译码工作是在哈夫曼树构建成功基础上，给定一个编码序列，将其翻译成为源码，如果无法翻译则给予提示。

例如，对于刚才展示的运行示例，给定一个合法的编码序列，就可以得到对应的译码序列；若给定的序列在译码后还剩下无法匹配的 10，则这个序列存在问题，因为没有字符的编码为 10。

译码的过程本质上是在哈夫曼树上跟着序列进行模拟

如果当前结点对应位为0，则下一位跳转到左结点
如果当前结点对应位为1，则下一位跳转到右结点
如果当前结点为叶子结点，则获得一位译码。当前结点指向根节点，重复上述过程
如果当前结点对应最后一位编码且非叶子结点，说明编码存在问题。

用代码实现一下：

// Releases a partially built result and returns a heap copy of "INVALID".
static char * decodeInvalid(char * partial){
	free(partial);
	return strdup("INVALID");
}

// Decodes a sequence of '0'/'1' characters by walking the Huffman tree.
//   root   - root of the Huffman tree
//   target - NUL-terminated code sequence
// Returns a heap-allocated string the caller releases with free(): the decoded
// text, or "INVALID" when the tree is empty, target contains a character other
// than '0'/'1', or target ends in the middle of a code. Returns NULL if the
// result buffer cannot be allocated.
char * decode(Node * root,char * target){
	if(root == NULL){// no tree, nothing can be decoded
		return decodeInvalid(NULL);
	}

	size_t targetCnt = strlen(target);// length of the code sequence
	// Every decoded character consumes at least one input character, so
	// targetCnt + 1 bytes always hold the result.
	char * result = (char *)malloc(targetCnt + 1);
	if(result == NULL){
		return NULL;
	}
	size_t resultCnt = 0;// number of decoded characters
	bool rootIsLeaf = root->left == NULL && root->right == NULL;
	
	Node * curr = root;// current position in the tree
	for(size_t i = 0;i < targetCnt;i++){
		if(target[i] != '0' && target[i] != '1'){// not a code character
			return decodeInvalid(result);
		}
		if(rootIsLeaf){
			// A one-leaf tree has no branches to follow; its only symbol is
			// decoded from the one-bit code "0".
			if(target[i] != '0'){
				return decodeInvalid(result);
			}
			result[resultCnt++] = root->c;
			continue;
		}
		curr = target[i] == '0' ? curr->left : curr->right;
		if(curr == NULL){// the tree has no node on this path
			return decodeInvalid(result);
		}
		if(curr->left == NULL && curr->right == NULL){// reached a leaf: one character decoded
			result[resultCnt++] = curr->c;
			curr = root;// restart from the root for the next code
		}
	}
	
	if(curr != root){// input ended in the middle of a code
		return decodeInvalid(result);
	}
	result[resultCnt] = '\0';
	return result;
}

在主函数中调用测试这个函数：

Node* root;
char str[100];
...
// Keep decoding code sequences until the input is exhausted.
while(true){
	cout << "请输入原码:";
	cin.width(sizeof(str));// operator>> then stores at most sizeof(str) - 1 characters, so str cannot overflow
	if(!(cin >> str)){// end of input or read error: stop instead of spinning forever on a failed stream
		break;
	}
	cout << "译码结果为:" << decode(root,str) << endl;
}

跑几组数据试一试：

ok没有问题~

完整的堆实现的代码这里再放一下：

#include<cstdlib>
#include<cstring>
#include<iostream>
using namespace std;

// Node of a Huffman tree: a leaf carries a character, an internal node joins
// two subtrees.
class Node{
public:
	char c;		// character of the node; non-character nodes use 'z' + 1 = '{'
	int value;	// weight of the node (occurrence frequency)
	int idx;	// earliest time the node appeared; breaks ties between equal weights
	Node * left;	// left child
	Node * right;	// right child
	
	// Strict ordering used to pick the smallest nodes: lower weight first,
	// and for equal weights the node that appeared earlier (smaller idx).
	// Declared const so it can also be applied to const nodes.
	bool operator < (const Node & node) const{
		if(value == node.value){
			return idx < node.idx;
		}
		return value < node.value;
	}

};

Node * character[26]; // leaf node of each lowercase letter; NULL until the letter first appears
Node * heap[2000];// binary min-heap of nodes ordered by Node::operator<; elements occupy heap[1..tot]
int tot = 0;// number of nodes currently in the heap

void down(int k){
	int son = k << 1;
	while(son <= tot){
		if(son + 1 <= tot && *heap[son + 1] < *heap[son]){
			son++;
		}
		if(*heap[k]  < *heap[son]){
			break;
		}

		Node * temp = heap[k];
		heap[k] = heap[son];
		heap[son] = temp;

		k = son;
		son <<= 1;
	}
}

void up(int k){
	int fa = k >> 1;
	while(fa != 0){
		if(*heap[fa] < *heap[k]){
			break;
		}
	
		Node * temp = heap[k];
		heap[k] = heap[fa];
		heap[fa] = temp;

		k = fa;
		fa >>= 1;
	}
}

// Inserts node into the heap: place it in the next free slot, then sift it up.
void add(Node * node){
	tot += 1;
	heap[tot] = node;
	up(tot);
}

// Returns the smallest node (the heap top) without removing it,
// or NULL when the heap is empty.
Node * peak(){
	if(tot < 1){// heap[1] can still hold a stale pointer after the last pop()
		return NULL;
	}
	return heap[1];
}

// Removes the heap top: the last element takes its place and is sifted down.
// Does nothing when the heap is empty.
void pop(){
	if(tot < 1){// without this guard heap[0] would be copied and tot would become negative
		return;
	}
	heap[1] = heap[tot--];
	down(1);
}

char * code[26];// code string of each lowercase letter, indexed by letter - 'a'; NULL for letters without a code

// Walks the subtree rooted at node depth-first and stores the code of every
// leaf in code[].
//   s    - scratch buffer holding the path from the root as '0'/'1' characters;
//          it must have room for the longest path plus the terminating '\0'
//   dep  - depth of node, i.e. how many characters of s are already in use
//   node - current node; NULL (an empty tree) is ignored
// Going left appends '0', going right appends '1'. Every stored code is a
// strdup() copy of the path.
void encode(char * s,int dep,Node * node){
	if(node == NULL){// empty tree: nothing to encode
		return;
	}
	if(node->left == NULL && node->right == NULL){// leaf: the path walked so far is its code
		if(dep == 0){// the whole tree is one leaf; give it the one-bit code "0" instead of an empty code
			s[dep++] = '0';
		}
		s[dep] = '\0';
		code[node->c - 'a'] = strdup(s);// keep a private copy of the current path
		return;
	}
	
	s[dep] = '0';
	encode(s,dep + 1,node->left);// left subtree: append '0'
	s[dep] = '1';
	encode(s,dep + 1,node->right);// right subtree: append '1'
}

// Releases a partially built result and returns a heap copy of "INVALID".
static char * decodeInvalid(char * partial){
	free(partial);
	return strdup("INVALID");
}

// Decodes a sequence of '0'/'1' characters by walking the Huffman tree.
//   root   - root of the Huffman tree
//   target - NUL-terminated code sequence
// Returns a heap-allocated string the caller releases with free(): the decoded
// text, or "INVALID" when the tree is empty, target contains a character other
// than '0'/'1', or target ends in the middle of a code. Returns NULL if the
// result buffer cannot be allocated.
char * decode(Node * root,char * target){
	if(root == NULL){// no tree, nothing can be decoded
		return decodeInvalid(NULL);
	}

	size_t targetCnt = strlen(target);// length of the code sequence
	// Every decoded character consumes at least one input character, so
	// targetCnt + 1 bytes always hold the result.
	char * result = (char *)malloc(targetCnt + 1);
	if(result == NULL){
		return NULL;
	}
	size_t resultCnt = 0;// number of decoded characters
	bool rootIsLeaf = root->left == NULL && root->right == NULL;
	
	Node * curr = root;// current position in the tree
	for(size_t i = 0;i < targetCnt;i++){
		if(target[i] != '0' && target[i] != '1'){// not a code character
			return decodeInvalid(result);
		}
		if(rootIsLeaf){
			// A one-leaf tree has no branches to follow; its only symbol is
			// decoded from the one-bit code "0".
			if(target[i] != '0'){
				return decodeInvalid(result);
			}
			result[resultCnt++] = root->c;
			continue;
		}
		curr = target[i] == '0' ? curr->left : curr->right;
		if(curr == NULL){// the tree has no node on this path
			return decodeInvalid(result);
		}
		if(curr->left == NULL && curr->right == NULL){// reached a leaf: one character decoded
			result[resultCnt++] = curr->c;
			curr = root;// restart from the root for the next code
		}
	}
	
	if(curr != root){// input ended in the middle of a code
		return decodeInvalid(result);
	}
	result[resultCnt] = '\0';
	return result;
}

char str[10000];// input buffer, used for the text and then for every code sequence to decode
// Reads one word, builds its Huffman tree with the heap, prints the code of
// every letter that occurs, then decodes code sequences until input ends.
int main(){
	cin.width(sizeof(str));// operator>> then stores at most sizeof(str) - 1 characters, so str cannot overflow
	cin >> str;
	int len = strlen(str);
	
	for(int i = 0,c;i < len;i++){
		if(str[i] < 'a' || str[i] > 'z'){// any other character would index outside character[26]
			continue;
		}
		c = str[i] - 'a';
		if(character[c] == NULL){// first occurrence: create the leaf and remember where it appeared
			character[c] = new Node();
			character[c]->c = str[i];
			character[c]->idx = i;
			character[c]->value = 1;
			character[c]->left = NULL;
			character[c]->right = NULL;
			add(character[c]);
		}else{
			character[c]->value++;
		}
	}
	
	while(tot > 1){// each round removes two nodes and adds one, so the heap shrinks by one
		Node * first = peak();// smallest node
		pop();
		Node * second = peak();// second smallest node
		pop();
		// create the parent: smaller node on the left, larger on the right
		Node * node = new Node();
		node->c = 'z' + 1;
		node->value = first->value + second->value;
		node->left = first;
		node->right = second;
		node->idx = len++;// internal nodes get indices after every input position

		add(node);
	}

	Node * root = peak();// root of the Huffman tree
	if(root == NULL){// no lowercase letter was read, so there is no tree to encode or decode with
		return 0;
	}
	char s[26];// at most 26 leaves, so no path is longer than 25 characters plus '\0'
	encode(s,0,root);
	
	for(int i = 0;i < 26;i++){// print the code of every letter that received one
		if(code[i] != NULL){
			cout << (char)(i + 'a') << ":" << code[i] << endl;
		}
	}
	
	while(true){
		cout << "请输入原码:";
		cin.width(sizeof(str));// bound every read the same way as the first one
		if(!(cin >> str)){// end of input or read error: stop instead of spinning forever on a failed stream
			break;
		}
		cout << "译码结果为:" << decode(root,str) << endl;
	}
	return 0;
}