参考维基百科与《算法导论》
霍夫曼编码(Huffman Coding)是一种编码方式,是一种用于无损数据压缩的熵编码(权编码)算法,也称“哈夫曼编码”、“赫夫曼编码”。该算法由David A. Huffman于1952年在麻省理工学院攻读博士期间发明,并发表于《一种构建极小多余编码的方法》(A Method for the Construction of Minimum-Redundancy Codes)一文。
在计算机数据处理中,霍夫曼编码使用变长编码表对源符号(如文件中的一个字母)进行编码,其中变长编码表是通过一种评估来源符号出现机率的方法得到的,出现机率高的字母使用较短的编码,反之出现机率低的则使用较长的编码,这便使编码之后的字符串的平均长度、期望值降低,从而达到无损压缩数据的目的。
例如,在英文中,e的出现机率最高,而z的出现概率则最低。当利用霍夫曼编码对一篇英文进行压缩时,e极有可能用一个比特来表示,而z则可能花去25个比特(不是26)。用普通的表示方法时,每个英文字母均占用一个字节(byte),即8个比特。二者相比,e使用了一般编码的1/8的长度,z则使用了3倍多。倘若我们能实现对于英文中各个字母出现概率的较准确的估算,就可以大幅度提高无损压缩的比例。
霍夫曼树又称最优二叉树,是一种带权路径长度最短的二叉树。所谓树的带权路径长度,就是树中所有叶结点的权值乘上其到根结点的路径长度之和(若根结点为0层,叶结点到根结点的路径长度为叶结点的层数)。树的路径长度是从树根到每一结点的路径长度之和;设n个权值Wi(i=1,2,...,n)构成一棵有n个叶结点的二叉树,相应叶结点的路径长度为Li(i=1,2,...,n),则树的带权路径长度记为WPL=(W1*L1+W2*L2+W3*L3+...+Wn*Ln)。可以证明霍夫曼树的WPL是最小的。
霍夫曼编码可以有效的压缩数据;通常可以节省20%~90%的空间,具体压缩率依赖于数据的特征。变长编码(variable-length code)赋予高频字符短码字,赋予低频字符长码字,这样可以达到比定长编码好得多的压缩率。前缀码(prefix code)即没有任何码字是其他码字的前缀。
霍夫曼设计了一个贪心算法来构造最优前缀码,被称为霍夫曼编码(Huffman code)。
赫夫曼编码的实现
在实现中,我们假设C是一个含n个字符的集合,而其中每个字符c∈C都是一个对象,其属性c.freq给出了字符的出现频率。算法自底向上地构建出对应最优编码的二叉树T。它从|C|个叶子结点开始,执行|C|-1个“合并”操作创建出最终的二叉树。算法使用一个以属性freq为关键字的最小优先队列Q,以识别两个最低频率的对象并将其合并。当合并两个对象时,得到的新对象的频率设置为原来两个对象的频率之和。
代码实现1:
//=============================================================
// Huffman编码实现(2014/8/18)
// 使用二叉堆(数组存储)构建最小优先队列
//=============================================================
#include<stdio.h>
#include<stdlib.h>
// One input symbol together with its occurrence frequency.
// Field order matters: the sample table is brace-initialised as {symbol, frequency}.
typedef struct hmap {
char c; // the symbol
int freq; // how often the symbol occurs (the key the tree is built on)
} HMap;
typedef struct hnode { // Huffman tree node
char c; // symbol; only set for leaves copied from an HMap entry
int freq; // leaf: symbol frequency; internal node: sum of both children's freq
struct hnode *left; // left child, NULL for a leaf
struct hnode *right; // right child, NULL for a leaf
} HNode, *HTree;
// Traversing the heap through array indices is probably the simpler implementation.
// Index arithmetic for a binary heap stored in a 0-based array:
// the children of slot i live at 2*i+1 and 2*i+2, its parent at (i-1)/2.
// NOTE(review): in this listing only PARENT is referenced (by MinHeapInsert);
// Heapify spells out the child index by hand.
#define LEFT(i) (2 * (i) + 1)
#define RIGHT(i) (2 * (i) + 2)
#define PARENT(i) (((i) - 1) / 2)
//=============================================================
// Sift the element at index i down through a min-heap of n nodes.
//   arr : array of node pointers ordered by freq
//   n   : number of live entries in arr
//   i   : index at which to start sifting
// The walk always follows the smaller child down to the last
// parent, swapping whenever the parent is the larger of the two.
//=============================================================
void Heapify(HNode **arr, int n, int i)
{
    const int lastParent = n / 2 - 1; // highest index that still has a child
    int cur = i;
    while (cur <= lastParent) {
        int child = 2 * cur + 1;        // start with the left child
        const int sibling = child + 1;  // right child, if it exists
        if (sibling < n && arr[sibling]->freq < arr[child]->freq)
            child = sibling;
        if (arr[child]->freq < arr[cur]->freq) {
            HNode *held = arr[cur];
            arr[cur] = arr[child];
            arr[child] = held;
        }
        cur = child; // keep descending along the smaller child
    }
}
//=============================================================
// Turn arr[0..n-1] into a min-heap keyed on freq.
// Every parent slot is sifted down, from the last parent back
// to the root.
//=============================================================
void BuildHeap(HNode **arr, int n)
{
    int parent = n / 2; // one past the last slot that has a child
    while (parent-- > 0)
        Heapify(arr, n, parent);
}
//=============================================================
// Remove and return the node with the smallest freq.
//   arr : min-heap of node pointers
//   n   : number of live entries before the call
// Returns NULL when the heap is empty (or arr is NULL) instead of
// touching arr[-1]. On success the heap holds n - 1 entries and
// the vacated slot arr[n - 1] is set to NULL.
//=============================================================
HNode *ExtractMin(HNode **arr, int n)
{
    if (!arr || n <= 0)
        return NULL;
    HNode *min = arr[0];
    // Move the last entry to the root, shrink the heap, then restore order.
    arr[0] = arr[n - 1];
    arr[n - 1] = NULL;
    Heapify(arr, n - 1, 0);
    return min;
}
//=============================================================
// Insert x into a min-heap that currently holds n entries.
//   arr : heap storage; slot n must be available
//   n   : number of entries before the insertion
//   x   : node to insert
// A hole opened at slot n bubbles upward while the parent is
// strictly larger than x; x is then dropped into the hole.
//=============================================================
void MinHeapInsert(HNode **arr, int n, HNode *x)
{
    int hole = n;
    while (hole > 0) {
        const int up = (hole - 1) / 2; // parent slot of the hole
        if (arr[up]->freq <= x->freq)
            break;
        arr[hole] = arr[up]; // pull the larger parent down
        hole = up;
    }
    arr[hole] = x;
}
//=========================================================
// Huffman编码实现
//=========================================================
HTree Huffman(HMap *C, int n)
{
HNode **Q = NULL;
Q = (HNode **)malloc(sizeof(HNode *) * n);
if (!Q) {
printf("Q malloc error\n");
return NULL;
}
for (int i = 0; i < n; i++)
Q[i] = NULL;
// 初始化Q
for (int i = 0; i < n; i++) {
HNode *p = (HNode *)malloc(sizeof(HNode));
if (!p) {
printf("p malloc error\n");
return NULL;
}
p->c = C[i].c;
p->freq = C[i].freq;
p->left = p->right = NULL;
Q[i] = p;
}
// 建堆,形成最小优先队列
BuildHeap(Q, n);
// 建立huffman树
for (int i = 0; i < n - 1; i++) {
HNode *z = (HNode *)malloc(sizeof(HNode));
if (!z) {
printf("z malloc error\n");
return NULL;
}
HNode *x = ExtractMin(Q, n - i);
HNode *y = ExtractMin(Q, n - i - 1);
z->left = x;
z->right = y;
z->freq = x->freq + y->freq;
MinHeapInsert(Q, n - i - 2, z);
}
return ExtractMin(Q, 1);
}
// Sample alphabet: each entry is {symbol, frequency}. main() hands this table to Huffman().
HMap C[] = {{'a', 35}, {'b', 13}, {'c', 12},
{'d', 16}, {'e', 9}, {'f', 5}, {'g', 10}};
#include <iostream>
#include <vector>
#include <map>
using namespace std;
typedef vector<int> Huff_code; // code word of one symbol: one element (0 or 1) per bit, variable length
map<char, Huff_code> Huff_Dic; // Huffman dictionary: symbol -> code word, filled by Huffman_Coding()
//=============================================================
// Assign a code word to every leaf of a Huffman tree.
//   root    : (sub)tree to label; NULL is ignored
//   curcode : bits on the path from the tree's root to `root`
//             (pass an empty vector for the whole tree); it is
//             used as scratch space and restored before returning
// A left edge contributes 0 and a right edge 1. Results are stored
// in the global Huff_Dic, keyed by the leaf's symbol.
//=============================================================
void Huffman_Coding(HTree root, Huff_code& curcode)
{
    if (root == NULL)
        return;
    if (root->left == NULL && root->right == NULL) {
        if (curcode.empty()) {
            // A tree consisting of a single leaf would otherwise get a
            // zero-length code, which cannot be decoded; give it one bit.
            Huff_Dic[root->c] = Huff_code(1, 0);
        } else {
            Huff_Dic[root->c] = curcode;
        }
        return;
    }
    // Extend the path in place rather than copying two vectors per node.
    curcode.push_back(0);
    Huffman_Coding(root->left, curcode);
    curcode.back() = 1;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}
int main()
{
int n = sizeof(C) / sizeof(C[0]);
HTree root;
root = Huffman(C, n);
Huff_code nullcode;
nullcode.clear();
Huffman_Coding(root, nullcode);
// 打印Huffman编码
for (map<char,Huff_code>::iterator it = Huff_Dic.begin();
it != Huff_Dic.end(); ++it) {
cout << (*it).first << '\t';
for (vector<int>::iterator vit = (*it).second.begin();
vit != (*it).second.end(); ++vit) {
cout << *vit;
}
cout << endl;
}
system("pause");
return 0;
}
代码实现2:
//=============================================================
// Huffman编码实现(2014/8/18)
// 使用数组构建最小优先队列
//=============================================================
#include<stdio.h>
#include<stdlib.h>
// One input symbol together with its occurrence frequency.
// Field order matters: the sample table is brace-initialised as {symbol, frequency}.
typedef struct hmap {
char c; // the symbol
int freq; // how often the symbol occurs (the key the tree is built on)
} HMap;
typedef struct hnode { // Huffman tree node
char c; // symbol; only set for leaves copied from an HMap entry
int freq; // leaf: symbol frequency; internal node: sum of both children's freq
struct hnode *left; // left child, NULL for a leaf
struct hnode *right; // right child, NULL for a leaf
} HNode, *HTree;
// Array-backed priority queue. Both arrays are allocated with 2n-1 slots by
// CreateQueue(); slot i of `visited` describes slot i of `node`.
typedef struct qnode {
HNode **node; // every node ever put in the queue; NULL marks a free slot
bool *visited; // true once ExtractMin has handed the node out
} QNode, *QUEUE;
//=========================================================
// Create an empty min-priority queue for n symbols.
// The queue gets 2n-1 slots, enough for the n leaves plus the
// n-1 internal nodes produced while merging. All node slots
// start as NULL and all visited flags as false.
// Returns NULL when n <= 0 or an allocation fails; nothing is
// leaked in that case.
//=========================================================
QUEUE CreateQueue(int n)
{
    if (n <= 0) // 2n-1 would be negative and wrap to a huge allocation size
        return NULL;
    const int slots = 2 * n - 1;

    QUEUE Q = (QNode *)malloc(sizeof(QNode));
    if (!Q) {
        printf("Q malloc error\n");
        return NULL;
    }

    // Pointers to every node that will ever exist.
    Q->node = (HNode **)malloc(sizeof(HNode *) * slots);
    if (!Q->node) {
        free(Q);
        printf("Q->node malloc error\n");
        return NULL;
    }
    for (int i = 0; i < slots; i++) {
        Q->node[i] = NULL;
    }

    // One flag per slot recording whether the node was already extracted.
    Q->visited = (bool *)malloc(sizeof(bool) * slots);
    if (!Q->visited) {
        free(Q->node); // must be released before Q, which holds the pointer
        free(Q);
        printf("Q->visited malloc error\n");
        return NULL;
    }
    for (int i = 0; i < slots; i++) {
        Q->visited[i] = false;
    }
    return Q;
}
//=========================================================
// Return the not-yet-extracted node with the smallest freq
// and mark it as visited.
//   Q : queue created by CreateQueue(n)
//   n : symbol count the queue was created for (2n-1 slots)
// Ties are resolved in favour of the lowest slot index.
// Returns NULL when Q is NULL or no unvisited node is left.
//=========================================================
HNode *ExtractMin(QUEUE Q, int n)
{
    if (!Q)
        return NULL;

    int index = -1; // slot of the best candidate so far, -1 = none yet
    for (int i = 0; i < 2 * n - 1; i++) {
        HNode *cand = Q->node[i];
        if (!cand || Q->visited[i])
            continue;
        // Compare against the current best rather than a fixed sentinel,
        // so frequencies of any magnitude can be extracted.
        if (index == -1 || cand->freq < Q->node[index]->freq)
            index = i;
    }
    if (index == -1)
        return NULL;
    Q->visited[index] = true;
    return Q->node[index];
}
//=========================================================
// Store `node` in the first free (NULL) slot of the queue.
//   Q    : queue created by CreateQueue(n)
//   n    : symbol count the queue was created for (2n-1 slots)
//   node : node to store
// If no slot is free the queue is left unchanged.
//=========================================================
void QueueInsert(QUEUE Q, int n, HNode *node)
{
    const int capacity = 2 * n - 1;
    for (int slot = 0; slot < capacity; ++slot) {
        if (Q->node[slot] != NULL)
            continue; // occupied, keep looking
        Q->node[slot] = node;
        return;
    }
}
//=========================================================
// Huffman编码实现
//=========================================================
HTree Huffman(HMap *C, int n)
{
QUEUE Q = CreateQueue(n);
// 初始化Q
for (int i = 0; i < n; i++) {
HNode *p = (HNode *)malloc(sizeof(HNode));
if (!p) {
printf("p malloc error\n");
return NULL;
}
p->c = C[i].c;
p->freq = C[i].freq;
p->left = p->right = NULL;
Q->node[i] = p;
}
// 建立huffman树
for (int i = 0; i < n - 1; i++) {
HNode *z = (HNode *)malloc(sizeof(HNode));
if (!z) {
printf("z malloc error\n");
return NULL;
}
HNode *x = ExtractMin(Q, n);
HNode *y = ExtractMin(Q, n);
z->left = x;
z->right = y;
z->freq = x->freq + y->freq;
QueueInsert(Q, n, z);
}
return ExtractMin(Q, n);
}
// Sample alphabet: each entry is {symbol, frequency}. main() hands this table to Huffman().
HMap C[] = {{'a', 35}, {'b', 13}, {'c', 12},
{'d', 16}, {'e', 9}, {'f', 5}, {'g', 10}};
#include <iostream>
#include <vector>
#include <map>
using namespace std;
typedef vector<int> Huff_code; // code word of one symbol: one element (0 or 1) per bit, variable length
map<char, Huff_code> Huff_Dic; // Huffman dictionary: symbol -> code word, filled by Huffman_Coding()
//=============================================================
// Assign a code word to every leaf of a Huffman tree.
//   root    : (sub)tree to label; NULL is ignored
//   curcode : bits on the path from the tree's root to `root`
//             (pass an empty vector for the whole tree); it is
//             used as scratch space and restored before returning
// A left edge contributes 0 and a right edge 1. Results are stored
// in the global Huff_Dic, keyed by the leaf's symbol.
//=============================================================
void Huffman_Coding(HTree root, Huff_code& curcode)
{
    if (root == NULL)
        return;
    if (root->left == NULL && root->right == NULL) {
        if (curcode.empty()) {
            // A tree consisting of a single leaf would otherwise get a
            // zero-length code, which cannot be decoded; give it one bit.
            Huff_Dic[root->c] = Huff_code(1, 0);
        } else {
            Huff_Dic[root->c] = curcode;
        }
        return;
    }
    // Extend the path in place rather than copying two vectors per node.
    curcode.push_back(0);
    Huffman_Coding(root->left, curcode);
    curcode.back() = 1;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}
int main()
{
int n = sizeof(C) / sizeof(C[0]);
HTree root;
root = Huffman(C, n);
Huff_code nullcode;
nullcode.clear();
Huffman_Coding(root, nullcode);
for (map<char,Huff_code>::iterator it = Huff_Dic.begin();
it != Huff_Dic.end(); ++it) {
cout << (*it).first << '\t';
for (vector<int>::iterator vit = (*it).second.begin();
vit != (*it).second.end(); ++vit) {
cout << *vit;
}
cout << endl;
}
system("pause");
return 0;
}
代码实现3(C++)
引用自一位妹子的实现,链接:http://blog.csdn.net/abcjennifer/article/details/8020695
/************************************************************************/
/* File Name: Huffman.cpp
* @Function: Lossless Compression
@Author: Sophia Zhang
@Create Time: 2012-9-26 10:40
@Last Modify: 2012-9-26 12:10
*/
/************************************************************************/
#include"iostream"
#include "queue"
#include "map"
#include "string"
#include "iterator"
#include "vector"
#include "algorithm"
using namespace std;
#define NChar 8 // number of bits assumed to describe one symbol
// Size of the symbol alphabet: 2^NChar = 256 (covers a-z, A-Z and every other byte value).
// The parentheses keep the shift intact when the macro is used inside a larger expression.
#define Nsymbols (1 << NChar)
typedef vector<bool> Huff_code;// code word of one symbol: one bool per bit, variable length
map<char,Huff_code> Huff_Dic; // Huffman dictionary: symbol -> code word, filled by Huffman_Coding()
/************************************************************************/
/* Huffman tree node.
 *   left / right : child subtrees, both NULL for a leaf
 *   ch           : symbol stored in the node
 *   weight       : frequency of the symbol, or the sum of both
 *                  children's weights for an internal node
 * A node owns its children: destroying a node destroys the whole
 * subtree below it. Copying is therefore disabled, since two copies
 * would delete the same children twice.
 */
/************************************************************************/
class HTree
{
public :
    HTree* left;
    HTree* right;
    char ch;
    int weight;

    // Empty leaf: no children, weight 0, symbol '\0'.
    HTree() : left(NULL), right(NULL), ch('\0'), weight(0) {}

    // Node with children l and r (either may be NULL), weight w and symbol c.
    // Takes ownership of l and r.
    HTree(HTree* l, HTree* r, int w, char c) : left(l), right(r), ch(c), weight(w) {}

    // Recursively releases both subtrees (deleting NULL is a no-op).
    ~HTree() { delete left; delete right; }

    // True when the node has no children.
    bool Isleaf() const { return !left && !right; }

private:
    // Declared but not defined: copying an owning node is not allowed.
    HTree(const HTree&);
    HTree& operator=(const HTree&);
};
/************************************************************************/
/* prepare for pointer sorting*/
/*because we cannot use overloading in class HTree directly*/
/************************************************************************/
class Compare_tree
{
public:
bool operator () (HTree* t1, HTree* t2)
{
return t1->weight> t2->weight;
}
};
/************************************************************************/
/* Build a Huffman tree with a priority queue.
 *   frequency : array of Nsymbols counters indexed by symbol value
 * Symbols whose counter is zero are left out of the tree.
 * Returns the root of the tree, which the caller owns and must delete,
 * or NULL when frequency is NULL or every counter is zero.
 */
/************************************************************************/
HTree* BuildTree(int *frequency)
{
    if (frequency == NULL)
        return NULL;

    priority_queue<HTree*,vector<HTree*>,Compare_tree> QTree;

    // First level: one leaf per symbol that actually occurs.
    for (int i = 0; i < Nsymbols; i++)
    {
        if (frequency[i])
            QTree.push(new HTree(NULL, NULL, frequency[i], static_cast<char>(i)));
    }

    // top() on an empty queue is undefined behaviour, so stop here.
    if (QTree.empty())
        return NULL;

    // Repeatedly merge the two lightest subtrees under a new parent.
    while (QTree.size() > 1)
    {
        HTree* lc = QTree.top();
        QTree.pop();
        HTree* rc = QTree.top();
        QTree.pop();
        // Internal nodes carry no symbol; '\0' is a placeholder.
        HTree* parent = new HTree(lc, rc, lc->weight + rc->weight, '\0');
        QTree.push(parent);
    }

    // The single remaining entry is the root.
    return QTree.top();
}
/************************************************************************/
/* Assign a code word to every leaf of a Huffman tree.
 *   root    : (sub)tree to label; NULL is ignored
 *   curcode : bits on the path from the tree's root to `root`
 *             (pass an empty vector for the whole tree); it is used
 *             as scratch space and restored before returning
 * A left edge contributes false (0) and a right edge true (1).
 * Results are stored in the global Huff_Dic, keyed by the leaf's symbol.
 */
/************************************************************************/
void Huffman_Coding(HTree* root, Huff_code& curcode)
{
    if (root == NULL)
        return;
    if (root->Isleaf())
    {
        if (curcode.empty())
        {
            // A tree consisting of a single leaf would otherwise get a
            // zero-length code, which cannot be decoded; give it one bit.
            Huff_Dic[root->ch] = Huff_code(1, false);
        }
        else
        {
            Huff_Dic[root->ch] = curcode;
        }
        return;
    }
    // Extend the path in place rather than copying two vectors per node.
    curcode.push_back(false);
    Huffman_Coding(root->left, curcode);
    curcode.back() = true;
    Huffman_Coding(root->right, curcode);
    curcode.pop_back();
}
int main()
{
int freq[Nsymbols] = {0};
char *str = "this is the string need to be compressed";
//statistic character frequency
while (*str!='\0')
freq[*str++]++;
//build tree
HTree* r = BuildTree(freq);
Huff_code nullcode;
nullcode.clear();
Huffman_Coding(r,nullcode);
for(map<char,Huff_code>::iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); it++)
{
cout<<(*it).first<<'\t';
std::copy(it->second.begin(),it->second.end(),std::ostream_iterator<bool>(cout));
cout<<endl;
}
}