算法思想:
哈夫曼编码算法基于贪心策略,按从叶子节点到根节点的方向(自底向上)构造编码树。
一.为每个符号建立一个叶子节点,并加上其相应的发生频率。
二.当有一个以上的节点存在时,进行下列循环:
- 把这些节点作为带权值的二叉树的根节点,左右子树为空
- 选择两棵根结点权值最小的树作为左右子树构造一棵新的二叉树,且置新的二叉树的根结点的权值为其左右子树上根结点的权值之和。
- 把权值最小的两个根节点移除
- 将新的二叉树加入队列中.
三.最后剩下的节点即为根节点,此时二叉树已经完成。
示例:
符号 | A | B | C | D | E |
---|---|---|---|---|---|
计数 | 15 | 7 | 6 | 6 | 5 |
其中huffman树构造过程如下图所示:
源码:
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <list>
#include <new>
#include <stdexcept>
#include <string>
#include <vector>
using namespace std;
// One node of the Huffman tree.
// Leaves hold a symbol of the input text; merged (internal) nodes are created
// with tag -1 and the summed frequency of their two children.
struct node {
    int tag;                // symbol value for a leaf, -1 for a merged node
    int frequency;          // occurrence count (leaf) or sum of the children
    struct node *lchild;    // left subtree, NULL for a leaf
    struct node *rchild;    // right subtree, NULL for a leaf
    struct node *parent;    // enclosing node, NULL while the node is a root
};

// Convenience aliases: the node type itself and a pointer to it.
typedef struct node Node;
typedef struct node *TreeNode;
void buildList(const string str, list <TreeNode> &listnode){
for (string::size_type is = 0; is != str.size(); ++is){
list <TreeNode>::iterator ip = listnode.begin();
for (; ip != listnode.end(); ++ip){
if (str[is] == (*ip)->tag){
(*ip)->frequency++;
break;
}
}//ifor
if (ip == listnode.end ()){
TreeNode pnew = (TreeNode) malloc(sizeof(Node));
pnew->tag = str[is];
pnew->frequency = 1;
pnew->lchild = NULL;
pnew->rchild = NULL;
pnew->parent = NULL;
listnode.push_back(pnew);
}//if
}//ofor
}
// Ordering predicate for tree nodes: p1 goes before p2 exactly when p1 has
// the strictly smaller frequency. Nodes with equal frequency compare as
// equivalent.
bool comp(const TreeNode &p1, const TreeNode &p2) {
    const int leftWeight = p1->frequency;
    const int rightWeight = p2->frequency;
    return rightWeight > leftWeight;
}
void buildTree(list <TreeNode> &listnode){
while (listnode.size() != 1){
listnode.sort(comp);
list <TreeNode>::iterator ip = listnode.begin();
TreeNode p1 = (*ip);
++ip;
TreeNode p2 = (*ip);
listnode.pop_front();
listnode.pop_front();
TreeNode pnew = (TreeNode)malloc(sizeof(Node));
pnew->tag = -1;
pnew->frequency = p1->frequency+p2->frequency;
pnew->lchild = p1;
pnew->rchild = p2;
pnew->parent = NULL;
p1->parent = pnew;
p2->parent = pnew;
listnode.push_back(pnew);
//----for debugging and checking------------------------------------------
// cout << "------------------------" << endl;
// list <TreeNode>::iterator ip1 = listnode.begin();
// for (; ip1 != listnode.end(); ++ip1){
// cout << (*ip1)->tag << " " << (*ip1)->frequency << endl;
// }//ifor
//----for debugging and checking------------------------------------------
}
}
// Searches the tree rooted at T for the leaf whose tag equals `tag`.
//
// T   - root of the (sub)tree to search; may be NULL.
// tag - symbol to look for.
// p   - set to the matching leaf when one exists; left untouched otherwise,
//       so callers should initialise it (e.g. to NULL) to detect a miss.
//
// Only leaves (nodes without children) are compared. Merged nodes carry the
// tag -1, which would otherwise compare equal to a `char` holding -1 (byte
// 0xFF where char is signed) and be returned instead of the real leaf.
void locate(TreeNode T, char tag, TreeNode &p){
    if (T == NULL)
        return;

    const bool isLeaf = (T->lchild == NULL && T->rchild == NULL);
    if (isLeaf){
        if (T->tag == tag)
            p = T;
        return;
    }

    locate(T->lchild, tag, p);
    locate(T->rchild, tag, p);
}
// Reverses `str` in place.
//
// Delegates to std::reverse instead of a hand-written swap loop; the old loop
// stored str.size() in an int, which narrows the size for very long strings.
void reverse(std::string &str){
    std::reverse(str.begin(), str.end());
}
string huffman(list <TreeNode> listnode, string str){
string str_huffman("");
list <TreeNode>::iterator ip = listnode.begin();
for (string::size_type is = 0; is != str.size(); ++is){
TreeNode p;
string str1("");
locate(*ip, str[is], p);
for (; p->parent != NULL; p = p->parent){
if (p == p->parent->lchild)
str1 += "0";
else if (p == p->parent->rchild)
str1 += "1";
}
reverse(str1);
str_huffman += str1;
}
return str_huffman;
}
void main(){
string str; //= "abcddbacca";
cout << "input the plaintext:" << endl;
getline(cin,str);
//cout << str;
list <TreeNode> listnode;
buildList(str, listnode);
/*
listnode.sort(comp);
list <TreeNode>::iterator ip = listnode.begin();
for (; ip != listnode.end(); ++ip){
cout << (*ip)->tag << " " << (*ip)->frequency << endl;
}//ifor
*/
buildTree(listnode);
/*
list <TreeNode>::iterator ip1 = listnode.begin();
for (; ip1 != listnode.end(); ++ip1){
cout << (*ip1)->tag << " " << (*ip1)->frequency << endl;
}//ifor
*/
cout << endl;
cout << "huffman code:"<<huffman(listnode, str)<<endl;
//cout << "huffman code:" << "0011111010110001001010101100111110001001";
system("pause");
}
于是,对于我们的原始字符串 beep boop beer!
其对应的二进制为 : 0110 0010 0110 0101 0110 0101 0111 0000 0010 0000 0110 0010 0110 1111 0110 1111 0111 0000 0010 0000 0110 0010 0110 0101 0110 0101 0111 0010 0010 0001
我们的Huffman编码为: 0001 0110 0101 0011 0110 1001 0100 0101 1110 1111
从上面的例子中,我们可以看到被压缩的比例还是很可观的