1952年,David A. Huffman 提出了一种不同的算法,该算法可以针对任意给定的符号概率分布构造出一棵最优的编码树。香农-范诺编码(Shannon-Fano)是从树的根节点到叶子节点(自顶向下)进行编码的,哈夫曼编码算法则是从相反的方向,即从叶子节点到根节点(自底向上)进行编码的。
- 为每个符号建立一个叶子节点,并加上其相应的发生频率
- 当有一个以上的节点存在时,进行下列循环:
- 把这些节点作为带权值的二叉树的根节点,左右子树为空
- 选择两棵根结点权值最小的树作为左右子树构造一棵新的二叉树,并将新二叉树根结点的权值置为其左右子树根结点的权值之和。
- 把权值最小的两个根节点移除
- 将新的二叉树加入队列中.
- 最后剩下的节点即为根节点,此时二叉树已经完成。
示例
-
| 符号 | A | B | C | D | E |
| --- | --- | --- | --- | --- | --- |
| 计数 | 15 | 7 | 6 | 6 | 5 |
| 概率 | 0.38461538 | 0.17948718 | 0.15384615 | 0.15384615 | 0.12820513 |
在这种情况下,D 和 E 的频率最低,分别分配 0 和 1,并合并为一个概率为 0.28205128 的节点。此时频率最低的一对是 B 和 C,同样分别分配 0 和 1,并合并为一个概率为 0.33333333 的节点。接下来概率最低的两个节点是 BC 和 DE,于是在它们各自的代码前面分别加上 0 和 1,并将二者合并。最后只剩下 A 和 BCDE,分别加上前缀 0 和 1 后合并。这样就只剩下一个节点(根节点),算法结束。
可得 A 的代码长度是 1 比特,其余字符的代码长度都是 3 比特。
-
| 字符 | A | B | C | D | E |
| --- | --- | --- | --- | --- | --- |
| 代码 | 0 | 100 | 101 | 110 | 111 |
Pseudo-code
1: begin
2: count frequencies of single characters (source units)
3: output(frequencies using Fibonacci Codes of degree 2)
4: sort them to non-decreasing sequence
5: create a leaf node (character, frequency c, left son = NULL, right son = NULL)
6: of the tree for each character and put nodes into queue F
7: while (|F|>=2) do
8: begin
9: pop the first two nodes (u1, u2) with the lowest
10: frequencies from sorted queue
11: create a node evaluated with sum of the chosen units,
12: successors are chosen units (eps, c(u1)+c(u2), u1, u2)
13: insert new node into queue
14: end
15: node evaluate with way from root to leaf node (left son 1, right son 0)
16: create output from coded input characters
17: end
- /************************************************************************/
- /* File Name: Huffman.cpp
- * @Function: Lossless Compression
- @Author: Sophia Zhang
- @Create Time: 2012-9-26 10:40
- @Last Modify: 2012-9-26 11:10
- */
- /************************************************************************/
- #include"iostream"
- #include "queue"
- #include "map"
- #include "string"
- #include "iterator"
- #include "vector"
- #include "algorithm"
- using namespace std;
// Number of bits used to index one input symbol (one 8-bit char).
#define NChar 8
// Size of the symbol alphabet: 2^NChar = 256 possible char values.
// The expansion is parenthesised so that it stays a single operand inside
// larger expressions (e.g. `Nsymbols - 1` must be 255, not 1 << 7).
#define Nsymbols (1 << NChar)
// Code word of one symbol: a variable-length bit string, one bool per bit.
typedef std::vector<bool> Huff_code;
// Huffman coding dictionary: maps a symbol to its code word.
std::map<char, Huff_code> Huff_Dic;
/**
 * Node of a Huffman tree.
 *
 * A node owns its two children: destroying a node destroys the whole subtree
 * below it. Because of that ownership the class is not copyable; a copy would
 * delete the same children a second time.
 */
class HTree
{
public:
    HTree* left;   // owned left subtree, NULL when absent
    HTree* right;  // owned right subtree, NULL when absent
    char ch;       // symbol carried by this node
    int weight;    // frequency attached to this node

    /** Creates an empty leaf: no children, weight 0, symbol '\0'. */
    HTree() : left(NULL), right(NULL), ch('\0'), weight(0) {}

    /** Creates a node that takes ownership of `l` and `r`. */
    HTree(HTree* l, HTree* r, int w, char c) : left(l), right(r), ch(c), weight(w) {}

    /** Releases both subtrees (deleting NULL is a no-op). */
    ~HTree() { delete left; delete right; }

    /**
     * @return the stored weight when it is non-zero or the node is a leaf;
     *         otherwise the sum of the weights of the existing children.
     */
    int Getweight() const
    {
        if (weight != 0 || Isleaf())
            return weight;
        // Guard each child separately so a half-built node cannot crash.
        return (left ? left->weight : 0) + (right ? right->weight : 0);
    }

    /** @return true when the node has no children. */
    bool Isleaf() const { return !left && !right; }

    /**
     * Inverted weight ordering: a node compares "less" when it is heavier,
     * so a max-heap of HTree objects yields the lightest node first.
     * The argument is taken by reference; taking it by value would create a
     * temporary whose destructor deletes the original node's children.
     */
    bool operator<(const HTree& tr) const
    {
        return tr.weight < weight;
    }

private:
    // Copying is disabled (declared, never defined) because nodes own their children.
    HTree(const HTree&);
    HTree& operator=(const HTree&);
};
- HTree* BuildTree(int *frequency)
- {
- priority_queue<HTree*> QTree;
- //1st level add characters
- for (int i=0;i<Nsymbols;i++)
- {
- if(frequency[i])
- QTree.push(new HTree(NULL,NULL,frequency[i],(char)i));
- }
- //build
- while (QTree.size()>1)
- {
- HTree* lc = QTree.top();
- QTree.pop();
- HTree* rc = QTree.top();
- QTree.pop();
- HTree* parent = new HTree(lc,rc,parent->Getweight(),(char)256);
- QTree.push(parent);
- }
- //return tree root
- return QTree.top();
- }
- void Huffman_Coding(HTree* root, Huff_code& curcode)
- {
- if(root->Isleaf())
- {
- Huff_Dic[root->ch] = curcode;
- return;
- }
- Huff_code& lcode = curcode;
- Huff_code& rcode = curcode;
- lcode.push_back(false);
- rcode.push_back(true);
- Huffman_Coding(root->left,lcode);
- Huffman_Coding(root->right,rcode);
- }
- int main()
- {
- int freq[Nsymbols] = {0};
- char *str = "this is the string need to be compressed";
- //statistic character frequency
- while (*str!='\0')
- freq[*str++]++;
- //build tree
- HTree* r = BuildTree(freq);
- Huff_code nullcode;
- nullcode.clear();
- Huffman_Coding(r,nullcode);
- for(map<char,Huff_code>::iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); it++)
- {
- cout<<(*it).first<<'\t';
- Huff_code vec_code = (*it).second;
- for (vector<bool>::iterator vit = vec_code.begin(); vit!=vec_code.end();vit++)
- {
- cout<<(*vit)<<endl;
- }
- }
- }
那我们将friend bool operator >(Node node1,Node node2)修改为friend bool operator >(Node* node1,Node* node2),也就是传递的是Node的指针行不行呢?
答案是不可以。《C++ Primer》在重载操作符一章中讲到:“程序员只能为类类型或枚举类型的操作数定义重载操作符;在把操作符声明为类的成员时,至少有一个类或枚举类型的参数按照值或者引用的方式传递”。也就是说,像 friend bool operator >(Node* node1,Node* node2) 这样形参全部是指针类型的重载是不允许的。我们只能再建一个类,用其中重载的 () 操作符作为优先队列的比较函数。
就得到了下面正确的代码:
- /************************************************************************/
- /* File Name: Huffman.cpp
- * @Function: Lossless Compression
- @Author: Sophia Zhang
- @Create Time: 2012-9-26 10:40
- @Last Modify: 2012-9-26 12:10
- */
- /************************************************************************/
- #include"iostream"
- #include "queue"
- #include "map"
- #include "string"
- #include "iterator"
- #include "vector"
- #include "algorithm"
- using namespace std;
// Number of bits used to index one input symbol (one 8-bit char).
#define NChar 8
// Size of the symbol alphabet: 2^NChar = 256 possible char values.
// Parenthesised so the macro behaves as one operand in any expression
// (without the parentheses `Nsymbols - 1` would expand to 1 << 7).
#define Nsymbols (1 << NChar)
// Code word of one symbol: a variable-length bit string, one bool per bit.
typedef std::vector<bool> Huff_code;
// Huffman coding dictionary: maps a symbol to its code word.
std::map<char, Huff_code> Huff_Dic;
/************************************************************************/
/* Huffman tree node:
 *   - two child subtrees (owned by the node, NULL when absent)
 *   - character and frequency of the current node
 * Destroying a node destroys its whole subtree, so nodes must not be
 * copied: a copy would delete the same children twice.
 */
/************************************************************************/
class HTree
{
public:
    HTree* left;   // owned left subtree
    HTree* right;  // owned right subtree
    char ch;       // symbol carried by this node
    int weight;    // frequency attached to this node

    /** Creates an empty leaf: no children, weight 0, symbol '\0'. */
    HTree() : left(NULL), right(NULL), ch('\0'), weight(0) {}

    /** Creates a node that takes ownership of `l` and `r`. */
    HTree(HTree* l, HTree* r, int w, char c) : left(l), right(r), ch(c), weight(w) {}

    /** Releases both subtrees (deleting NULL is a no-op). */
    ~HTree() { delete left; delete right; }

    /** @return true when the node has no children. */
    bool Isleaf() const { return !left && !right; }

private:
    // Copying is disabled (declared, never defined) because nodes own their children.
    HTree(const HTree&);
    HTree& operator=(const HTree&);
};
- /************************************************************************/
- /* prepare for pointer sorting*/
- /*because we cannot use overloading in class HTree directly*/
- /************************************************************************/
- class Compare_tree
- {
- public:
- bool operator () (HTree* t1, HTree* t2)
- {
- return t1->weight> t2->weight;
- }
- };
- /************************************************************************/
- /* use priority queue to build huffman tree*/
- /************************************************************************/
- HTree* BuildTree(int *frequency)
- {
- priority_queue<HTree*,vector<HTree*>,Compare_tree> QTree;
- //1st level add characters
- for (int i=0;i<Nsymbols;i++)
- {
- if(frequency[i])
- QTree.push(new HTree(NULL,NULL,frequency[i],(char)i));
- }
- //build
- while (QTree.size()>1)
- {
- HTree* lc = QTree.top();
- QTree.pop();
- HTree* rc = QTree.top();
- QTree.pop();
- HTree* parent = new HTree(lc,rc,lc->weight+rc->weight,(char)256);
- QTree.push(parent);
- }
- //return tree root
- return QTree.top();
- }
- /************************************************************************/
- /* Give Huffman Coding to the Huffman Tree*/
- /************************************************************************/
- void Huffman_Coding(HTree* root, Huff_code& curcode)
- {
- if(root->Isleaf())
- {
- Huff_Dic[root->ch] = curcode;
- return;
- }
- Huff_code lcode = curcode;
- Huff_code rcode = curcode;
- lcode.push_back(false);
- rcode.push_back(true);
- Huffman_Coding(root->left,lcode);
- Huffman_Coding(root->right,rcode);
- }
- int main()
- {
- int freq[Nsymbols] = {0};
- char *str = "this is the string need to be compressed";
- //statistic character frequency
- while (*str!='\0')
- freq[*str++]++;
- //build tree
- HTree* r = BuildTree(freq);
- Huff_code nullcode;
- nullcode.clear();
- Huffman_Coding(r,nullcode);
- for(map<char,Huff_code>::iterator it = Huff_Dic.begin(); it != Huff_Dic.end(); it++)
- {
- cout<<(*it).first<<'\t';
- std::copy(it->second.begin(),it->second.end(),std::ostream_iterator<bool>(cout));
- cout<<endl;
- }
- }