1. Huffman概述
- 是一种无损压缩编码方式
- 根据字符出现的概率编码,对概率高的字符使用较短的编码,对概率低的字符使用较长的编码,从而使得编码后的字符串的长度的期望最小
- 是一种贪心算法:每次总选择两个概率最小的字符结点进行合并
- 选择用频数代替频率,方便计算
- Huffman编码不唯一
下图是计算过程示例:
2. 编程实现过程
(1) 对给定的字符串统计各字符出现的频数。
(2) 剔除掉统计结果中频数为0的数值,对数组进行整理,并将不为0的字符压入vector,用于后续遍历。
(3) 用剔除后的数组创建HuffmanTree:每次在数组中查找当前权值最小的两个结点,将其合并为一个新结点,并依次填入数组的后续位置。(注:N个数据生成HuffmanTree后,共计2N-1个结点)。
(4) 完成HuffmanTree的建立后,依次遍历N个叶结点,每次都查找至根结点,并进行规则设定,例如本代码中令左孩子为0,右孩子为1。本代码采用二维vector进行Huffman编码存储。
(5) 依次输出显示。
3. 代码
#include "stdafx.h"
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <vector>
using namespace std;
// One node of the Huffman tree. Nodes live in a flat array and refer to each
// other by array index rather than by pointer.
struct tagHuffmanNode
{
    int nWeight;  // occurrence count of a leaf, or the sum of both children
    int nLeft;    // array index of the left child
    int nRight;   // array index of the right child
    int nParent;  // array index of the parent; 0 is used as "no parent"
};
typedef struct tagHuffmanNode HuffmanNode;
// Counts how many times each byte value occurs in a NUL-terminated string
// (the counts stand in for frequencies).
//
// str     - NUL-terminated text to scan; a null pointer is treated as empty.
// pWeight - caller-owned table indexed by byte value; it must have an entry
//           for every unsigned char value. Counts are added on top of the
//           values already stored, so the caller zeroes it beforehand.
void CalcFrequency(char* str, int* pWeight)
{
    if (!str || !pWeight)
        return;

    for (; *str; ++str)
    {
        // Index through unsigned char: plain char may be signed, in which
        // case bytes >= 0x80 would be negative and write before the table.
        ++pWeight[static_cast<unsigned char>(*str)];
    }
}
// Drops the symbols that never occur and packs the remaining counts together.
//
// pWeight - table of N counts indexed by symbol value. On return the non-zero
//           counts sit at the front in their original order and every slot
//           after them is zero.
// N       - number of entries in pWeight.
// pChar   - the symbol value of each non-zero entry is appended here in
//           ascending order, so the packed counts can later be matched back
//           to their symbols for display.
void CalcExistChar(int* pWeight, int N, std::vector<int>& pChar)
{
    int kept = 0;  // number of non-zero counts packed so far
    for (int symbol = 0; symbol < N; ++symbol)
    {
        const int count = pWeight[symbol];
        if (count == 0)
            continue;

        pChar.push_back(symbol);
        if (kept != symbol)
            pWeight[kept] = count;
        ++kept;
    }

    // Everything past the packed prefix is stale; reset it to zero.
    while (kept < N)
    {
        pWeight[kept] = 0;
        ++kept;
    }
}
// Finds the two lightest nodes that can still be merged.
//
// Only nodes among the first n entries that have no parent yet (nParent == 0)
// and a positive weight are considered.
//
// pHuffmanTree - node array to search.
// n            - number of leading entries to examine.
// nMin1        - receives the index of the lightest candidate, or -1 if none.
// nMin2        - receives the index of the second lightest, or -1 if fewer
//                than two candidates exist.
// On equal weights the candidate seen first keeps its place.
void SelectNode(HuffmanNode* pHuffmanTree, int n, int &nMin1, int &nMin2)
{
    nMin1 = -1;
    nMin2 = -1;
    int lightest = -1;        // weight of node nMin1
    int secondLightest = -1;  // weight of node nMin2

    for (int idx = 0; idx < n; ++idx)
    {
        const HuffmanNode& node = pHuffmanTree[idx];
        if (node.nParent != 0 || node.nWeight <= 0)
            continue;

        const int weight = node.nWeight;
        if (nMin1 < 0 || weight < lightest)
        {
            // New overall minimum: the previous one drops to second place.
            nMin2 = nMin1;
            secondLightest = lightest;
            nMin1 = idx;
            lightest = weight;
        }
        else if (nMin2 < 0 || weight < secondLightest)
        {
            nMin2 = idx;
            secondLightest = weight;
        }
    }
}
void HuffmanCoding(int* pWeight, int N, vector<vector<char>>& code)
{
if (N <= 0)
return;
int m = 2*N - 1; //由N个结点生成的Huffman树,共计2N-1个结点
HuffmanNode* pHuffmanTree = new HuffmanNode[m];
for (int i=0; i<m; i++)
{
pHuffmanTree[i].nLeft = 0;
pHuffmanTree[i].nParent = 0;
pHuffmanTree[i].nRight = 0;
pHuffmanTree[i].nWeight = 0;
}
int nMin1, nMin2; //定义最小的两个数值
int i;
//建立叶子结点
for (i=0; i<N; i++)
{
pHuffmanTree[i].nWeight = pWeight[i];
}
//每次选择现有结点中权值最小的两个结点,创建数
for (i=N; i<m; i++)
{
SelectNode(pHuffmanTree, i, nMin1, nMin2);
pHuffmanTree[nMin1].nParent = i;
pHuffmanTree[nMin2].nParent = i;
pHuffmanTree[i].nWeight = pHuffmanTree[nMin1].nWeight + pHuffmanTree[nMin2].nWeight;
pHuffmanTree[i].nLeft = nMin1;
pHuffmanTree[i].nRight = nMin2;
}
//针对HuffmanTree遍历叶结点,向上索引至根节点
int nChild, nParent;
for(int i=0; i<N; i++)
{
vector<char>& cur = code[i];
nParent = pHuffmanTree[i].nParent;
nChild = i;
while(nParent != 0)
{
if (pHuffmanTree[nParent].nLeft == nChild)
{
cur.push_back('0');
}
else if(pHuffmanTree[nParent].nRight == nChild)
{
cur.push_back('1');
}
nChild = nParent;
nParent = pHuffmanTree[nParent].nParent;
}
reverse(cur.begin(), cur.end());
}
}
// Writes one line to std::cout for a single symbol: its numeric value, a tab,
// the character itself followed by ":" and a tab, then the code digits.
//
// c    - the symbol being described.
// code - the symbol's code as a sequence of characters, printed in order.
void PrintCode(char c, std::vector<char>& code)
{
    std::cout << static_cast<int>(c) << "\t" << c << ":\t";
    for (std::vector<char>::size_type pos = 0; pos < code.size(); ++pos)
    {
        std::cout << code[pos];
    }
    std::cout << std::endl;
}
// Prints the whole code table, one line per entry of `code`.
//
// code  - code[i] is the code of the i-th symbol.
// pChar - pChar[i] is the value of the i-th symbol; it is read for every
//         index of `code`, so it needs at least as many entries.
void Print(std::vector<std::vector<char>>& code, std::vector<int>& pChar)
{
    const int total = static_cast<int>(code.size());
    int row = 0;
    while (row < total)
    {
        // Symbol values are stored as int; PrintCode takes them as char.
        PrintCode(static_cast<char>(pChar[row]), code[row]);
        ++row;
    }
}
int _tmain(int argc, _TCHAR* argv[])
{
char text[] = "every night in my dreams \
i see you, i feel you,\
that is how i know you go on \
far across the distance \
and spaces between us \
you have come to show you go on \
near, far, wherever you are \
i believe that the heart does go on \
once more you open the door \
and you're here in my heart \
and my heart will go on and on \
love can touch us one time \
and last for a lifetime \
and never let go till we're one \
love was when i loved you \
one true time i hold to \
in my life we'll always go on \
near, far, wherever you are \
i believe that the heart does go on \
once more you open the door \
and you're here in my heart \
and my heart will go on and on \
there is some love that will not go away \
you're here, there's nothing i fear,\
and i know that my heart will go on \
we'll stay forever this way \
you are safe in my heart \
and my heart will go on and on";
const int N = 256;
int pWeight[N] = {0};
CalcFrequency(text, pWeight);
pWeight['\t'] = 0;
vector<int> pChar;
CalcExistChar(pWeight, N, pChar);
int N2 = (int)pChar.size();
vector<vector<char>> code(N2);
HuffmanCoding(pWeight, N2, code);
Print(code, pChar);
system("pause");
return 0;
}
4. 程序运行结果
5. 注意
以下任一种情况都可能导致同一文件的压缩编码不一样:
(1) 在进行查找最小的两个数值时,可能同时存在多个频数相同的数值;
(2) 在生成编码时定义的左0右1、左1右0规则不同。