哈夫曼编码有动态和静态之分。静态哈夫曼编码需要先统计并计算每个字段的权重(比如文本中字母'A'出现的次数),效率较低,尤其是压缩大文件时基本不现实。不过,静态哈夫曼编码是基础,有助于理解压缩和解压的思想。
本文所使用的算法和构建思路与通常的数据结构和算法书籍中的介绍一致,这个思路也是最原始、最直观的,比较容易理解,本质上是一个经典的贪心算法例子。就是每次从各个子树中(原始的单个元素我们也看成是一棵只有root节点的树),选择2个权重最大的或者权重最小的,最大或者最小的为左子树,次大或者次小的为右子树,左右子树合并成一棵树。当然,合并过程中出现的子树,它的权重是左右子树的权重之和。如此循环往复选取,直到最后只剩一棵树,即生成哈夫曼编码树。有了编码树,每个叶子节点的编码也就明确了。压缩过程就是用编码位域替换文本字段;解压缩就是按文件中的位域扫描编码树,如果是0走到左子树,如果是1走到右子树,到了叶子节点,就解压出一个文本字段。如此循环,即可完整地解压出全部内容。
为了完整理解该思路,我这里的实现没有考虑效率等因素,完全按照编解码最原始的思路来模拟实现,这样非常便于理解。实现思路是把字段的出现次数(权重)按从小到大排序,每次选择2个最小的构成二叉树,如此循环直到只剩最后一棵二叉树,即编码树。每个字段都是一个字节,即0-255之间的数字。创建二叉树后,这里需要遍历2遍编码树,才可以获取到每个字段的编码。关于这点,如果有效率更高的遍历方法,多谢告知。如下是实现代码。
#include <Windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <new>
#include "HanfuMan.h"
/* Number of random bytes TestHanfuMan() generates as its sample input. */
#define TEST_DATA_LEN 16
/* Forward declaration; the definition appears later in this file.
 * Inserts (data, dataTimes) into pDataInfo[0..len) keeping the entries in
 * ascending order of `times`. */
void InsertSort(PDATA_INFO pDataInfo,int len,unsigned char data,unsigned short dataTimes);
/**
 * Self-test for the Huffman tree builder.
 *
 * Generates TEST_DATA_LEN random bytes, counts how often each byte value
 * occurs, sorts the occurring values by count (ascending), builds the
 * Huffman tree from them, prints every code and releases the tree.
 *
 * @return TRUE when the tree was built and printed, FALSE when an
 *         allocation or the tree construction failed.
 */
BOOL TestHanfuMan()
{
    BOOL bRet = FALSE;
    /* Everything that is live at the RET label is declared before the first
     * goto, so no jump skips an initialisation. */
    unsigned short times[256] = {0}; /* occurrences of each byte value 0-255 */
    int count = 0;                   /* number of distinct byte values seen */
    int len = 0;                     /* entries currently stored in pDataInfo */
    PDATA_INFO pDataInfo = NULL;
    PHANFUMAN_TREE tree = NULL;

    /* nothrow so that the NULL checks below are actually reachable */
    unsigned char *p = new (std::nothrow) unsigned char[TEST_DATA_LEN];
    if(!p)
    {
        return FALSE;
    }

    srand((unsigned int)time(NULL));
    for(int i=0;i<TEST_DATA_LEN;i++)
    {
        p[i] = (unsigned char)(rand()%256); /* random sample content */
    }

    /* count how many times every value between 0 and 255 occurs */
    for(int i=0;i<TEST_DATA_LEN;i++)
    {
        times[p[i]]++;
    }
    for(int i=0;i<256;i++)
    {
        if(times[i])
        {
            count++;
        }
    }

    pDataInfo = new (std::nothrow) DATA_INFO[count];
    if(!pDataInfo)
    {
        goto RET;
    }
    memset(pDataInfo,0,count*sizeof(DATA_INFO));

    for(int i=0;i<256;i++)
    {
        if(times[i])
        {
            /* insertion sort: keep the occurring values ordered by ascending count */
            InsertSort(pDataInfo,len,(unsigned char)i,times[i]);
            len++;
        }
    }

    tree = CreateHanfuManTree(pDataInfo,len);
    if(!tree)
    {
        goto RET;
    }
    EnumHanfuManCode(tree);
    DestroyTree(tree);
    bRet = TRUE; /* the original never set this, so it always reported failure */

RET:
    delete [] pDataInfo; /* delete[] on NULL is a no-op */
    delete [] p;
    return bRet;
}
/*
 * Insert one (data, dataTimes) record into pDataInfo[0..len), which is
 * already ordered by ascending `times`. The new record goes after every
 * existing record whose count is less than or equal to dataTimes.
 * The array must have room for len+1 records.
 */
void InsertSort(PDATA_INFO pDataInfo,int len,unsigned char data,unsigned short dataTimes)
{
    /* find the first slot whose count is strictly greater than the new one */
    int slot = 0;
    while(slot < len && pDataInfo[slot].times <= dataTimes)
    {
        slot++;
    }

    /* open a gap at `slot` by moving the tail one position to the right */
    for(int dst = len; dst > slot; dst--)
    {
        pDataInfo[dst] = pDataInfo[dst-1];
    }

    /* store the new record */
    pDataInfo[slot].data = data;
    pDataInfo[slot].times = dataTimes;
}
/*
 * Insert insertTree into pSubTree[0..subTreeCount), which is ordered by
 * ascending root weight. The new tree is placed after every tree whose
 * weight is less than or equal to its own. The array must have room for
 * subTreeCount+1 pointers.
 */
void InsertSortTree(PHANFUMAN_TREE *pSubTree,int subTreeCount,PHANFUMAN_TREE insertTree)
{
    /* locate the first tree that is strictly heavier than the new one */
    int slot = 0;
    while(slot < subTreeCount && pSubTree[slot]->weight <= insertTree->weight)
    {
        slot++;
    }

    /* shift the heavier trees right by one to make room */
    for(int dst = subTreeCount; dst > slot; dst--)
    {
        pSubTree[dst] = pSubTree[dst-1];
    }

    pSubTree[slot] = insertTree;
}
/*
 * Drop the first two entries of pSubTree (the two trees that were just
 * merged) and insert mergeTree among the remaining subTreeCount-2 entries,
 * keeping the array ordered by ascending weight.
 */
void RefreshSubTrees(PHANFUMAN_TREE *pSubTree,int subTreeCount,PHANFUMAN_TREE mergeTree)
{
    const int remaining = subTreeCount-2;

    /* slide everything after the two consumed trees to the front */
    for(int dst = 0; dst < remaining; dst++)
    {
        pSubTree[dst] = pSubTree[dst+2];
    }

    /* sorted insert of the merged tree */
    InsertSortTree(pSubTree,remaining,mergeTree);
}
/*
 * Merge two subtrees under a newly allocated root.
 *
 * pLeftSubTree  - required; becomes the left child.
 * pRightSubTree - optional; becomes the right child (may be NULL, in which
 *                 case the new root has only a left child).
 *
 * The new root's weight is the sum of the children's weights, its data is 0
 * and its parent is NULL. Both children get their parent pointer set to the
 * new root.
 *
 * Returns the new root, or NULL when pLeftSubTree is NULL or the allocation
 * fails; in that case neither child has been modified.
 */
PHANFUMAN_TREE MergeTree(PHANFUMAN_TREE pLeftSubTree,PHANFUMAN_TREE pRightSubTree)
{
    if(!pLeftSubTree)
    {
        return NULL;
    }
    /* nothrow: plain new would throw instead of returning NULL, which made
     * the check below unreachable */
    PHANFUMAN_TREE mergeRoot = new (std::nothrow) HANFUMAN_TREE;
    if(!mergeRoot)
    {
        return NULL;
    }
    mergeRoot->data = 0;
    mergeRoot->parent = NULL;
    mergeRoot->left = pLeftSubTree;
    mergeRoot->right = pRightSubTree;
    mergeRoot->weight = pLeftSubTree->weight;
    pLeftSubTree->parent = mergeRoot;
    if(pRightSubTree)
    {
        mergeRoot->weight += pRightSubTree->weight;
        pRightSubTree->parent = mergeRoot;
    }
    return mergeRoot;
}
/*
 * Allocate a leaf node for one data record.
 *
 * The leaf takes its data value and weight from pDataInfo->data and
 * pDataInfo->times; parent, left and right are NULL.
 *
 * Returns the new leaf, or NULL when pDataInfo is NULL or the allocation
 * fails.
 */
PHANFUMAN_TREE CreateLeaf(PDATA_INFO pDataInfo)
{
    if(!pDataInfo)
    {
        return NULL;
    }
    /* nothrow so that allocation failure is reported as NULL, as the
     * callers expect */
    PHANFUMAN_TREE leafTree = new (std::nothrow) HANFUMAN_TREE;
    if(!leafTree)
    {
        return NULL;
    }
    leafTree->data = pDataInfo->data;
    leafTree->weight = pDataInfo->times;
    leafTree->parent = NULL;
    leafTree->left = NULL;
    leafTree->right = NULL;
    return leafTree;
}
/*
 * Failure path of CreateHanfuManTree: releases every subtree still held in
 * pSubTree[0..subTreeCount), up to two nodes that were created but not yet
 * attached to any subtree, and the pSubTree array itself. Always returns NULL.
 */
static PHANFUMAN_TREE AbortTreeBuild(PHANFUMAN_TREE *pSubTree,int subTreeCount,PHANFUMAN_TREE orphanA,PHANFUMAN_TREE orphanB)
{
    for(int i=0;i<subTreeCount;i++)
    {
        DestroyTree(pSubTree[i]);
    }
    DestroyTree(orphanA); /* DestroyTree ignores NULL */
    DestroyTree(orphanB);
    delete [] pSubTree;
    return NULL;
}

/*
 * Build the Huffman coding tree.
 *
 * pDataInfo - records ordered by ascending `times` (occurrence count).
 * len       - number of records in pDataInfo.
 *
 * Each round picks the two smallest weights among the not-yet-consumed
 * records and the roots of the subtrees built so far (on a tie the subtree is
 * preferred), merges them, and puts the result back into the weight-ordered
 * subtree list. When all records are consumed the remaining subtrees are
 * merged pairwise until one tree is left.
 *
 * With a single record the result is a root whose left child is the only
 * leaf and whose right child is NULL.
 *
 * Returns the root of the tree (release it with DestroyTree), or NULL when
 * pDataInfo is NULL, len <= 0, or an allocation fails. On failure everything
 * allocated so far is released.
 */
PHANFUMAN_TREE CreateHanfuManTree(PDATA_INFO pDataInfo,int len)
{
    if(!pDataInfo || len<=0)
    {
        return NULL;
    }
    int dataIndex = 0;
    /* Every subtree in the list holds at least two leaves (except in the
     * len==1 case), so at most (len+1)/2 roots are ever alive at once. */
    const int maxSubTrees = (len+1)/2;
    PHANFUMAN_TREE *pSubTree = new (std::nothrow) PHANFUMAN_TREE[maxSubTrees];
    if(!pSubTree)
    {
        return NULL;
    }
    int subTreeCount = 0; /* number of subtrees currently in pSubTree */
    HANFUMAN_SELECT_HELPER selectHelper;
    memset(pSubTree,0,sizeof(PHANFUMAN_TREE)*maxSubTrees);
    while(dataIndex<len)
    {
        /* Pick the two smallest weights among the remaining records and the
         * subtrees. Both sequences are sorted ascending, so only the heads
         * need to be compared. Start by assuming the two lightest subtrees. */
        if(subTreeCount>=2)
        {
            selectHelper.firstMinIndex = 0;
            selectHelper.secondMinIndex = 1;
        }
        else
        {
            if(subTreeCount>=1)
            {
                selectHelper.firstMinIndex = 0;
            }
        }
        if(-1 == selectHelper.firstMinIndex)
        {
            /* no subtree yet: take the next one or two records */
            selectHelper.firstMinIndex = dataIndex;
            selectHelper.firstMinType = INDEX_TYPE_INFO;
            if(++dataIndex<len)
            {
                selectHelper.secondMinIndex = dataIndex++;
                selectHelper.secondMinType = INDEX_TYPE_INFO;
            }
        }
        else
        {
            if(pDataInfo[dataIndex].times < (pSubTree[selectHelper.firstMinIndex])->weight)
            {
                /* the record is lighter than the lightest subtree */
                selectHelper.secondMinIndex = selectHelper.firstMinIndex;
                selectHelper.firstMinIndex = dataIndex;
                selectHelper.firstMinType = INDEX_TYPE_INFO;
                if( (++dataIndex<len) && ( pDataInfo[dataIndex].times < (pSubTree[selectHelper.secondMinIndex])->weight ) )
                {
                    selectHelper.secondMinIndex = dataIndex++;
                    selectHelper.secondMinType = INDEX_TYPE_INFO;
                }
            }
            else
            {
                if( (-1==selectHelper.secondMinIndex) || (pDataInfo[dataIndex].times < (pSubTree[selectHelper.secondMinIndex])->weight))
                {
                    selectHelper.secondMinIndex = dataIndex++;
                    selectHelper.secondMinType = INDEX_TYPE_INFO;
                }
            }
        }
        /* the two smallest weights are now selected; exactly one of the four
         * type combinations below applies */
        if(INDEX_TYPE_TREE == selectHelper.firstMinType && INDEX_TYPE_TREE == selectHelper.secondMinType)
        {
            /* merge the two lightest subtrees; no record is consumed */
            PHANFUMAN_TREE mergeTree = MergeTree(pSubTree[0],pSubTree[1]);
            if(!mergeTree)
            {
                return AbortTreeBuild(pSubTree,subTreeCount,NULL,NULL);
            }
            RefreshSubTrees(pSubTree,subTreeCount,mergeTree);
            subTreeCount--;
        }
        else if(INDEX_TYPE_TREE == selectHelper.firstMinType && INDEX_TYPE_INFO == selectHelper.secondMinType)
        {
            /* lightest subtree on the left, new leaf on the right */
            PHANFUMAN_TREE newLeaf = CreateLeaf(&pDataInfo[selectHelper.secondMinIndex]);
            if(!newLeaf)
            {
                return AbortTreeBuild(pSubTree,subTreeCount,NULL,NULL);
            }
            PHANFUMAN_TREE mergeTree = MergeTree(pSubTree[0],newLeaf);
            if(!mergeTree)
            {
                return AbortTreeBuild(pSubTree,subTreeCount,newLeaf,NULL);
            }
            for(int i=1;i<subTreeCount;i++)
            {
                pSubTree[i-1] = pSubTree[i];
            }
            /* one subtree removed, one inserted: the count is unchanged */
            InsertSortTree(pSubTree,subTreeCount-1,mergeTree);
        }
        else if(INDEX_TYPE_INFO == selectHelper.firstMinType && INDEX_TYPE_INFO == selectHelper.secondMinType)
        {
            /* two records form a brand-new subtree */
            PHANFUMAN_TREE leftLeaf = CreateLeaf(&pDataInfo[selectHelper.firstMinIndex]);
            if(!leftLeaf)
            {
                return AbortTreeBuild(pSubTree,subTreeCount,NULL,NULL);
            }
            PHANFUMAN_TREE rightLeaf = CreateLeaf(&pDataInfo[selectHelper.secondMinIndex]);
            if(!rightLeaf) /* the original re-tested leftLeaf here */
            {
                return AbortTreeBuild(pSubTree,subTreeCount,leftLeaf,NULL);
            }
            PHANFUMAN_TREE mergeTree = MergeTree(leftLeaf,rightLeaf);
            if(!mergeTree)
            {
                return AbortTreeBuild(pSubTree,subTreeCount,leftLeaf,rightLeaf);
            }
            InsertSortTree(pSubTree,subTreeCount,mergeTree);
            subTreeCount++; /* a new subtree was added */
        }
        else
        {
            /* INDEX_TYPE_INFO first, INDEX_TYPE_TREE second */
            PHANFUMAN_TREE leftLeaf = CreateLeaf(&pDataInfo[selectHelper.firstMinIndex]);
            if(!leftLeaf)
            {
                return AbortTreeBuild(pSubTree,subTreeCount,NULL,NULL);
            }
            if(-1 == selectHelper.secondMinIndex)
            {
                /* a single record and no subtree: root with only a left child */
                PHANFUMAN_TREE mergeTree = MergeTree(leftLeaf,NULL);
                if(!mergeTree)
                {
                    return AbortTreeBuild(pSubTree,subTreeCount,leftLeaf,NULL);
                }
                InsertSortTree(pSubTree,subTreeCount,mergeTree);
                subTreeCount++;
            }
            else
            {
                /* new leaf on the left, lightest subtree on the right */
                PHANFUMAN_TREE mergeTree = MergeTree(leftLeaf,pSubTree[selectHelper.secondMinIndex]);
                if(!mergeTree)
                {
                    return AbortTreeBuild(pSubTree,subTreeCount,leftLeaf,NULL);
                }
                for(int i=1;i<subTreeCount;i++)
                {
                    pSubTree[i-1] = pSubTree[i];
                }
                InsertSortTree(pSubTree,subTreeCount-1,mergeTree);
            }
        }
        selectHelper.Init();
    }
    /* all records consumed: merge the remaining subtrees pairwise */
    while(subTreeCount>1)
    {
        PHANFUMAN_TREE mergeTree = MergeTree(pSubTree[0],pSubTree[1]);
        if(!mergeTree)
        {
            return AbortTreeBuild(pSubTree,subTreeCount,NULL,NULL);
        }
        RefreshSubTrees(pSubTree,subTreeCount,mergeTree);
        subTreeCount--;
    }
    /* the single remaining subtree is the coding tree */
    PHANFUMAN_TREE tree = pSubTree[0];
    delete [] pSubTree;
    return tree;
}
/*
 * Release a tree: the left subtree first, then the right subtree, then the
 * node itself. A NULL tree is ignored. The pointer is passed by value, so
 * the caller's own pointer is left unchanged.
 */
void DestroyTree(PHANFUMAN_TREE tree)
{
    if(tree)
    {
        PHANFUMAN_TREE leftChild = tree->left;
        PHANFUMAN_TREE rightChild = tree->right;
        DestroyTree(leftChild);
        DestroyTree(rightChild);
        delete tree;
    }
}
/*
 * Print the code of `tree` by walking up through the parent pointers.
 * The recursion reaches the root first, so the bits come out in root-to-node
 * order: "0" when a node is its parent's left child, "1" otherwise.
 * *codeLen is incremented once per printed bit. Nothing is printed for a
 * NULL node or for a node without a parent.
 */
void PrintHanfuManCode(PHANFUMAN_TREE tree,int *codeLen)
{
    if(!tree || !tree->parent)
    {
        return;
    }

    PHANFUMAN_TREE up = tree->parent;
    PrintHanfuManCode(up,codeLen); /* emit the ancestors' bits first */

    ++(*codeLen);
    if(up->left != tree)
    {
        printf("1");
    }
    else
    {
        printf("0");
    }
}
/*
 * Print the Huffman code of every leaf in the tree.
 *
 * Walks the tree depth-first (left before right). For each leaf it prints
 * the data value, then the code (obtained by a second walk from the leaf up
 * to the root via PrintHanfuManCode), then the code length. A NULL tree
 * prints nothing.
 */
void EnumHanfuManCode(PHANFUMAN_TREE tree)
{
    if(!tree)
    {
        return;
    }
    /* leaf node */
    if(!tree->left && !tree->right)
    {
        int codeLen = 0;
        /* %02x zero-pads; the former %2x printed e.g. "0x f" for 0x0f */
        printf("data value = 0x%02x HanfuMan Code = ",(unsigned int)tree->data);
        PrintHanfuManCode(tree,&codeLen);
        printf(" CodeLen = %d\r\n",codeLen);
        return;
    }
    /* the recursive calls ignore NULL children themselves */
    EnumHanfuManCode(tree->left);
    EnumHanfuManCode(tree->right);
}
头文件内容如下:
#ifndef _HANFUMAN_H_
#define _HANFUMAN_H_
/*
 * Node of the Huffman coding tree.
 *
 * weight is an unsigned int: an inner node's weight is the sum of its
 * children's weights, and with the former unsigned short that sum wrapped
 * as soon as a subtree covered more than 65535 occurrences.
 * The self-references use the `struct` keyword so the declaration is valid
 * in C as well as C++.
 */
typedef struct _t_HANFUMAN_TREE
{
    unsigned char data;   /* encoded byte value (0-255); 0 for non-leaf nodes */
    unsigned int weight;  /* weight of the node; here the occurrence count of data */
    struct _t_HANFUMAN_TREE* parent;
    struct _t_HANFUMAN_TREE* left;
    struct _t_HANFUMAN_TREE* right;
}HANFUMAN_TREE,*PHANFUMAN_TREE;
/* Tags telling what a selected index refers to. */
#define INDEX_TYPE_TREE 0x00
#define INDEX_TYPE_INFO 0x01
/*
 * Scratch record holding the two smallest candidates chosen in one round of
 * tree construction: an index plus a tag (INDEX_TYPE_TREE or INDEX_TYPE_INFO)
 * for each. An index of -1 means "nothing selected"; both tags default to
 * INDEX_TYPE_TREE.
 */
typedef struct _t_HANFUMAN_SELECT_HELPER
{
    int firstMinIndex;
    int secondMinIndex;
    unsigned char firstMinType;
    unsigned char secondMinType;

    /* A new helper starts in the same state Init() produces. */
    _t_HANFUMAN_SELECT_HELPER()
        : firstMinIndex(-1),
          secondMinIndex(-1),
          firstMinType(INDEX_TYPE_TREE),
          secondMinType(INDEX_TYPE_TREE)
    {
    }

    /* Reset to "nothing selected", with both tags back to the subtree type. */
    void Init()
    {
        firstMinIndex = secondMinIndex = -1;
        firstMinType = secondMinType = INDEX_TYPE_TREE;
    }
}HANFUMAN_SELECT_HELPER,*PHANFUMAN_SELECT_HELPER;
/* One input symbol together with its occurrence count. */
typedef struct _t_DATA_INFO
{
unsigned char data; /* the byte value */
unsigned short times; /* number of times data occurs */
}DATA_INFO,*PDATA_INFO;
/* Runs the built-in test on random sample data. */
BOOL TestHanfuMan();
/* Builds the Huffman coding tree from len records in pDataInfo. */
PHANFUMAN_TREE CreateHanfuManTree(PDATA_INFO pDataInfo,int len);
/* Prints the Huffman code of every leaf in the tree. */
void EnumHanfuManCode(PHANFUMAN_TREE tree);
/* Releases every node of the tree. */
void DestroyTree(PHANFUMAN_TREE tree);
#endif
测试例子如下:
#include <Windows.h>
#include <stdio.h>
#include "HanfuMan.h"
/*
 * Test driver: runs the Huffman self-test, then waits for a key press so
 * the console output stays visible.
 */
int main(int argc,char* argv[])
{
    (void)argc;
    (void)argv;
    (void)TestHanfuMan(); /* result intentionally unused, as before */
    getchar();
    return 0;
}