一、哈夫曼树的概念
1、哈夫曼树:给定n个权值,以它们作为n个叶子结点的权值可以构造出多棵不同的二叉树,其中带权路径长度(WPL)最小的二叉树称为哈夫曼树(最优二叉树)。
2、结点的带权路径长度:从根结点到该结点的路径长度与该结点权值的乘积
3、路径长度:路径上的分支数目
4、树的带权路径长度:树中所有叶子结点的带权路径长度之和
在哈夫曼树中,权值越小的叶子结点离根结点越远,权值越大的叶子结点离根结点越近
二、相关功能实现
1、结构体的定义
// One distinct character of the input together with its occurrence count.
typedef struct wordcnt {
    char ch = '\0';  // the character being counted; '\0' until the slot is filled
    int cnt = 0;     // how many times `ch` has been seen
} Count;
// Frequency table: WordCount fills count[0 .. length-1], one entry per
// distinct character, in order of first appearance.
struct NumCount {
    Count count[MaxSize];  // storage for the per-character entries
    int length{0};         // number of entries currently in use
};
// One node of the Huffman tree. Nodes live in an array and refer to each
// other by index; index 0 is the "none" value, so a default-constructed node
// has no parent and no children.
typedef struct HTree {
    char data = '\0';  // character stored in a leaf node
    int weight = 0;    // weight of the node
    int parent = 0;    // index of the parent node, 0 if none
    int lchild = 0;    // index of the left child, 0 if none
    int rchild = 0;    // index of the right child, 0 if none
} HTNode, *HuffmanTree;
// One entry of the code table: a character and its Huffman code.
typedef struct HCode {
    char data = '\0';     // character this code belongs to
    char* str = nullptr;  // NUL-terminated string of '0'/'1' digits; nullptr until assigned
} *HuffmanCode;
2、读入文件
从 in.txt 中读取第一行待压缩的文本,存入字符数组(最多读取 MaxSize-1 个字符)
void ReadData(char *source)
{
//打开文件读入数据
ifstream infile;
infile.open("in.txt");
cout<<"Reading..."<<endl;
cout<<"the input file is:"<<endl;
infile.getline(source,MaxSize);
cout<<source<<endl;
infile.close();
cout<<endl;
}
3、统计字符出现次数
void WordCount(char *data,NumCount *paraCnt)
{
int flag;// 标识是否已经记录
int len = strlen(data);
for(int i = 0;i < len;++i)
{
flag = 0;
for(int j = 0;j < paraCnt->length;++j)
{
if(paraCnt->count[j].ch == data[i]) // 若已有记录,直接++
{
++paraCnt->count[j].cnt;
flag = 1;
break;
}
}
if(!flag)//没有记录,则新增
{
paraCnt->count[paraCnt->length].ch = data[i];
++paraCnt->count[paraCnt->length].cnt;
++paraCnt->length;
}
}
}
4、展示次数
// Prints the number of distinct characters followed by one line per
// character with its occurrence count, then a blank line.
void Show(NumCount *paraCnt)
{
    cout << "the length is " << paraCnt->length << endl;

    const Count *entry = paraCnt->count;
    const Count *const stop = entry + paraCnt->length;
    for (; entry != stop; ++entry)
    {
        cout << "The character " << entry->ch << " appears " << entry->cnt << endl;
    }

    cout << endl;
}
5、选择权重最小的两个节点
// Finds, among nodes HT[1..top] that have no parent yet (parent == 0), the
// node with the smallest weight (*s1) and the node with the next smallest
// weight (*s2). Ties are resolved in favour of the lower index.
// If fewer than two such nodes exist, the missing result is set to 0, the
// index the tree uses for "none".
void select(HuffmanTree HT,int top,int *s1,int *s2)
{
    int best = 0;      // index of the lightest parentless node seen so far
    int runnerUp = 0;  // index of the second lightest

    for (int i = 1; i <= top; ++i)
    {
        if (HT[i].parent != 0)
        {
            continue;  // already merged into a subtree
        }

        if (best == 0 || HT[i].weight < HT[best].weight)
        {
            // New minimum: the previous minimum becomes the runner-up.
            runnerUp = best;
            best = i;
        }
        else if (runnerUp == 0 || HT[i].weight < HT[runnerUp].weight)
        {
            runnerUp = i;
        }
    }

    *s1 = best;
    *s2 = runnerUp;
}
6、创建哈夫曼树
(1)根据给定的n个权值{W1, W2,…, Wn},构造n棵只有根结点的二叉树,这n棵二叉树构成一个森林 F。
(2)在森林F中选取两棵根结点的权值最小的树作为左右子树构造一棵新的二叉树,且置新的二叉树的根结点的权值为其左、右子树上根结点的权值之和。
(3)在森林F中删除这两棵树,同时将新得到的二叉树加入F中。
(4)重复(2)和(3),直到F只含一棵树为止。这棵树便是哈夫曼树。
// Builds a Huffman tree for the first `length` entries of `cntarray`.
//
// The tree is a new[]-allocated array of 2*length-1 nodes stored in slots
// 1..2*length-1 (slot 0 is unused so that index 0 can mean "none"). Leaves
// occupy slots 1..length, internal nodes follow, and the last slot is the
// root. The caller owns the array and releases it with delete[].
//
// With fewer than two symbols no tree is built and HT is set to nullptr.
void CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray)
{
    if (length <= 1)
    {
        HT = nullptr;  // give the caller a well-defined value to test
        return;
    }

    // A Huffman tree has no node of degree 1, so n leaves imply 2n-1 nodes.
    const int m = length * 2 - 1;
    HT = new HTNode[m + 1];

    // Clear every slot, including the unused slot 0.
    for (int i = 0; i <= m; ++i)
    {
        HT[i].data = '\0';
        HT[i].weight = 0;
        HT[i].parent = 0;
        HT[i].lchild = 0;
        HT[i].rchild = 0;
    }

    // Leaves: one per counted character.
    for (int i = 1; i <= length; ++i)
    {
        HT[i].data = cntarray.count[i - 1].ch;
        HT[i].weight = cntarray.count[i - 1].cnt;
    }

    // Internal nodes: repeatedly merge the two lightest parentless nodes.
    for (int i = length + 1; i <= m; ++i)
    {
        int s1 = 0;
        int s2 = 0;
        select(HT, i - 1, &s1, &s2);

        // Giving s1 and s2 a parent removes them from later selections.
        HT[s1].parent = i;
        HT[s2].parent = i;
        HT[i].lchild = s1;
        HT[i].rchild = s2;
        HT[i].weight = HT[s1].weight + HT[s2].weight;
    }
}
7、创建哈夫曼编码
在构造哈夫曼树之后,求哈夫曼编码的主要思想是:依次以叶子为出发点,向上回溯至根结点为止。回溯时走左分支则生成代码0,走右分支则生成代码1。
// Derives the code of every leaf HT[1..length] by walking from the leaf up to
// the root: a step up from a left child contributes '0', from a right child
// '1'. Because the walk is bottom-up, digits are written from the end of a
// scratch buffer towards its front.
//
// HC is set to a new[]-allocated table with entries 1..length filled in
// (entry 0 is unused and holds a null string). Each HC[i].str is itself
// new[]-allocated; the caller owns the table and the strings.
// If HT is null or length is not positive, HC is set to nullptr.
void CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length)
{
    if (HT == nullptr || length <= 0)
    {
        HC = nullptr;
        return;
    }

    HC = new HCode[length + 1];
    HC[0].data = '\0';
    HC[0].str = nullptr;

    char *cd = new char[length];  // scratch space: the longest code has length-1 digits
    cd[length - 1] = '\0';        // terminator, so the tail can be copied with strcpy

    for (int i = 1; i <= length; ++i)
    {
        int start = length - 1;  // index in cd where the code currently begins
        int c = i;               // node being left
        int f = HT[c].parent;    // node being entered

        while (f != 0)  // the root is the only node whose parent is 0
        {
            --start;
            cd[start] = (HT[f].lchild == c) ? '0' : '1';
            c = f;
            f = HT[c].parent;
        }

        // length - start == number of digits + 1 for the terminator.
        HC[i].str = new char[length - start];
        HC[i].data = HT[i].data;
        strcpy(HC[i].str, &cd[start]);
    }

    delete[] cd;
}
8、将读入的文件编码,写到txt文件
void Encode(char *data,HuffmanCode HC,int length)
{
ofstream outfile;
outfile.open("code.txt");
for(int i = 0;i < strlen(data);++i) // 依次读入数据,查找对应的编码,写入编码文件
{
for(int j = 1;j <= length;++j)
{
if(data[i] == HC[j].data)
{
outfile<<HC[j].str;
}
}
}
outfile.close();
cout<<"the code txt has been written"<<endl;
cout<<endl;
}
9、读入编码文件,解码
// Reads the first line of "code.txt" (a string of '0'/'1' digits), decodes it
// with the Huffman tree HT that has `length` leaves, and writes the decoded
// characters to "out.txt".
//
// Decoding starts at the root (index 2*length-1), goes to the left child on
// '0' and to the right child on '1', and emits the character whenever a leaf
// is reached before returning to the root. Any other input character is
// ignored. Reports to stderr and returns if the tree is missing or a file
// cannot be opened.
void Decode(HuffmanTree HT,int length)
{
    if (HT == nullptr || length < 1)
    {
        cerr << "nothing to decode: missing Huffman tree" << endl;
        return;
    }

    ifstream infile("code.txt");
    if (!infile.is_open())
    {
        cerr << "cannot open code.txt" << endl;
        return;
    }

    const int capacity = MaxSize * length;
    char *codetxt = new char[capacity];
    codetxt[0] = '\0';  // stay a valid string even if nothing is read
    infile.getline(codetxt, capacity);
    infile.close();

    ofstream outfile("out.txt");
    if (!outfile.is_open())
    {
        cerr << "cannot open out.txt for writing" << endl;
        delete[] codetxt;
        return;
    }

    const int top = 2 * length - 1;  // index of the root
    int node = top;
    for (const char *bit = codetxt; *bit != '\0'; ++bit)
    {
        if (*bit == '0')
        {
            node = HT[node].lchild;
        }
        else if (*bit == '1')
        {
            node = HT[node].rchild;
        }

        if (HT[node].lchild == 0 && HT[node].rchild == 0)
        {
            // Leaf reached: emit its character and restart from the root.
            outfile << HT[node].data;
            node = top;
        }
    }

    outfile.close();
    delete[] codetxt;  // the buffer was previously never released
    cout << "the output txt has been written" << endl;
    cout << endl;
}
三、完整代码
#include <iostream>
#include <fstream>
#include <string.h>
#define MaxSize 1024 // 读入文件的上限
using namespace std;
// One distinct character of the input together with its occurrence count.
typedef struct wordcnt {
    char ch = '\0';  // the character being counted; '\0' until the slot is filled
    int cnt = 0;     // how many times `ch` has been seen
} Count;
// Frequency table: WordCount fills count[0 .. length-1], one entry per
// distinct character, in order of first appearance.
struct NumCount {
    Count count[MaxSize];  // storage for the per-character entries
    int length{0};         // number of entries currently in use
};
// One node of the Huffman tree. Nodes live in an array and refer to each
// other by index; index 0 is the "none" value, so a default-constructed node
// has no parent and no children.
typedef struct HTree {
    char data = '\0';  // character stored in a leaf node
    int weight = 0;    // weight of the node
    int parent = 0;    // index of the parent node, 0 if none
    int lchild = 0;    // index of the left child, 0 if none
    int rchild = 0;    // index of the right child, 0 if none
} HTNode, *HuffmanTree;
// One entry of the code table: a character and its Huffman code.
typedef struct HCode {
    char data = '\0';     // character this code belongs to
    char* str = nullptr;  // NUL-terminated string of '0'/'1' digits; nullptr until assigned
} *HuffmanCode;
// 读入文件
void ReadData(char *source)
{
//打开文件读入数据
ifstream infile;
infile.open("in.txt");
cout<<"Reading..."<<endl;
cout<<"the input file is:"<<endl;
infile.getline(source,MaxSize);
cout<<source<<endl;
infile.close();
cout<<endl;
}
//统计次数
void WordCount(char *data,NumCount *paraCnt)
{
int flag;// 标识是否已经记录
int len = strlen(data);
for(int i = 0;i < len;++i)
{
flag = 0;
for(int j = 0;j < paraCnt->length;++j)
{
if(paraCnt->count[j].ch == data[i]) // 若已有记录,直接++
{
++paraCnt->count[j].cnt;
flag = 1;
break;
}
}
if(!flag)//没有记录,则新增
{
paraCnt->count[paraCnt->length].ch = data[i];
++paraCnt->count[paraCnt->length].cnt;
++paraCnt->length;
}
}
}
// Prints the number of distinct characters followed by one line per
// character with its occurrence count, then a blank line.
void Show(NumCount *paraCnt)
{
    cout << "the length is " << paraCnt->length << endl;

    const Count *entry = paraCnt->count;
    const Count *const stop = entry + paraCnt->length;
    for (; entry != stop; ++entry)
    {
        cout << "The character " << entry->ch << " appears " << entry->cnt << endl;
    }

    cout << endl;
}
// Finds, among nodes HT[1..top] that have no parent yet (parent == 0), the
// node with the smallest weight (*s1) and the node with the next smallest
// weight (*s2). Ties are resolved in favour of the lower index.
// If fewer than two such nodes exist, the missing result is set to 0, the
// index the tree uses for "none".
void select(HuffmanTree HT,int top,int *s1,int *s2)
{
    int best = 0;      // index of the lightest parentless node seen so far
    int runnerUp = 0;  // index of the second lightest

    for (int i = 1; i <= top; ++i)
    {
        if (HT[i].parent != 0)
        {
            continue;  // already merged into a subtree
        }

        if (best == 0 || HT[i].weight < HT[best].weight)
        {
            // New minimum: the previous minimum becomes the runner-up.
            runnerUp = best;
            best = i;
        }
        else if (runnerUp == 0 || HT[i].weight < HT[runnerUp].weight)
        {
            runnerUp = i;
        }
    }

    *s1 = best;
    *s2 = runnerUp;
}
// Builds a Huffman tree for the first `length` entries of `cntarray`.
//
// The tree is a new[]-allocated array of 2*length-1 nodes stored in slots
// 1..2*length-1 (slot 0 is unused so that index 0 can mean "none"). Leaves
// occupy slots 1..length, internal nodes follow, and the last slot is the
// root. The caller owns the array and releases it with delete[].
//
// With fewer than two symbols no tree is built and HT is set to nullptr.
void CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray)
{
    if (length <= 1)
    {
        HT = nullptr;  // give the caller a well-defined value to test
        return;
    }

    // A Huffman tree has no node of degree 1, so n leaves imply 2n-1 nodes.
    const int m = length * 2 - 1;
    HT = new HTNode[m + 1];

    // Clear every slot, including the unused slot 0.
    for (int i = 0; i <= m; ++i)
    {
        HT[i].data = '\0';
        HT[i].weight = 0;
        HT[i].parent = 0;
        HT[i].lchild = 0;
        HT[i].rchild = 0;
    }

    // Leaves: one per counted character.
    for (int i = 1; i <= length; ++i)
    {
        HT[i].data = cntarray.count[i - 1].ch;
        HT[i].weight = cntarray.count[i - 1].cnt;
    }

    // Internal nodes: repeatedly merge the two lightest parentless nodes.
    for (int i = length + 1; i <= m; ++i)
    {
        int s1 = 0;
        int s2 = 0;
        select(HT, i - 1, &s1, &s2);

        // Giving s1 and s2 a parent removes them from later selections.
        HT[s1].parent = i;
        HT[s2].parent = i;
        HT[i].lchild = s1;
        HT[i].rchild = s2;
        HT[i].weight = HT[s1].weight + HT[s2].weight;
    }
}
// Derives the code of every leaf HT[1..length] by walking from the leaf up to
// the root: a step up from a left child contributes '0', from a right child
// '1'. Because the walk is bottom-up, digits are written from the end of a
// scratch buffer towards its front.
//
// HC is set to a new[]-allocated table with entries 1..length filled in
// (entry 0 is unused and holds a null string). Each HC[i].str is itself
// new[]-allocated; the caller owns the table and the strings.
// If HT is null or length is not positive, HC is set to nullptr.
void CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length)
{
    if (HT == nullptr || length <= 0)
    {
        HC = nullptr;
        return;
    }

    HC = new HCode[length + 1];
    HC[0].data = '\0';
    HC[0].str = nullptr;

    char *cd = new char[length];  // scratch space: the longest code has length-1 digits
    cd[length - 1] = '\0';        // terminator, so the tail can be copied with strcpy

    for (int i = 1; i <= length; ++i)
    {
        int start = length - 1;  // index in cd where the code currently begins
        int c = i;               // node being left
        int f = HT[c].parent;    // node being entered

        while (f != 0)  // the root is the only node whose parent is 0
        {
            --start;
            cd[start] = (HT[f].lchild == c) ? '0' : '1';
            c = f;
            f = HT[c].parent;
        }

        // length - start == number of digits + 1 for the terminator.
        HC[i].str = new char[length - start];
        HC[i].data = HT[i].data;
        strcpy(HC[i].str, &cd[start]);
    }

    delete[] cd;
}
// 将读入的文件编码,写到txt文件
void Encode(char *data,HuffmanCode HC,int length)
{
ofstream outfile;
outfile.open("code.txt");
for(int i = 0;i < strlen(data);++i) // 依次读入数据,查找对应的编码,写入编码文件
{
for(int j = 1;j <= length;++j)
{
if(data[i] == HC[j].data)
{
outfile<<HC[j].str;
}
}
}
outfile.close();
cout<<"the code txt has been written"<<endl;
cout<<endl;
}
// Reads the first line of "code.txt" (a string of '0'/'1' digits), decodes it
// with the Huffman tree HT that has `length` leaves, and writes the decoded
// characters to "out.txt".
//
// Decoding starts at the root (index 2*length-1), goes to the left child on
// '0' and to the right child on '1', and emits the character whenever a leaf
// is reached before returning to the root. Any other input character is
// ignored. Reports to stderr and returns if the tree is missing or a file
// cannot be opened.
void Decode(HuffmanTree HT,int length)
{
    if (HT == nullptr || length < 1)
    {
        cerr << "nothing to decode: missing Huffman tree" << endl;
        return;
    }

    ifstream infile("code.txt");
    if (!infile.is_open())
    {
        cerr << "cannot open code.txt" << endl;
        return;
    }

    const int capacity = MaxSize * length;
    char *codetxt = new char[capacity];
    codetxt[0] = '\0';  // stay a valid string even if nothing is read
    infile.getline(codetxt, capacity);
    infile.close();

    ofstream outfile("out.txt");
    if (!outfile.is_open())
    {
        cerr << "cannot open out.txt for writing" << endl;
        delete[] codetxt;
        return;
    }

    const int top = 2 * length - 1;  // index of the root
    int node = top;
    for (const char *bit = codetxt; *bit != '\0'; ++bit)
    {
        if (*bit == '0')
        {
            node = HT[node].lchild;
        }
        else if (*bit == '1')
        {
            node = HT[node].rchild;
        }

        if (HT[node].lchild == 0 && HT[node].rchild == 0)
        {
            // Leaf reached: emit its character and restart from the root.
            outfile << HT[node].data;
            node = top;
        }
    }

    outfile.close();
    delete[] codetxt;  // the buffer was previously never released
    cout << "the output txt has been written" << endl;
    cout << endl;
}
int main(int argc, char** argv){
char data[MaxSize];
NumCount Cntarray;
ReadData(data); // 读入数据
WordCount(data,&Cntarray); // 统计次数
Show(&Cntarray); //可以查看每个单词出现的对应次数
HuffmanTree tree;
CreateHuffmanTree(tree,Cntarray.length,Cntarray); // 建树
HuffmanCode code;
CreateHuffmanCode(tree,code,Cntarray.length); // 创建编码
Encode(data,code,Cntarray.length); // 生成编码文件
Decode(tree,Cntarray.length); // 解码
cout<<"Please view the generated TXT file to check the result"<<endl;
return 0;
}
四、运行结果
Reading...
the input file is:
Many people do not work hard, do not work, and do nothing all day long
the length is 18
The character M appears 1
The character a appears 5
The character n appears 7
The character y appears 2
The character appears 14
The character p appears 2
The character e appears 2
The character o appears 10
The character l appears 4
The character d appears 6
The character t appears 3
The character w appears 2
The character r appears 3
The character k appears 2
The character h appears 2
The character , appears 2
The character i appears 1
The character g appears 2
the code txt has been written
the output txt has been written
Please view the generated TXT file to check the result