代码转自涂然哥,只做整理和略微修改
思路
1.输入一个字符串后统计每个字符出现的次数(频率)
2.接下来按照规则构造一棵哈夫曼树
3.然后01编码(即给每个字符赋予一个01组成的字符串)
4.将我们输入的字符串按照编码规则生成01字符串
5.解码:将字符串的01编码按照规则还原并输出
相关功能实现
结构体的定义
/*记录每个字符出现的次数*/
typedef struct wordcnt
{
char ch;
int cnt=0;
}Count;
/*整个字符串的字符构成*/
typedef struct NumCount
{
Count count[MaxSize];
int length=0;
}NumCount;
/*核心哈夫曼树*/
typedef struct HTree
{
char data;
int weight;
int parent,lchild,rchild;
}HTNode,*HuffmanTree;
/*树的节点的编码*/
typedef struct HCode
{
char data;
char* str;
}*HuffmanCode;
函数申明
void ReadData(char *source); // 读入文件
void WordCount(char *data,NumCount *paraCnt); // 统计次数
void Show(NumCount *paraCnt); // 展示次数
void CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray); // 创建哈夫曼树
void select(HuffmanTree HT,int top,int *s1,int *s2); // 选择权重最小的两个节点
void CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length); // 创建哈夫曼编码
void Encode(char *data,HuffmanCode HC,int length); // 将读入的文件编码,写到txt文件
void Decode(HuffmanTree HT,int length); //读入编码文件,解码
读入文件
将我们需要压缩的字符串从文件读入到数组中
void ReadData(char *source)
{
//打开文件读入数据
ifstream infile;
infile.open("in.txt");
cout<<"Reading..."<<endl;
cout<<"the input file is:"<<endl;
infile.getline(source,MaxSize);
cout<<source<<endl;
infile.close();
cout<<endl;
}
统计次数
统计读入的字符串中每个字符分别出现多少次
void WordCount(char* data,NumCount* paraCnt)
{
int flag;
int len=strlen(data);
for(int i=0;i<len;++i)
{
flag=0;
for(int j=0;j<paraCnt->length;++j)
{
if(data[i]==paraCnt->count[j].ch)
{
++paraCnt->count[j].cnt;
flag=1;
break;
}
}
if(flag==0)
{
paraCnt->count[paraCnt->length].ch=data[i];
++paraCnt->count[paraCnt->length].cnt;
++paraCnt->length;
}
}
}
展示次数
将每个字符及其出现的次数输出
void Show(NumCount* paraCnt)
{
cout<<"the length is "<<paraCnt->length<<endl;
for(int i = 0;i < paraCnt->length;++i)
{
cout<<"The character "<<paraCnt->count[i].ch<<" appears "<<paraCnt->count[i].cnt<<endl;
}
cout<<endl;
}
创建哈夫曼树
核心哈夫曼树的构造,此处会多次调用select函数找到权重最小的两个节点
void CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray)
{
if(length<=1)
printf("ERROR!!!\r\n");
int s1,s2;
int m = length*2-1; // 没有度为1的节点,则总结点是2*叶子节点数-1个
HT = new HTNode[m+1];
for(int i = 1;i <= m;++i) // 初始化
{
HT[i].parent = 0;
HT[i].lchild = 0;
HT[i].rchild = 0;
}
for(int i = 1;i <= length;++i)
{
HT[i].data = cntarray.count[i-1].ch;
HT[i].weight = cntarray.count[i-1].cnt;
}
for(int i = length + 1;i <= m;++i)
{
select(HT,i-1,&s1,&s2); // 从前面的范围里选择权重最小的两个节点
HT[s1].parent = i;
HT[s2].parent = i;
HT[i].lchild = s1;
HT[i].rchild = s2;
HT[i].weight = HT[s1].weight + HT[s2].weight; // 得到一个新节点
}
}
选择权重最小的两个节点
void select(HuffmanTree HT,int top,int* s1,int* s2)
{
int min = INT_MAX;
for(int i = 1;i <= top;++i) // 选择没有双亲的节点中,权重最小的节点
{
if(HT[i].weight < min && HT[i].parent == 0)
{
min = HT[i].weight;
*s1 = i;
}
}
min = INT_MAX;
for(int i = 1;i <= top;++i) // 选择没有双亲的节点中,权重次小的节点
{
if(HT[i].weight < min && i != *s1 && HT[i].parent == 0)
{
min = HT[i].weight;
*s2 = i;
}
}
}
创建哈夫曼编码
将哈夫曼树中的每一个非空节点生成一个对应的01编码
void CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length)
{
HC = new HCode[length+1];
char *cd = new char[length]; // 存储编码的临时空间
cd[length-1] = '\0'; // 方便之后调用strcpy函数
int c,f,start;
for(int i = 1;i <= length;++i)
{
start = length-1; // start表示编码在临时空间内的起始下标,由于是从叶子节点回溯,所以是从最后开始
c = i;
f = HT[c].parent;
while(f != 0)
{
--start; // 由于是回溯,所以从临时空间的最后往回计
if(HT[f].lchild == c)
cd[start] = '0';
else
cd[start] = '1';
c = f;
f = HT[c].parent;
}
HC[i].str = new char[length-start]; // 最后,实际使用的编码空间大小是length-start
HC[i].data = HT[i].data;
strcpy(HC[i].str,&cd[start]); // 从实际起始地址开始,拷贝到编码结构中
}
delete[] cd;
}
将读入的文件编码,写到txt文件中
void Encode(char* data,HuffmanCode HC,int length)
{
ofstream outfile;
outfile.open("code.txt");
for(int i = 0;i < strlen(data);++i) // 依次读入数据,查找对应的编码,写入编码文件
{
for(int j = 1;j <= length;++j)
{
if(data[i] == HC[j].data)
{
outfile<<HC[j].str;
}
}
}
outfile.close();
cout<<"the code txt has been written"<<endl;
cout<<endl;
}
读入编码文件,解码
void Decode(HuffmanTree HT,int length)
{
char codetxt[MaxSize*length];
ifstream infile;
infile.open("code.txt");
infile.getline(codetxt,MaxSize*length);
infile.close();
ofstream outfile;
outfile.open("out.txt");
int root = 2*length-1; // 从根节点开始遍历
for(int i = 0;i < strlen(codetxt);++i)
{
if(codetxt[i] == '0') root = HT[root].lchild; //为0表示向左遍历
else if(codetxt[i] == '1') root = HT[root].rchild; //为1表示向右遍历
if(HT[root].lchild == 0 && HT[root].rchild == 0) // 如果已经是叶子节点,输出到输出文件中,然后重新回到根节点
{
outfile<<HT[root].data;
root = 2*length-1;
}
}
outfile.close();
cout<<"the output txt has been written"<<endl;
cout<<endl;
}
完整代码实现
#include <iostream>
#include <fstream>
#include <string.h>
using namespace std;
#define MaxSize 1024
typedef struct wordcnt
{
char ch;
int cnt=0;
}Count;
typedef struct NumCount
{
Count count[MaxSize];
int length=0;
}NumCount;
typedef struct HTree
{
char data;
int weight;
int parent,lchild,rchild;
}HTNode,*HuffmanTree;
typedef struct HCode
{
char data;
char* str;
}*HuffmanCode;
void ReadData(char *source); // 读入文件
void WordCount(char *data,NumCount *paraCnt); // 统计次数
void Show(NumCount *paraCnt); // 展示次数
void CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray); // 创建哈夫曼树
void select(HuffmanTree HT,int top,int *s1,int *s2); // 选择权重最小的两个节点
void CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length); // 创建哈夫曼编码
void Encode(char *data,HuffmanCode HC,int length); // 将读入的文件编码,写到txt文件
void Decode(HuffmanTree HT,int length); //读入编码文件,解码
int main(int argc, char** argv) {
char data[MaxSize];
NumCount Cntarray;
ReadData(data); // 读入数据
WordCount(data,&Cntarray); // 统计次数
Show(&Cntarray); //可以查看每个单词出现的对应次数
HuffmanTree tree;
CreateHuffmanTree(tree,Cntarray.length,Cntarray); // 建树
HuffmanCode code;
CreateHuffmanCode(tree,code,Cntarray.length); // 创建编码
Encode(data,code,Cntarray.length); // 生成编码文件
Decode(tree,Cntarray.length); // 解码
cout<<"Please view the generated TXT file to check the result"<<endl;
return 0;
}
void ReadData(char *source)
{
//打开文件读入数据
ifstream infile;
infile.open("in.txt");
cout<<"Reading..."<<endl;
cout<<"the input file is:"<<endl;
infile.getline(source,MaxSize);
cout<<source<<endl;
infile.close();
cout<<endl;
}
void WordCount(char *data,NumCount *paraCnt)
{
int flag;// 标识是否已经记录
int len = strlen(data);
for(int i = 0;i < len;++i)
{
flag = 0;
for(int j = 0;j < paraCnt->length;++j)
{
if(paraCnt->count[j].ch == data[i]) // 若已有记录,直接++
{
++paraCnt->count[j].cnt;
flag = 1;
break;
}
}
if(!flag) // 没有记录,则新增
{
paraCnt->count[paraCnt->length].ch = data[i];
++paraCnt->count[paraCnt->length].cnt;
++paraCnt->length;
}
}
}
void Show(NumCount* paraCnt)
{
cout<<"the length is "<<paraCnt->length<<endl;
for(int i = 0;i < paraCnt->length;++i)
{
cout<<"The character "<<paraCnt->count[i].ch<<" appears "<<paraCnt->count[i].cnt<<endl;
}
cout<<endl;
}
void CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray)
{
if(length<=1)
printf("ERROR!!!\r\n");
int s1,s2;
int m = length*2-1; // 没有度为1的节点,则总结点是2*叶子节点数-1个
HT = new HTNode[m+1];
for(int i = 1;i <= m;++i) // 初始化
{
HT[i].parent = 0;
HT[i].lchild = 0;
HT[i].rchild = 0;
}
for(int i = 1;i <= length;++i)
{
HT[i].data = cntarray.count[i-1].ch;
HT[i].weight = cntarray.count[i-1].cnt;
}
for(int i = length + 1;i <= m;++i)
{
select(HT,i-1,&s1,&s2); // 从前面的范围里选择权重最小的两个节点
HT[s1].parent = i;
HT[s2].parent = i;
HT[i].lchild = s1;
HT[i].rchild = s2;
HT[i].weight = HT[s1].weight + HT[s2].weight; // 得到一个新节点
}
}
void select(HuffmanTree HT,int top,int* s1,int* s2)
{
int min = INT_MAX;
for(int i = 1;i <= top;++i) // 选择没有双亲的节点中,权重最小的节点
{
if(HT[i].weight < min && HT[i].parent == 0)
{
min = HT[i].weight;
*s1 = i;
}
}
min = INT_MAX;
for(int i = 1;i <= top;++i) // 选择没有双亲的节点中,权重次小的节点
{
if(HT[i].weight < min && i != *s1 && HT[i].parent == 0)
{
min = HT[i].weight;
*s2 = i;
}
}
}
void CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length)
{
HC = new HCode[length+1];
char *cd = new char[length]; // 存储编码的临时空间
cd[length-1] = '\0'; // 方便之后调用strcpy函数
int c,f,start;
for(int i = 1;i <= length;++i)
{
start = length-1; // start表示编码在临时空间内的起始下标,由于是从叶子节点回溯,所以是从最后开始
c = i;
f = HT[c].parent;
while(f != 0)
{
--start; // 由于是回溯,所以从临时空间的最后往回计
if(HT[f].lchild == c)
cd[start] = '0';
else
cd[start] = '1';
c = f;
f = HT[c].parent;
}
HC[i].str = new char[length-start]; // 最后,实际使用的编码空间大小是length-start
HC[i].data = HT[i].data;
strcpy(HC[i].str,&cd[start]); // 从实际起始地址开始,拷贝到编码结构中
}
delete[] cd;
}
void Encode(char* data,HuffmanCode HC,int length)
{
ofstream outfile;
outfile.open("code.txt");
for(int i = 0;i < strlen(data);++i) // 依次读入数据,查找对应的编码,写入编码文件
{
for(int j = 1;j <= length;++j)
{
if(data[i] == HC[j].data)
{
outfile<<HC[j].str;
}
}
}
outfile.close();
cout<<"the code txt has been written"<<endl;
cout<<endl;
}
void Decode(HuffmanTree HT,int length)
{
char codetxt[MaxSize*length];
ifstream infile;
infile.open("code.txt");
infile.getline(codetxt,MaxSize*length);
infile.close();
ofstream outfile;
outfile.open("out.txt");
int root = 2*length-1; // 从根节点开始遍历
for(int i = 0;i < strlen(codetxt);++i)
{
if(codetxt[i] == '0') root = HT[root].lchild; //为0表示向左遍历
else if(codetxt[i] == '1') root = HT[root].rchild; //为1表示向右遍历
if(HT[root].lchild == 0 && HT[root].rchild == 0) // 如果已经是叶子节点,输出到输出文件中,然后重新回到根节点
{
outfile<<HT[root].data;
root = 2*length-1;
}
}
outfile.close();
cout<<"the output txt has been written"<<endl;
cout<<endl;
}
测试样例
输入:in.txt
In my lifetime, I will be a sincere person and never give up my love and dedication to life. I will live an infinite life in a limited time and space.
输出
the input file is:
In my lifetime, I will be a sincere person and never give up my love and dedication to life. I will live an infinite life in a limited time and space.
the length is 24
The character I appears 3
The character n appears 12
The character appears 30
The character m appears 5
The character y appears 2
The character l appears 10
The character i appears 18
The character f appears 4
The character e appears 18
The character t appears 6
The character , appears 1
The character w appears 2
The character b appears 1
The character a appears 8
The character s appears 3
The character c appears 3
The character r appears 3
The character p appears 3
The character o appears 4
The character d appears 6
The character v appears 4
The character g appears 1
The character u appears 1
The character . appears 2
the code txt has been written
the output txt has been written
Please view the generated TXT file to check the result