哈夫曼树及编码

最新推荐文章于 2024-04-20 11:37:23 发布

谁说剪刀打不过石头

最新推荐文章于 2024-04-20 11:37:23 发布

阅读量816

点赞数

文章标签：霍夫曼树 数据结构

本文链接：https://blog.csdn.net/Tang_ming_yue/article/details/125034479

版权

在认识哈夫曼树之前，必须先了解以下几个基本术语：

1、什么是路径？

在一棵树中，从一个结点出发向下到达另一个结点所经过的通路，称为这两个结点之间的路径。

2、什么是路径长度？

某一路径所经过的“边”的数量，称为该路径的路径长度。

3、什么是结点的带权路径长度？

若将树中结点赋给一个带有某种含义的数值，则该数值称为该结点的权。从根结点到该结点之间的路径长度与该结点的权的乘积，称为该结点的带权路径长度。

4、什么是树的带权路径长度？

树的带权路径长度规定为所有叶子结点的带权路径长度之和，记为WPL。

5、什么是哈夫曼树？

给定n个权值作为n个叶子结点，构造一棵二叉树，若该树的带权路径长度达到最小，则称该二叉树为哈夫曼树，也被称为最优二叉树。

二、相关功能实现

1、结构体的定义

// One distinct character together with the number of times it was counted.
struct wordcnt
{
	char ch;      // the character being counted
	int cnt = 0;  // occurrence counter, starts at zero
};
typedef wordcnt Count;

// Table of per-character counters; only the first `length` entries are in use.
struct NumCount
{
	Count count[MaxSize];  // one slot per distinct character
	int length = 0;        // number of slots currently in use
};

// Node of the Huffman tree. Nodes live in an array and refer to each other
// by array index.
struct HTree
{
	char data;    // character stored in the node
	int weight;   // weight of the node
	int parent;   // index of the parent node
	int lchild;   // index of the left child
	int rchild;   // index of the right child
};
typedef HTree HTNode;
typedef HTree *HuffmanTree;

// Code table entry: a character and its code as a C string.
struct HCode
{
	char data;  // the character
	char *str;  // its code, NUL-terminated
};
typedef HCode *HuffmanCode;

2、读入文件
将需要压缩的字符读入数组

void ReadData(char *source)
{
	//打开文件读入数据 
	ifstream infile;
	infile.open("in.txt");
	cout<<"Reading..."<<endl;
	cout<<"the input file is:"<<endl;
	infile.getline(source,MaxSize);
	cout<<source<<endl;
	infile.close();
	cout<<endl;
}

3、统计字符出现次数

// Counts how often each character occurs in the NUL-terminated string `data`.
// Results are accumulated into `paraCnt`: a character seen for the first time
// is appended to the table, otherwise its existing counter is incremented.
void WordCount(char *data,NumCount *paraCnt)
{
	const int total = strlen(data);
	for(int pos = 0;pos < total;++pos)
	{
		const char current = data[pos];

		// Linear search for the slot that already holds this character.
		int slot = 0;
		while(slot < paraCnt->length && paraCnt->count[slot].ch != current)
			++slot;

		if(slot == paraCnt->length)  // not recorded yet: claim a new slot
		{
			paraCnt->count[slot].ch = current;
			++paraCnt->length;
		}
		++paraCnt->count[slot].cnt;
	}
}

4、展示次数

// Prints the number of distinct characters followed by one line per
// character with its occurrence count.
void Show(NumCount *paraCnt)
{
	const int total = paraCnt->length;
	cout<<"the length is "<<total<<endl;

	int idx = 0;
	while(idx < total)
	{
		const Count &entry = paraCnt->count[idx];
		cout<<"The character "<<entry.ch<<"  appears  "<<entry.cnt<<endl;
		++idx;
	}
	cout<<endl;
}

5、选择权重最小的两个节点

// Picks the two lightest nodes that have no parent yet among HT[1..top].
//
// s1: receives the index of the lightest such node.
// s2: receives the index of the lightest such node other than *s1.
// On equal weights the lower index wins. An output is set to 0 when no
// matching node exists, so callers never read an unassigned value.
void select(HuffmanTree HT,int top,int *s1,int *s2)
{
	// Index 0 is used as "nothing found yet" instead of an INT_MAX sentinel,
	// so no extra header is needed and any weight can be selected.
	int best = 0;
	for(int i = 1;i <= top;++i)
	{
		if(HT[i].parent != 0)
			continue;  // already merged into a subtree
		if(best == 0 || HT[i].weight < HT[best].weight)
			best = i;
	}
	*s1 = best;

	best = 0;
	for(int i = 1;i <= top;++i)
	{
		if(HT[i].parent != 0 || i == *s1)
			continue;  // merged already, or it is the node chosen above
		if(best == 0 || HT[i].weight < HT[best].weight)
			best = i;
	}
	*s2 = best;
}

6、创建哈夫曼树
(1)根据给定的n个权值{W1, W2,…, Wn},构造n棵只有根结点的二叉树，这n棵二叉树构成一个森林 F。

(2)在森林F中选取两棵根结点的权值最小的树作为左右子树构造一棵新的二叉树，且置新的二叉树的根结点的权值为其左、右子树上根结点的权值之和。

(3)在森林F中删除这两棵树，同时将新得到的二叉树加入F中。

(4)重复(2)和(3),直到F只含一棵树为止。这棵树便是哈夫曼树。

// Builds a Huffman tree for the first `length` entries of `cntarray`.
//
// HT:       receives a new[]-allocated array of 2*length nodes (slot 0 is
//           unused, the root is HT[2*length-1]); the caller must delete[] it.
//           Set to NULL when length <= 0 so the caller never sees an
//           indeterminate pointer.
// length:   number of leaves (distinct characters).
// cntarray: characters and their counts, used as leaf data and weights.
// With length == 1 the tree consists of the single leaf HT[1].
void CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray)
{
	if(length <= 0)
	{
		HT = NULL;
		return;
	}

	int s1,s2;
	int m = length*2-1;  // a tree without degree-1 nodes has 2*leaves-1 nodes
	HT = new HTNode[m+1];
	for(int i = 0;i <= m;++i)  // give every field of every slot a defined value
	{
		HT[i].data = '\0';
		HT[i].weight = 0;
		HT[i].parent = 0;
		HT[i].lchild = 0;
		HT[i].rchild = 0;
	}

	for(int i = 1;i <= length;++i)  // leaves occupy slots 1..length
	{
		HT[i].data = cntarray.count[i-1].ch;
		HT[i].weight = cntarray.count[i-1].cnt;
	}

	for(int i = length + 1;i <= m;++i)  // internal nodes occupy the remaining slots
	{
		select(HT,i-1,&s1,&s2);  // two lightest parentless nodes built so far
		HT[s1].parent = i;
		HT[s2].parent = i;       // a non-zero parent removes them from later selection
		HT[i].lchild = s1;
		HT[i].rchild = s2;
		HT[i].weight = HT[s1].weight + HT[s2].weight;
	}
}

7、创建哈夫曼编码
在构造哈夫曼树之后，求哈夫曼编码的主要思想是：依次以叶子为出发点，向上回溯至根结点为止。回溯时走左分支则生成代码0，走右分支则生成代码1。

// Derives the code of every leaf HT[1..length] by walking from the leaf up
// to the root: a step from a left child contributes '0', any other step '1'.
//
// HC: receives a new[]-allocated table of length+1 entries (slot 0 unused);
//     each HC[i].str is itself new[]-allocated. Set to NULL when there is no
//     tree or length <= 0.
void CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length)
{
	if(HT == NULL || length <= 0)  // with length == 0 the scratch buffer below would be indexed at -1
	{
		HC = NULL;
		return;
	}

	HC = new HCode[length+1];
	HC[0].data = '\0';  // slot 0 is unused; give it defined contents
	HC[0].str = NULL;

	char *cd = new char[length];  // scratch buffer, filled from the end backwards
	cd[length-1] = '\0';          // terminator so strcpy can be used below
	for(int i = 1;i <= length;++i)
	{
		int start = length-1;  // index where the code currently begins in cd
		int c = i;
		int f = HT[c].parent;
		while(f != 0)  // climb until the node without a parent is reached
		{
			--start;
			if(HT[f].lchild == c)
				cd[start] = '0';
			else
				cd[start] = '1';
			c = f;
			f = HT[c].parent;
		}
		HC[i].str = new char[length-start];  // code length plus terminator
		HC[i].data = HT[i].data;
		strcpy(HC[i].str,&cd[start]);
	}
	delete[] cd;
}

8、将读入的文件编码，写到txt文件

void Encode(char *data,HuffmanCode HC,int length)
{
	ofstream outfile;
	outfile.open("code.txt");
	for(int i = 0;i < strlen(data);++i)  // 依次读入数据，查找对应的编码，写入编码文件 
	{
		for(int j = 1;j <= length;++j)
		{
			if(data[i] == HC[j].data)
			{
				outfile<<HC[j].str;
			}
		}
	}
	outfile.close();
	cout<<"the code txt has been written"<<endl;
	cout<<endl;
}

完整代码：

#include <iostream>
#include <fstream>
#include <string.h>
 
#define MaxSize 1024  // 读入文件的上限 
using namespace std;

// One distinct character together with the number of times it was counted.
struct wordcnt
{
	char ch;      // the character being counted
	int cnt = 0;  // occurrence counter, starts at zero
};
typedef wordcnt Count;

// Table of per-character counters; only the first `length` entries are in use.
struct NumCount
{
	Count count[MaxSize];  // one slot per distinct character
	int length = 0;        // number of slots currently in use
};

// Node of the Huffman tree. Nodes live in an array and refer to each other
// by array index.
struct HTree
{
	char data;    // character stored in the node
	int weight;   // weight of the node
	int parent;   // index of the parent node
	int lchild;   // index of the left child
	int rchild;   // index of the right child
};
typedef HTree HTNode;
typedef HTree *HuffmanTree;

// Code table entry: a character and its code as a C string.
struct HCode
{
	char data;  // the character
	char *str;  // its code, NUL-terminated
};
typedef HCode *HuffmanCode;

// 读入文件 
void ReadData(char *source)
{
	//打开文件读入数据 
	ifstream infile;
	infile.open("in.txt");
	cout<<"Reading..."<<endl;
	cout<<"the input file is:"<<endl;
	infile.getline(source,MaxSize);
	cout<<source<<endl;
	infile.close();
	cout<<endl;
}

// Counts how often each character occurs in the NUL-terminated string `data`.
// Results are accumulated into `paraCnt`: a character seen for the first time
// is appended to the table, otherwise its existing counter is incremented.
void WordCount(char *data,NumCount *paraCnt)
{
	const int total = strlen(data);
	for(int pos = 0;pos < total;++pos)
	{
		const char current = data[pos];

		// Linear search for the slot that already holds this character.
		int slot = 0;
		while(slot < paraCnt->length && paraCnt->count[slot].ch != current)
			++slot;

		if(slot == paraCnt->length)  // not recorded yet: claim a new slot
		{
			paraCnt->count[slot].ch = current;
			++paraCnt->length;
		}
		++paraCnt->count[slot].cnt;
	}
}

// Prints the number of distinct characters followed by one line per
// character with its occurrence count.
void Show(NumCount *paraCnt)
{
	const int total = paraCnt->length;
	cout<<"the length is "<<total<<endl;

	int idx = 0;
	while(idx < total)
	{
		const Count &entry = paraCnt->count[idx];
		cout<<"The character "<<entry.ch<<"  appears  "<<entry.cnt<<endl;
		++idx;
	}
	cout<<endl;
}

// Picks the two lightest nodes that have no parent yet among HT[1..top].
//
// s1: receives the index of the lightest such node.
// s2: receives the index of the lightest such node other than *s1.
// On equal weights the lower index wins. An output is set to 0 when no
// matching node exists, so callers never read an unassigned value.
void select(HuffmanTree HT,int top,int *s1,int *s2)
{
	// Index 0 is used as "nothing found yet" instead of an INT_MAX sentinel,
	// so no extra header is needed and any weight can be selected.
	int best = 0;
	for(int i = 1;i <= top;++i)
	{
		if(HT[i].parent != 0)
			continue;  // already merged into a subtree
		if(best == 0 || HT[i].weight < HT[best].weight)
			best = i;
	}
	*s1 = best;

	best = 0;
	for(int i = 1;i <= top;++i)
	{
		if(HT[i].parent != 0 || i == *s1)
			continue;  // merged already, or it is the node chosen above
		if(best == 0 || HT[i].weight < HT[best].weight)
			best = i;
	}
	*s2 = best;
}

// Builds a Huffman tree for the first `length` entries of `cntarray`.
//
// HT:       receives a new[]-allocated array of 2*length nodes (slot 0 is
//           unused, the root is HT[2*length-1]); the caller must delete[] it.
//           Set to NULL when length <= 0 so the caller never sees an
//           indeterminate pointer.
// length:   number of leaves (distinct characters).
// cntarray: characters and their counts, used as leaf data and weights.
// With length == 1 the tree consists of the single leaf HT[1].
void CreateHuffmanTree(HuffmanTree &HT,int length,NumCount cntarray)
{
	if(length <= 0)
	{
		HT = NULL;
		return;
	}

	int s1,s2;
	int m = length*2-1;  // a tree without degree-1 nodes has 2*leaves-1 nodes
	HT = new HTNode[m+1];
	for(int i = 0;i <= m;++i)  // give every field of every slot a defined value
	{
		HT[i].data = '\0';
		HT[i].weight = 0;
		HT[i].parent = 0;
		HT[i].lchild = 0;
		HT[i].rchild = 0;
	}

	for(int i = 1;i <= length;++i)  // leaves occupy slots 1..length
	{
		HT[i].data = cntarray.count[i-1].ch;
		HT[i].weight = cntarray.count[i-1].cnt;
	}

	for(int i = length + 1;i <= m;++i)  // internal nodes occupy the remaining slots
	{
		select(HT,i-1,&s1,&s2);  // two lightest parentless nodes built so far
		HT[s1].parent = i;
		HT[s2].parent = i;       // a non-zero parent removes them from later selection
		HT[i].lchild = s1;
		HT[i].rchild = s2;
		HT[i].weight = HT[s1].weight + HT[s2].weight;
	}
}

// Derives the code of every leaf HT[1..length] by walking from the leaf up
// to the root: a step from a left child contributes '0', any other step '1'.
//
// HC: receives a new[]-allocated table of length+1 entries (slot 0 unused);
//     each HC[i].str is itself new[]-allocated. Set to NULL when there is no
//     tree or length <= 0.
void CreateHuffmanCode(HuffmanTree HT,HuffmanCode &HC,int length)
{
	if(HT == NULL || length <= 0)  // with length == 0 the scratch buffer below would be indexed at -1
	{
		HC = NULL;
		return;
	}

	HC = new HCode[length+1];
	HC[0].data = '\0';  // slot 0 is unused; give it defined contents
	HC[0].str = NULL;

	char *cd = new char[length];  // scratch buffer, filled from the end backwards
	cd[length-1] = '\0';          // terminator so strcpy can be used below
	for(int i = 1;i <= length;++i)
	{
		int start = length-1;  // index where the code currently begins in cd
		int c = i;
		int f = HT[c].parent;
		while(f != 0)  // climb until the node without a parent is reached
		{
			--start;
			if(HT[f].lchild == c)
				cd[start] = '0';
			else
				cd[start] = '1';
			c = f;
			f = HT[c].parent;
		}
		HC[i].str = new char[length-start];  // code length plus terminator
		HC[i].data = HT[i].data;
		strcpy(HC[i].str,&cd[start]);
	}
	delete[] cd;
}

// 将读入的文件编码，写到txt文件 
void Encode(char *data,HuffmanCode HC,int length)
{
	ofstream outfile;
	outfile.open("code.txt");
	for(int i = 0;i < strlen(data);++i)  // 依次读入数据，查找对应的编码，写入编码文件 
	{
		for(int j = 1;j <= length;++j)
		{
			if(data[i] == HC[j].data)
			{
				outfile<<HC[j].str;
			}
		}
	}
	outfile.close();
	cout<<"the code txt has been written"<<endl;
	cout<<endl;
}

// Reads the first line of "code.txt" and decodes it into "out.txt" by
// walking the tree: '0' moves to the left child, '1' to the right child.
// Whenever a node without children is reached, its character is written
// and the walk restarts at the root HT[2*length-1].
//
// HT:     tree array; nothing is decoded when it is NULL.
// length: number of leaves; nothing is decoded when it is <= 0.
void Decode(HuffmanTree HT,int length)
{
	ofstream outfile;
	outfile.open("out.txt");

	if(HT != NULL && length > 0)
	{
		const int capacity = MaxSize*length;
		char* codetxt = new char[capacity];
		codetxt[0] = '\0';  // stays an empty string if nothing can be read

		ifstream infile;
		infile.open("code.txt");
		infile.getline(codetxt,capacity);
		infile.close();

		const int top = 2*length-1;  // index of the root
		int root = top;
		const size_t total = strlen(codetxt);  // computed once, not per iteration
		for(size_t i = 0;i < total;++i)
		{
			if(codetxt[i] == '0') root = HT[root].lchild;
			else if(codetxt[i] == '1') root = HT[root].rchild;
			if(HT[root].lchild == 0 && HT[root].rchild == 0)  // reached a leaf
			{
				outfile<<HT[root].data;
				root = top;
			}
		}
		delete[] codetxt;  // the original never released this buffer
	}

	outfile.close();
	cout<<"the output txt has been written"<<endl;
	cout<<endl;
}

int main(int argc, char** argv){
	char data[MaxSize]; 
	NumCount Cntarray;
	ReadData(data);  // 读入数据 
	WordCount(data,&Cntarray);  // 统计次数 
	Show(&Cntarray); //可以查看每个单词出现的对应次数 
	HuffmanTree tree;
	CreateHuffmanTree(tree,Cntarray.length,Cntarray);  // 建树 
	HuffmanCode code; 
	CreateHuffmanCode(tree,code,Cntarray.length);  // 创建编码 
	Encode(data,code,Cntarray.length);  // 生成编码文件 
	Decode(tree,Cntarray.length);  // 解码 
	cout<<"Please view the generated TXT file to check the result"<<endl; 
	return 0;
}

运行结果：

Reading...
the input file is:
Many people do not work hard, do not work, and do nothing all day long

the length is 18
The character M  appears  1
The character a  appears  5
The character n  appears  7
The character y  appears  2
The character    appears  14
The character p  appears  2
The character e  appears  2
The character o  appears  10
The character l  appears  4
The character d  appears  6
The character t  appears  3
The character w  appears  2
The character r  appears  3
The character k  appears  2
The character h  appears  2
The character ,  appears  2
The character i  appears  1
The character g  appears  2

the code txt has been written

the output txt has been written

Please view the generated TXT file to check the result

谁说剪刀打不过石头

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
哈夫曼树及编码

在认识哈夫曼树之前，必须先了解以下几个基本术语：1、什么是路径？在一棵树中，从一个结点往下可以达到的结点之间的通路，称为路径。2、什么是路径长度？某一路径所经过的“边”的数量，称为该路径的路径长度。3、什么是结点的带权路径长度？若将树中结点赋给一个带有某种含义的数值，则该数值称为该结点的权。从根结点到该结点之间的路径长度与该结点的权的乘积，称为该结点的带权路径长度。4、什么是树的带权路径长度？树的带权路径长度规定为所有叶子结点的带权路径长度之和，记为WPL。5、什么是哈夫
复制链接

扫一扫