c++实现基于哈夫曼编码的文本(中英混合)压缩和解压缩

会修bug的猫

已于 2023-04-22 16:32:23 修改

阅读量4.9k

点赞数 39

分类专栏：数据结构文章标签： c++ 数据结构

于 2022-12-20 17:54:37 首次发布

本文链接：https://blog.csdn.net/yyl1025/article/details/128306080

版权

数据结构专栏收录该内容

1 篇文章 0 订阅

订阅专栏

4.根据哈夫曼树得到每个字符(包括汉字)的哈夫曼编码

一、什么是哈夫曼编码？

哈夫曼编码（Huffman Coding），又称霍夫曼编码，是一种可变长编码（VLC, variable length coding）方式。该方法完全依据字符出现的概率来构造异字头（任何一个码字都不是另一个码字的前缀）的平均长度最短的码字，有时称之为最佳编码。

二、utf-8编码

UTF-8 是 Unicode 的实现方式之一，但它并不是唯一的实现方式，也不等同于 Unicode。除了 UTF-8，还有 UTF-16 和 UTF-32，只是在文本文件存储和网络传输中用得较少。

UTF-8 的特点是对不同范围的字符使用不同长度的编码：它可以使用 1~4 个字节表示一个符号，字节长度根据符号的不同而变化。

其编码规则很简单：对于单字节的符号，字节的第一位设为 0，后面 7 位为这个符号的 Unicode 码，因此对于英语字母，UTF-8 编码和 ASCII 码是相同的。对于 n 字节的符号（n > 1），其第一个字节的前 n 位都设为 1，第 n+1 位设为 0，后面字节的前两位一律设为 10，剩下没有提及的二进制位全部为这个符号的 Unicode 码。

三、压缩流程

1.哈夫曼节点及汉字判断

在utf-8中一个英文字符占一个字节，用一个char表示足矣，但对中文而言，汉字占了三个字节，所以在这里我采用c++string类来统一表示中英文。

// One node of the Huffman tree; nodes live in a vector and refer to each other by index.
struct huffmannode {
	string ch;          // symbol stored in a leaf: one ASCII byte or one multi-byte UTF-8 character
	int weight = 0;     // occurrence count (for merged nodes: sum of both children)
	int father = 0;     // index of the parent node; 0 is used as "no parent yet"
	int lc = 0;         // index of the left child (stays 0 on leaves)
	int rc = 0;         // index of the right child (stays 0 on leaves)
};
using huffmanNode = huffmannode;	// second spelling of the same type used by the code

在读取文件之前我们要先判断所读取的字符是汉字还是英文，由上面的utf-8编码规则我们可以知道汉字的第一个字节的编码一定是 1110 xxxx 而英文字符的编码一定是 0xxx xxxx，所以我们只要判断字节的第一位是不是1就行。

// Returns true when `c` is a single-byte (ASCII) UTF-8 code unit, i.e. its top bit is 0.
// A byte with the top bit set is treated by the callers as the first byte of a multi-byte
// character (the text is assumed to contain only ASCII and Chinese characters).
bool judgeEng(unsigned char c)
{
	return (c & 0x80) == 0;
}

2.获取字符权重

在判断一个字符是否为汉字后，如果是汉字，就接着再读两个字节，这三个连着的字节就是一个汉字。按此方法将文本遍历到结尾，统计出每个字符的出现次数。（为方便，在这里我用了map）

// Reads the file to compress and counts how often every symbol occurs.
//
// fin          caller-owned stream; opened on `fileName` here and closed before returning
// fileName     path of the text file to scan
// _weightMap   receives symbol -> occurrence count (existing entries are incremented)
//
// A symbol is one byte when judgeEng accepts it, otherwise that byte plus the next two.
// If the file cannot be opened an error is printed and the map is left untouched.
void getWeightMap(ifstream& fin, const char* fileName,map<string, int>& _weightMap)
{
	fin.open(fileName, ios::in);
	if (!fin.is_open())
	{
		cout << "错误" << endl;
		return;
	}

	while (!fin.eof())
	{
		unsigned char lead = fin.get();
		if (fin.eof())
			break;	// the read above hit end-of-file, there is no symbol to count

		string symbol(1, lead);
		if (!judgeEng(lead))
		{
			// multi-byte character: append the two bytes that follow the lead byte
			for (int extra = 0; extra < 2; ++extra)
			{
				unsigned char next = fin.get();
				symbol += next;
			}
		}
		++_weightMap[symbol];
	}
	fin.close();
}

3.根据权重构造哈夫曼树

//构建哈夫曼树
void getHuffmanTree(map<string, int>& _weightMap, vector<huffmanNode>& _huffmanTree)
{
	int n = _weightMap.size();
	for (auto it : _weightMap) {
		huffmannode t;
		t.ch = it.first;
		t.weight = it.second;
		_huffmanTree.push_back(t);
	}
	int s1 = 0, s2 = 0;
	for (int i = n; i < 2 * n - 1; ++i) {
		huffmannode t;
		selectTwo(_huffmanTree, i, s1, s2);
		_huffmanTree.at(s1).father = i;
		_huffmanTree.at(s2).father = i;
		t.lc = s1;
		t.rc = s2;
		t.weight = _huffmanTree.at(s1).weight + _huffmanTree.at(s2).weight;
		_huffmanTree.push_back(t);
	}
}

4.根据哈夫曼树得到每个字符(包括汉字)的哈夫曼编码

//根据哈夫曼树得到每个字符的哈夫曼编码
void getPassworld(vector<huffmannode>& _huffmanTree, map<string, string>& _passworldMap)
{
	int n = (_huffmanTree.size() + 1) / 2;
	for (int i = 0; i < n; ++i) {
		string passworld = "";
		int t = i;
		int fa = _huffmanTree.at(i).father;
		while (fa) {
			if (_huffmanTree.at(fa).lc == t)
				passworld = '0' + passworld;
			else if (_huffmanTree.at(fa).rc == t)
				passworld = '1' + passworld;
			t = fa;
			fa = _huffmanTree.at(t).father;
		}
		_passworldMap.emplace(huffmanTree.at(i).ch, passworld);
	}
}

5.压缩

得到哈夫曼编码后，原文的每个字符（包括中文）都可以翻译成对应的不定长二进制字符串，然后将原文所翻译得到的2进制字符串以8位为一个uchar再重新输入到另一个文件中，即可得到压缩后的文件。

// Compresses the text file `fileName` into `compressFileName` using Huffman coding.
//
// Steps: count symbol frequencies, build the tree and the code table (kept in the globals
// weightMap / huffmanTree / passworldMap), read the input a second time and translate
// every symbol into its code, pad the bit string with '0' to a multiple of 8, then write
//   "<symbolCount> <paddingBits> " + one "<symbol> <count> " per symbol + the packed bits.
//
// fin / fout          caller-owned streams; opened and closed here
// fileName            path of the text to compress
// compressFileName    path of the compressed file to create
//
// If the input or the output cannot be opened an error is printed and nothing is written.
void compressFile(ifstream& fin, const char* fileName, ofstream& fout, const char* compressFileName)
{
	// Results of a previous run must not leak into this one
	weightMap.clear();
	huffmanTree.clear();
	passworldMap.clear();
	getWeightMap(fin, fileName, weightMap);
	getHuffmanTree(weightMap, huffmanTree);
	getPassworld(huffmanTree, passworldMap);

	fin.open(fileName, ios::in);
	if (!fin.is_open())
	{
		// Without this check the read loop would never end: a stream that failed to open
		// reports failure on every read but never reports end-of-file.
		cout << "错误" << endl;
		return;
	}

	// Translate the text symbol by symbol into one long string of '0' / '1'
	string binary;
	for (;;)
	{
		unsigned char c = fin.get();
		if (!fin)
			break;	// end of file or read error
		string s;
		s += c;
		if (!judgeEng(c))
		{
			// multi-byte character: the lead byte is followed by two more bytes
			c = fin.get();
			s += c;
			c = fin.get();
			s += c;
		}
		// find() instead of operator[] so an unknown symbol cannot grow the code table
		const auto code = passworldMap.find(s);
		if (code != passworldMap.end())
			binary += code->second;
	}
	fin.close();

	// Pad with '0' so the bit string splits into whole bytes
	int add0Num = binary.size() % 8;
	if (add0Num)
		add0Num = 8 - add0Num;
	binary.append(add0Num, '0');

	fout.open(compressFileName, ios::out | ios::binary);
	if (!fout.is_open())
	{
		cout << "错误" << endl;
		return;
	}

	// Header: everything the decompressor needs to rebuild the tree and drop the padding
	fout << weightMap.size() << " " << add0Num << " ";
	for (const auto& entry : weightMap)
	{
		fout << entry.first << " " << entry.second << " ";
	}

	// Payload: every group of 8 bit characters becomes one byte
	for (size_t i = 0; i < binary.size(); i += 8)
	{
		const unsigned char packed = binaryStringToChar(binary.substr(i, 8));
		fout << packed;
	}
	fout.close();
}

//将得到的二进制字符串每8位转为一个uchar类型写入压缩文件
	for (int i = 0; i < binary.size(); i += 8)
	{
		std::string k = binary.substr(i, 8);
		c = binaryStringToChar(k);
		fout << c;
	}

在这个过程中我们需要解决两个问题：

1.原文翻译后的二进制字符串长度不是8的倍数

这个问题很简单，不足8位在末尾补0或者1都行。

//在末尾补0 
int add0Num = binary.size() % 8;
	if (add0Num)
		add0Num = 8 - add0Num;
	for (int i = 0; i < add0Num; ++i, binary += '0');

2.压缩文件如何解压

因为需要对压缩文件解压，所以我们在压缩时还需要加入一些辅助信息，比如哈夫曼树的权重图和上个问题中末尾补0或1的个数都是我们解压时需要的。所以在压缩时这些信息也需要一同放入压缩文件中。

// Write the auxiliary information the decompressor needs in front of the packed bits
	fout << weightMap.size() << " " << add0Num << " ";	// number of distinct symbols, then number of padding bits
	for (const auto& entry : weightMap)	// followed by the whole frequency table
	{
		fout << entry.first << " ";
		fout << entry.second << " ";
	}

四、解压缩

解压首先要将压缩文件中的辅助信息读出，并利用辅助信息还原哈夫曼树，然后根据哈夫曼树将压缩文件中的二进制字符串还原为原文。

//解压文件
void decompressFile(ifstream& fin, const char* compressFileName, ofstream& fout, const char* decompressfileName)
{
	fin.open(compressFileName, ios::in | ios::binary);

	if (!fin.is_open())
		cout << "错误";

	unsigned char c;
	int size, add0;
	map<string, int>_weightmap;
	fin >> size >> add0;	//读取辅助信息中补充0的个数
	fin.get();
	for (int i = 0; i < size; ++i)	//读取辅助信息中每个字符及其权重，得到原文件的权重图
	{
		string s = "";
		int weight = 0;
		c = fin.get();
		if (judgeEng(c))
			s += c;
		else
		{
			s += c;
			c = fin.get();
			s += c;
			c = fin.get();
			s += c;
		}
		c = fin.get();
		c = fin.get();
		for (; c != ' '; c = fin.get())
		{
			weight = weight * 10 + c - '0';
		}
		_weightmap.emplace(s, weight);
	}

	//通过权重图还原哈夫曼树
	vector<huffmannode>_huffmantree;
	getHuffmanTree(_weightmap, _huffmantree);

	string binary = "";
	while (!fin.eof())	//将压缩文件中的uchar转换成 2进制字符串(即原文件中的哈夫曼编码)
	{
		c = fin.get();
		if (fin.eof())
			break;
		binary += ucharToBinaryString(c);
	}
	fin.close();

	for (int i = 0; i < add0; binary.pop_back(), ++i);

	//根据哈夫曼树将压缩文件的内容解压
	fout.open(decompressfileName, ios::out);
	int n = binary.size();
	int len = _huffmantree.size();
	int fa = len - 1;
	for (int k = 0; k < n; ++k) {
		if (binary.at(k) == '0')
			fa = _huffmantree.at(fa).lc;
		else if (binary.at(k) == '1')
			fa = _huffmantree.at(fa).rc;

		if (_huffmantree.at(fa).lc == _huffmantree.at(fa).rc)
		{
			fout << _huffmantree.at(fa).ch;
			fa = len - 1;
		}
	}
	fout.close();
}

五、效果展示

原文

压缩

解压后

可以看到解压后的文件和原文一致

六、完整代码

#include<string>
#include<fstream>
#include <iostream>
#include<vector>
#include<map>
using namespace std;
// One node of the Huffman tree; nodes live in a vector and refer to each other by index.
struct huffmannode {
	string ch;          // symbol stored in a leaf: one ASCII byte or one multi-byte UTF-8 character
	int weight = 0;     // occurrence count (for merged nodes: sum of both children)
	int father = 0;     // index of the parent node; 0 is used as "no parent yet"
	int lc = 0;         // index of the left child (stays 0 on leaves)
	int rc = 0;         // index of the right child (stays 0 on leaves)
};
using huffmanNode = huffmannode;	// second spelling of the same type used by the code

map<string, int>weightMap;// frequency table: symbol -> number of occurrences in the input
vector<huffmanNode>huffmanTree;// Huffman tree, stored as a vector of nodes linked by index
map<string,string>passworldMap;// code table: symbol -> its Huffman code as a string of '0'/'1'

// Returns true when `c` is a single-byte (ASCII) UTF-8 code unit, i.e. its top bit is 0.
// A byte with the top bit set is treated by the callers as the first byte of a multi-byte
// character (the text is assumed to contain only ASCII and Chinese characters).
bool judgeEng(unsigned char c)
{
	return (c & 0x80) == 0;
}

// Reads the file to compress and counts how often every symbol occurs.
//
// fin          caller-owned stream; opened on `fileName` here and closed before returning
// fileName     path of the text file to scan
// _weightMap   receives symbol -> occurrence count (existing entries are incremented)
//
// A symbol is one byte when judgeEng accepts it, otherwise that byte plus the next two.
// If the file cannot be opened an error is printed and the map is left untouched.
void getWeightMap(ifstream& fin, const char* fileName,map<string, int>& _weightMap)
{
	fin.open(fileName, ios::in);
	if (!fin.is_open())
	{
		cout << "错误" << endl;
		return;
	}

	while (!fin.eof())
	{
		unsigned char lead = fin.get();
		if (fin.eof())
			break;	// the read above hit end-of-file, there is no symbol to count

		string symbol(1, lead);
		if (!judgeEng(lead))
		{
			// multi-byte character: append the two bytes that follow the lead byte
			for (int extra = 0; extra < 2; ++extra)
			{
				unsigned char next = fin.get();
				symbol += next;
			}
		}
		++_weightMap[symbol];
	}
	fin.close();
}

//选择两个最小的权重
void selectTwo(vector<huffmannode>& _huffmanTree, int n, int& s1, int& s2)
{
	int min = 200000000;
	for (int i = 0; i < n; ++i) {
		if (_huffmanTree.at(i).father == 0) {
			if (_huffmanTree.at(i).weight < min) {
				min = _huffmanTree.at(i).weight;
				s1 = i;
			}
		}
	}
	min = 200000000;
	for (int i = 0; i < n; ++i) {
		if (_huffmanTree.at(i).father == 0) {
			if (_huffmanTree.at(i).weight < min && i != s1) {
				min = _huffmanTree.at(i).weight;
				s2 = i;
			}
		}
	}
}

//构建哈夫曼树
void getHuffmanTree(map<string, int>& _weightMap, vector<huffmanNode>& _huffmanTree)
{
	int n = _weightMap.size();
	for (auto it : _weightMap) {
		huffmannode t;
		t.ch = it.first;
		t.weight = it.second;
		_huffmanTree.push_back(t);
	}
	int s1 = 0, s2 = 0;
	for (int i = n; i < 2 * n - 1; ++i) {
		huffmannode t;
		selectTwo(_huffmanTree, i, s1, s2);
		_huffmanTree.at(s1).father = i;
		_huffmanTree.at(s2).father = i;
		t.lc = s1;
		t.rc = s2;
		t.weight = _huffmanTree.at(s1).weight + _huffmanTree.at(s2).weight;
		_huffmanTree.push_back(t);
	}
}

//根据哈夫曼树得到每个字符的哈夫曼编码
void getPassworld(vector<huffmannode>& _huffmanTree, map<string, string>& _passworldMap)
{
	int n = (_huffmanTree.size() + 1) / 2;
	for (int i = 0; i < n; ++i) {
		string passworld = "";
		int t = i;
		int fa = _huffmanTree.at(i).father;
		while (fa) {
			if (_huffmanTree.at(fa).lc == t)
				passworld = '0' + passworld;
			else if (_huffmanTree.at(fa).rc == t)
				passworld = '1' + passworld;
			t = fa;
			fa = _huffmanTree.at(t).father;
		}
		_passworldMap.emplace(huffmanTree.at(i).ch, passworld);
	}
}

// Converts a string of '0' / '1' characters (most significant bit first) into one byte.
//
// binarystring   up to 8 bit characters; any character other than '1' counts as a 0 bit
// returns        the value of the bits; an empty string gives 0
//
// Integer shifts are used instead of pow(): the result is exact and no floating-point
// rounding or <cmath> is involved.
unsigned char binaryStringToChar(string binarystring)
{
	unsigned int value = 0;
	for (char bit : binarystring)
	{
		value = (value << 1) | (bit == '1' ? 1u : 0u);
	}
	return static_cast<unsigned char>(value);
}

// Compresses the text file `fileName` into `compressFileName` using Huffman coding.
//
// Steps: count symbol frequencies, build the tree and the code table (kept in the globals
// weightMap / huffmanTree / passworldMap), read the input a second time and translate
// every symbol into its code, pad the bit string with '0' to a multiple of 8, then write
//   "<symbolCount> <paddingBits> " + one "<symbol> <count> " per symbol + the packed bits.
//
// fin / fout          caller-owned streams; opened and closed here
// fileName            path of the text to compress
// compressFileName    path of the compressed file to create
//
// If the input or the output cannot be opened an error is printed and nothing is written.
void compressFile(ifstream& fin, const char* fileName, ofstream& fout, const char* compressFileName)
{
	// Results of a previous run must not leak into this one
	weightMap.clear();
	huffmanTree.clear();
	passworldMap.clear();
	getWeightMap(fin, fileName, weightMap);
	getHuffmanTree(weightMap, huffmanTree);
	getPassworld(huffmanTree, passworldMap);

	fin.open(fileName, ios::in);
	if (!fin.is_open())
	{
		// Without this check the read loop would never end: a stream that failed to open
		// reports failure on every read but never reports end-of-file.
		cout << "错误" << endl;
		return;
	}

	// Translate the text symbol by symbol into one long string of '0' / '1'
	string binary;
	for (;;)
	{
		unsigned char c = fin.get();
		if (!fin)
			break;	// end of file or read error
		string s;
		s += c;
		if (!judgeEng(c))
		{
			// multi-byte character: the lead byte is followed by two more bytes
			c = fin.get();
			s += c;
			c = fin.get();
			s += c;
		}
		// find() instead of operator[] so an unknown symbol cannot grow the code table
		const auto code = passworldMap.find(s);
		if (code != passworldMap.end())
			binary += code->second;
	}
	fin.close();

	// Pad with '0' so the bit string splits into whole bytes
	int add0Num = binary.size() % 8;
	if (add0Num)
		add0Num = 8 - add0Num;
	binary.append(add0Num, '0');

	fout.open(compressFileName, ios::out | ios::binary);
	if (!fout.is_open())
	{
		cout << "错误" << endl;
		return;
	}

	// Header: everything the decompressor needs to rebuild the tree and drop the padding
	fout << weightMap.size() << " " << add0Num << " ";
	for (const auto& entry : weightMap)
	{
		fout << entry.first << " " << entry.second << " ";
	}

	// Payload: every group of 8 bit characters becomes one byte
	for (size_t i = 0; i < binary.size(); i += 8)
	{
		const unsigned char packed = binaryStringToChar(binary.substr(i, 8));
		fout << packed;
	}
	fout.close();
}

// Expands one byte into its 8-character '0' / '1' representation, most significant bit first.
string ucharToBinaryString(unsigned char value)
{
	string bits(8, '0');
	for (int pos = 0; pos < 8; ++pos)
	{
		// pos 0 is the leftmost character and corresponds to bit 7
		if ((value >> (7 - pos)) & 1)
			bits[pos] = '1';
	}
	return bits;
}

//解压文件
void decompressFile(ifstream& fin, const char* compressFileName, ofstream& fout, const char* decompressfileName)
{
	fin.open(compressFileName, ios::in | ios::binary);

	if (!fin.is_open())
		cout << "错误";

	unsigned char c;
	int size, add0;
	map<string, int>_weightmap;
	fin >> size >> add0;	//读取辅助信息中补充0的个数
	fin.get();
	for (int i = 0; i < size; ++i)	//读取辅助信息中每个字符及其权重，得到原文件的权重图
	{
		string s = "";
		int weight = 0;
		c = fin.get();
		if (judgeEng(c))
			s += c;
		else
		{
			s += c;
			c = fin.get();
			s += c;
			c = fin.get();
			s += c;
		}
		c = fin.get();
		c = fin.get();
		for (; c != ' '; c = fin.get())
		{
			weight = weight * 10 + c - '0';
		}
		_weightmap.emplace(s, weight);
	}

	//通过权重图还原哈夫曼树
	vector<huffmannode>_huffmantree;
	getHuffmanTree(_weightmap, _huffmantree);

	string binary = "";
	while (!fin.eof())	//将压缩文件中的uchar转换成 2进制字符串(即原文件中的哈夫曼编码)
	{
		c = fin.get();
		if (fin.eof())
			break;
		binary += ucharToBinaryString(c);
	}
	fin.close();

	for (int i = 0; i < add0; binary.pop_back(), ++i);

	//根据哈夫曼树将压缩文件的内容解压
	fout.open(decompressfileName, ios::out);
	int n = binary.size();
	int len = _huffmantree.size();
	int fa = len - 1;
	for (int k = 0; k < n; ++k) {
		if (binary.at(k) == '0')
			fa = _huffmantree.at(fa).lc;
		else if (binary.at(k) == '1')
			fa = _huffmantree.at(fa).rc;

		if (_huffmantree.at(fa).lc == _huffmantree.at(fa).rc)
		{
			fout << _huffmantree.at(fa).ch;
			fa = len - 1;
		}
	}
	fout.close();
}

int main()
{
	ifstream fin;
	ofstream fout;
	char filename[] = "D:\\Desktop\\src.txt";//需要压缩的文件路径
	char compressfilename[] = "D:\\Desktop\\src.huffmanzip";//压缩后的文件路径
	char decompressfilename[] = "D:\\Desktop\\decompress_src.txt";//解压缩后的文件路径
	compressFile(fin, filename, fout, compressfilename);
	decompressFile(fin, compressfilename, fout, decompressfilename);
	return 0;
}