目录
一、什么是哈夫曼编码?
哈夫曼编码(Huffman Coding),又称霍夫曼编码,是一种可变长编码(VLC, variable length coding)方式。该方法完全依据字符出现的概率来构造异字头的、平均长度最短的码字,有时也称之为最佳编码。
二、utf-8编码
UTF-8是Unicode的实现方式之一,并不是唯一的实现方式,也不等同于Unicode。除了UTF-8,还有UTF-16和UTF-32,只是很少被使用。
UTF-8的特点是对不同范围的字符使用不同长度的编码:它可以使用1~4个字节表示一个符号,根据不同的符号而变化字节长度。
其编码规则很简单:对于单字节的符号,字节的第一位设为0,后面7位为这个符号的Unicode码,因此对于英语字母,UTF-8编码和ASCII码是相同的。对于n字节的符号(n>1),其第一个字节的前n位都设为1,第n+1位设为0,后面字节的前两位一律设为10,剩下的没有提及的二进制位全部为这个符号的Unicode码。
三、压缩流程
1.哈夫曼节点及汉字判断
在utf-8中一个英文字符占一个字节,用一个char表示足矣,但对中文而言,汉字占了三个字节,所以在这里我采用c++string类来统一表示中英文。
// One node of the Huffman tree. Nodes live in a vector and refer to each
// other by index. The rest of the program treats father == 0 as "not yet
// attached to a parent" and lc == rc as "this node is a leaf".
struct huffmannode {
    std::string ch;  // symbol stored in a leaf: one ASCII byte or one multi-byte character
    int weight = 0;  // occurrence count (for internal nodes: sum of both children)
    int father = 0;  // index of the parent node
    int lc = 0;      // index of the left child
    int rc = 0;      // index of the right child
};
using huffmanNode = huffmannode;
在读取文件之前我们要先判断所读取的字符是汉字还是英文,由上面的utf-8编码规则我们可以知道汉字的第一个字节的编码一定是 1110 xxxx 而英文字符的编码一定是 0xxx xxxx,所以我们只要判断字节的第一位是不是1就行。
// Tells whether byte `c` is a single-byte (ASCII) character.
// Returns true when the high bit is clear and false otherwise, in which case
// callers treat `c` as the first byte of a three-byte character (the text is
// assumed to contain only English and Chinese characters).
bool judgeEng(unsigned char c)
{
    return (c & 0x80) == 0;
}
2.获取字符权重
在判断一个字符是否为汉字后,如果是汉字,就接着再读两个字节,这三个连着的字节就是一个汉字。按此方法将文本遍历到结尾,统计出每个字符的出现次数。(为方便,在这里我用了map)
// Reads the file to be compressed and counts how often each symbol occurs.
//   fin        - stream used for reading; opened and closed by this function
//   fileName   - path of the file to read
//   _weightMap - receives symbol -> occurrence count (counts are added to
//                whatever the map already holds)
// A symbol is one byte when judgeEng() accepts it, otherwise that byte plus
// the two bytes that follow it. If the file cannot be opened an error is
// printed and the map is left untouched.
void getWeightMap(std::ifstream& fin, const char* fileName, std::map<std::string, int>& _weightMap)
{
    fin.open(fileName, std::ios::in);
    if (!fin.is_open())
    {
        std::cout << "错误" << std::endl;
        return;
    }

    for (;;)
    {
        const unsigned char lead = fin.get();
        if (fin.eof())
            break;  // the read above hit end of file, so `lead` is not data

        std::string symbol(1, lead);
        if (!judgeEng(lead))
        {
            // Non-ASCII lead byte: pull in the two trailing bytes as well.
            for (int extra = 0; extra < 2; ++extra)
            {
                const unsigned char tail = fin.get();
                symbol += tail;
            }
        }
        ++_weightMap[symbol];
    }
    fin.close();
}
3.根据权重构造哈夫曼树
//构建哈夫曼树
void getHuffmanTree(map<string, int>& _weightMap, vector<huffmanNode>& _huffmanTree)
{
int n = _weightMap.size();
for (auto it : _weightMap) {
huffmannode t;
t.ch = it.first;
t.weight = it.second;
_huffmanTree.push_back(t);
}
int s1 = 0, s2 = 0;
for (int i = n; i < 2 * n - 1; ++i) {
huffmannode t;
selectTwo(_huffmanTree, i, s1, s2);
_huffmanTree.at(s1).father = i;
_huffmanTree.at(s2).father = i;
t.lc = s1;
t.rc = s2;
t.weight = _huffmanTree.at(s1).weight + _huffmanTree.at(s2).weight;
_huffmanTree.push_back(t);
}
}
4.根据哈夫曼树得到每个字符(包括汉字)的哈夫曼编码
// Derives the Huffman code of every leaf from the finished tree.
//   _huffmanTree  - tree produced by getHuffmanTree(); the first
//                   (size + 1) / 2 entries are the leaves
//   _passworldMap - receives symbol -> code, a string of '0'/'1' characters
// Each code is built by walking from the leaf up to the root, prepending '0'
// for a left branch and '1' for a right branch.
void getPassworld(std::vector<huffmannode>& _huffmanTree, std::map<std::string, std::string>& _passworldMap)
{
    const int leafCount = (_huffmanTree.size() + 1) / 2;
    for (int i = 0; i < leafCount; ++i) {
        std::string passworld;
        int child = i;
        int parent = _huffmanTree.at(i).father;
        while (parent) {  // father == 0 marks the root
            if (_huffmanTree.at(parent).lc == child)
                passworld = '0' + passworld;
            else if (_huffmanTree.at(parent).rc == child)
                passworld = '1' + passworld;
            child = parent;
            parent = _huffmanTree.at(child).father;
        }
        // A tree made of a single leaf has no edges, which would give that
        // symbol an empty code and make the compressed payload empty. Give it
        // the one-bit code "0": the decoder follows lc (0) from the root and
        // lands on the same leaf, so the round trip works.
        if (passworld.empty())
            passworld = "0";
        // Read the symbol from the tree that was passed in, not from the global.
        _passworldMap.emplace(_huffmanTree.at(i).ch, passworld);
    }
}
5.压缩
得到哈夫曼编码后,原文的每个字符(包括中文)都可以翻译成对应的不定长二进制字符串,然后将原文所翻译得到的2进制字符串以8位为一个uchar再重新输入到另一个文件中,即可得到压缩后的文件。
// Compresses `fileName` into `compressFileName`.
//   fin  - input stream, (re)opened and closed here and by getWeightMap()
//   fout - output stream, opened and closed here
// Layout of the compressed file:
//   "<symbol count> <padding bits> " followed by "<symbol> <weight> " for
//   every symbol, followed by the Huffman bit stream packed 8 bits per byte.
// The global weightMap, huffmanTree and passworldMap are rebuilt on every call.
// Nothing is written if either file cannot be opened.
void compressFile(std::ifstream& fin, const char* fileName, std::ofstream& fout, const char* compressFileName)
{
    weightMap.clear();
    huffmanTree.clear();
    passworldMap.clear();

    // Pass 1: symbol statistics, tree and code table.
    getWeightMap(fin, fileName, weightMap);
    getHuffmanTree(weightMap, huffmanTree);
    getPassworld(huffmanTree, passworldMap);

    // Pass 2: translate the text into a string of '0'/'1' characters.
    fin.open(fileName, std::ios::in);
    if (!fin.is_open())
    {
        // Without this check the read loop below would never see eofbit on
        // the unopened stream and would not terminate.
        std::cout << "错误" << std::endl;
        return;
    }
    std::string binary;
    for (;;)
    {
        const unsigned char lead = fin.get();
        if (!fin)
            break;  // end of file (or read error): `lead` is not data
        std::string symbol(1, lead);
        if (!judgeEng(lead))
        {
            // Same grouping as getWeightMap(): lead byte plus two more bytes.
            for (int extra = 0; extra < 2; ++extra)
            {
                const unsigned char tail = fin.get();
                symbol += tail;
            }
        }
        // find() instead of operator[] so unknown symbols do not grow the table.
        const auto code = passworldMap.find(symbol);
        if (code != passworldMap.end())
            binary += code->second;
    }
    fin.close();

    // Pad with '0' up to a whole number of bytes.
    int add0Num = binary.size() % 8;
    if (add0Num)
        add0Num = 8 - add0Num;
    binary.append(static_cast<std::size_t>(add0Num), '0');

    fout.open(compressFileName, std::ios::out | std::ios::binary);
    if (!fout.is_open())
    {
        std::cout << "错误" << std::endl;
        return;
    }

    // Header: everything the decoder needs to rebuild the tree and to drop
    // the padding again.
    fout << weightMap.size() << " " << add0Num << " ";
    for (const auto& entry : weightMap)
    {
        fout << entry.first << " " << entry.second << " ";
    }

    // Payload: every 8 characters of the bit string become one byte.
    for (std::size_t i = 0; i < binary.size(); i += 8)
    {
        const unsigned char packed = binaryStringToChar(binary.substr(i, 8));
        fout << packed;
    }
    fout.close();
}
// Excerpt from compressFile: pack the '0'/'1' string into bytes.
// Each group of 8 characters is converted to one unsigned char by
// binaryStringToChar() and written to the compressed file.
for (int i = 0; i < binary.size(); i += 8)
{
std::string k = binary.substr(i, 8);
c = binaryStringToChar(k);
fout << c;
}
在这个过程中我们需要解决两个问题:
1.原文翻译后的二进制字符串长度不是8的倍数
这个问题很简单,不足8位在末尾补0或者1都行。
// Excerpt from compressFile: pad the bit string with '0' at the end.
// add0Num ends up as the number of characters needed to make the length a
// multiple of 8 (0 when it already is one).
int add0Num = binary.size() % 8;
if (add0Num)
add0Num = 8 - add0Num;
// The loop body is empty; the append happens in the increment expression.
for (int i = 0; i < add0Num; ++i, binary += '0');
2.压缩文件如何解压
因为需要对压缩文件解压,所以我们在压缩时还需要加入一些辅助信息,比如哈夫曼树的权重图和上个问题中末尾补0或1的个数都是我们解压时需要的。所以在压缩时这些信息也需要一同放入压缩文件中。
// Excerpt from compressFile: write the auxiliary information (header) into
// the compressed file, every field followed by a single space.
fout << weightMap.size() << " "<<add0Num<<" ";// number of symbols and number of padding zeros
for (auto it : weightMap) // then the weight table: "<symbol> <weight> " per entry
{
fout << it.first << " " << it.second << " ";
}
四、解压缩
解压首先要将压缩文件的辅助信息读出,并利用辅助信息还原哈夫曼树,然后根据哈夫曼树将压缩文件中的二进制字符串还原为原文。
//解压文件
void decompressFile(ifstream& fin, const char* compressFileName, ofstream& fout, const char* decompressfileName)
{
fin.open(compressFileName, ios::in | ios::binary);
if (!fin.is_open())
cout << "错误";
unsigned char c;
int size, add0;
map<string, int>_weightmap;
fin >> size >> add0; //读取辅助信息中补充0的个数
fin.get();
for (int i = 0; i < size; ++i) //读取辅助信息中每个字符及其权重,得到原文件的权重图
{
string s = "";
int weight = 0;
c = fin.get();
if (judgeEng(c))
s += c;
else
{
s += c;
c = fin.get();
s += c;
c = fin.get();
s += c;
}
c = fin.get();
c = fin.get();
for (; c != ' '; c = fin.get())
{
weight = weight * 10 + c - '0';
}
_weightmap.emplace(s, weight);
}
//通过权重图还原哈夫曼树
vector<huffmannode>_huffmantree;
getHuffmanTree(_weightmap, _huffmantree);
string binary = "";
while (!fin.eof()) //将压缩文件中的uchar转换成 2进制字符串(即原文件中的哈夫曼编码)
{
c = fin.get();
if (fin.eof())
break;
binary += ucharToBinaryString(c);
}
fin.close();
for (int i = 0; i < add0; binary.pop_back(), ++i);
//根据哈夫曼树将压缩文件的内容解压
fout.open(decompressfileName, ios::out);
int n = binary.size();
int len = _huffmantree.size();
int fa = len - 1;
for (int k = 0; k < n; ++k) {
if (binary.at(k) == '0')
fa = _huffmantree.at(fa).lc;
else if (binary.at(k) == '1')
fa = _huffmantree.at(fa).rc;
if (_huffmantree.at(fa).lc == _huffmantree.at(fa).rc)
{
fout << _huffmantree.at(fa).ch;
fa = len - 1;
}
}
fout.close();
}
五、效果展示
原文
压缩
解压后
可以看到解压后的文件和原文一致
六、完整代码
#include<string>
#include<fstream>
#include <iostream>
#include<vector>
#include<map>
using namespace std;
// One node of the Huffman tree. Nodes live in a vector and refer to each
// other by index. The rest of the program treats father == 0 as "not yet
// attached to a parent" and lc == rc as "this node is a leaf".
struct huffmannode {
    std::string ch;  // symbol stored in a leaf: one ASCII byte or one multi-byte character
    int weight = 0;  // occurrence count (for internal nodes: sum of both children)
    int father = 0;  // index of the parent node
    int lc = 0;      // index of the left child
    int rc = 0;      // index of the right child
};
using huffmanNode = huffmannode;
// Program-wide state used by compressFile(), which clears and refills all
// three on every call.
map<string, int>weightMap;// weight table: symbol -> number of occurrences
vector<huffmanNode>huffmanTree;// Huffman tree stored as a flat node array
map<string,string>passworldMap;// code table: symbol -> its Huffman code as a '0'/'1' string
// Tells whether byte `c` is a single-byte (ASCII) character.
// Returns true when the high bit is clear and false otherwise, in which case
// callers treat `c` as the first byte of a three-byte character (the text is
// assumed to contain only English and Chinese characters).
bool judgeEng(unsigned char c)
{
    return (c & 0x80) == 0;
}
// Reads the file to be compressed and counts how often each symbol occurs.
//   fin        - stream used for reading; opened and closed by this function
//   fileName   - path of the file to read
//   _weightMap - receives symbol -> occurrence count (counts are added to
//                whatever the map already holds)
// A symbol is one byte when judgeEng() accepts it, otherwise that byte plus
// the two bytes that follow it. If the file cannot be opened an error is
// printed and the map is left untouched.
void getWeightMap(std::ifstream& fin, const char* fileName, std::map<std::string, int>& _weightMap)
{
    fin.open(fileName, std::ios::in);
    if (!fin.is_open())
    {
        std::cout << "错误" << std::endl;
        return;
    }

    for (;;)
    {
        const unsigned char lead = fin.get();
        if (fin.eof())
            break;  // the read above hit end of file, so `lead` is not data

        std::string symbol(1, lead);
        if (!judgeEng(lead))
        {
            // Non-ASCII lead byte: pull in the two trailing bytes as well.
            for (int extra = 0; extra < 2; ++extra)
            {
                const unsigned char tail = fin.get();
                symbol += tail;
            }
        }
        ++_weightMap[symbol];
    }
    fin.close();
}
//选择两个最小的权重
void selectTwo(vector<huffmannode>& _huffmanTree, int n, int& s1, int& s2)
{
int min = 200000000;
for (int i = 0; i < n; ++i) {
if (_huffmanTree.at(i).father == 0) {
if (_huffmanTree.at(i).weight < min) {
min = _huffmanTree.at(i).weight;
s1 = i;
}
}
}
min = 200000000;
for (int i = 0; i < n; ++i) {
if (_huffmanTree.at(i).father == 0) {
if (_huffmanTree.at(i).weight < min && i != s1) {
min = _huffmanTree.at(i).weight;
s2 = i;
}
}
}
}
//构建哈夫曼树
void getHuffmanTree(map<string, int>& _weightMap, vector<huffmanNode>& _huffmanTree)
{
int n = _weightMap.size();
for (auto it : _weightMap) {
huffmannode t;
t.ch = it.first;
t.weight = it.second;
_huffmanTree.push_back(t);
}
int s1 = 0, s2 = 0;
for (int i = n; i < 2 * n - 1; ++i) {
huffmannode t;
selectTwo(_huffmanTree, i, s1, s2);
_huffmanTree.at(s1).father = i;
_huffmanTree.at(s2).father = i;
t.lc = s1;
t.rc = s2;
t.weight = _huffmanTree.at(s1).weight + _huffmanTree.at(s2).weight;
_huffmanTree.push_back(t);
}
}
// Derives the Huffman code of every leaf from the finished tree.
//   _huffmanTree  - tree produced by getHuffmanTree(); the first
//                   (size + 1) / 2 entries are the leaves
//   _passworldMap - receives symbol -> code, a string of '0'/'1' characters
// Each code is built by walking from the leaf up to the root, prepending '0'
// for a left branch and '1' for a right branch.
void getPassworld(std::vector<huffmannode>& _huffmanTree, std::map<std::string, std::string>& _passworldMap)
{
    const int leafCount = (_huffmanTree.size() + 1) / 2;
    for (int i = 0; i < leafCount; ++i) {
        std::string passworld;
        int child = i;
        int parent = _huffmanTree.at(i).father;
        while (parent) {  // father == 0 marks the root
            if (_huffmanTree.at(parent).lc == child)
                passworld = '0' + passworld;
            else if (_huffmanTree.at(parent).rc == child)
                passworld = '1' + passworld;
            child = parent;
            parent = _huffmanTree.at(child).father;
        }
        // A tree made of a single leaf has no edges, which would give that
        // symbol an empty code and make the compressed payload empty. Give it
        // the one-bit code "0": the decoder follows lc (0) from the root and
        // lands on the same leaf, so the round trip works.
        if (passworld.empty())
            passworld = "0";
        // Read the symbol from the tree that was passed in, not from the global.
        _passworldMap.emplace(_huffmanTree.at(i).ch, passworld);
    }
}
// Converts a string of '0'/'1' characters (most significant bit first) into
// the byte it spells.
//   binarystring - normally 8 characters; any character other than '1'
//                  counts as a 0 bit
// Returns the value as an unsigned char; for inputs longer than 8 characters
// only the low 8 bits survive. Uses integer shifts, so no floating-point
// rounding is involved and <cmath> is not needed.
unsigned char binaryStringToChar(const std::string& binarystring)
{
    unsigned int value = 0;
    for (const char bit : binarystring)
    {
        value = (value << 1) | (bit == '1' ? 1u : 0u);
    }
    return static_cast<unsigned char>(value);
}
// Compresses `fileName` into `compressFileName`.
//   fin  - input stream, (re)opened and closed here and by getWeightMap()
//   fout - output stream, opened and closed here
// Layout of the compressed file:
//   "<symbol count> <padding bits> " followed by "<symbol> <weight> " for
//   every symbol, followed by the Huffman bit stream packed 8 bits per byte.
// The global weightMap, huffmanTree and passworldMap are rebuilt on every call.
// Nothing is written if either file cannot be opened.
void compressFile(std::ifstream& fin, const char* fileName, std::ofstream& fout, const char* compressFileName)
{
    weightMap.clear();
    huffmanTree.clear();
    passworldMap.clear();

    // Pass 1: symbol statistics, tree and code table.
    getWeightMap(fin, fileName, weightMap);
    getHuffmanTree(weightMap, huffmanTree);
    getPassworld(huffmanTree, passworldMap);

    // Pass 2: translate the text into a string of '0'/'1' characters.
    fin.open(fileName, std::ios::in);
    if (!fin.is_open())
    {
        // Without this check the read loop below would never see eofbit on
        // the unopened stream and would not terminate.
        std::cout << "错误" << std::endl;
        return;
    }
    std::string binary;
    for (;;)
    {
        const unsigned char lead = fin.get();
        if (!fin)
            break;  // end of file (or read error): `lead` is not data
        std::string symbol(1, lead);
        if (!judgeEng(lead))
        {
            // Same grouping as getWeightMap(): lead byte plus two more bytes.
            for (int extra = 0; extra < 2; ++extra)
            {
                const unsigned char tail = fin.get();
                symbol += tail;
            }
        }
        // find() instead of operator[] so unknown symbols do not grow the table.
        const auto code = passworldMap.find(symbol);
        if (code != passworldMap.end())
            binary += code->second;
    }
    fin.close();

    // Pad with '0' up to a whole number of bytes.
    int add0Num = binary.size() % 8;
    if (add0Num)
        add0Num = 8 - add0Num;
    binary.append(static_cast<std::size_t>(add0Num), '0');

    fout.open(compressFileName, std::ios::out | std::ios::binary);
    if (!fout.is_open())
    {
        std::cout << "错误" << std::endl;
        return;
    }

    // Header: everything the decoder needs to rebuild the tree and to drop
    // the padding again.
    fout << weightMap.size() << " " << add0Num << " ";
    for (const auto& entry : weightMap)
    {
        fout << entry.first << " " << entry.second << " ";
    }

    // Payload: every 8 characters of the bit string become one byte.
    for (std::size_t i = 0; i < binary.size(); i += 8)
    {
        const unsigned char packed = binaryStringToChar(binary.substr(i, 8));
        fout << packed;
    }
    fout.close();
}
// Spells a byte as 8 characters of '0'/'1', most significant bit first.
//   value - the byte to expand
// Returns the 8-character string, e.g. 0xA5 -> "10100101".
std::string ucharToBinaryString(unsigned char value)
{
    std::string bits(8, '0');
    for (int pos = 0; pos < 8; ++pos)
    {
        // Position 0 of the string holds bit 7 of the byte.
        if ((value >> (7 - pos)) & 1)
            bits[pos] = '1';
    }
    return bits;
}
//解压文件
void decompressFile(ifstream& fin, const char* compressFileName, ofstream& fout, const char* decompressfileName)
{
fin.open(compressFileName, ios::in | ios::binary);
if (!fin.is_open())
cout << "错误";
unsigned char c;
int size, add0;
map<string, int>_weightmap;
fin >> size >> add0; //读取辅助信息中补充0的个数
fin.get();
for (int i = 0; i < size; ++i) //读取辅助信息中每个字符及其权重,得到原文件的权重图
{
string s = "";
int weight = 0;
c = fin.get();
if (judgeEng(c))
s += c;
else
{
s += c;
c = fin.get();
s += c;
c = fin.get();
s += c;
}
c = fin.get();
c = fin.get();
for (; c != ' '; c = fin.get())
{
weight = weight * 10 + c - '0';
}
_weightmap.emplace(s, weight);
}
//通过权重图还原哈夫曼树
vector<huffmannode>_huffmantree;
getHuffmanTree(_weightmap, _huffmantree);
string binary = "";
while (!fin.eof()) //将压缩文件中的uchar转换成 2进制字符串(即原文件中的哈夫曼编码)
{
c = fin.get();
if (fin.eof())
break;
binary += ucharToBinaryString(c);
}
fin.close();
for (int i = 0; i < add0; binary.pop_back(), ++i);
//根据哈夫曼树将压缩文件的内容解压
fout.open(decompressfileName, ios::out);
int n = binary.size();
int len = _huffmantree.size();
int fa = len - 1;
for (int k = 0; k < n; ++k) {
if (binary.at(k) == '0')
fa = _huffmantree.at(fa).lc;
else if (binary.at(k) == '1')
fa = _huffmantree.at(fa).rc;
if (_huffmantree.at(fa).lc == _huffmantree.at(fa).rc)
{
fout << _huffmantree.at(fa).ch;
fa = len - 1;
}
}
fout.close();
}
int main()
{
ifstream fin;
ofstream fout;
char filename[] = "D:\\Desktop\\src.txt";//需要压缩的文件路径
char compressfilename[] = "D:\\Desktop\\src.huffmanzip";//压缩后的文件路径
char decompressfilename[] = "D:\\Desktop\\decompress_src.txt";//解压缩后的文件路径
compressFile(fin, filename, fout, compressfilename);
decompressFile(fin, compressfilename, fout, decompressfilename);
return 0;
}