Huffman树的概念
Huffman树是由n个带权叶子节点构成的所有二叉树中带权路径长度最短的二叉树。
节点的带权路径长度
树根到某一节点的路径长度与该节点的权的乘积。
树的带权路径长度
树的带权路径长度为树中从根节点到所有叶子节点的各个带权路径长度之和。
Huffman树的构造步骤:
- 初始化:将给定的节点都看作根节点,构成森林。
- 找最小树:在森林中选出两棵根节点的权值最小的二叉树作为一棵新树的左,右子树,且置新树的根节点的权值为其左,右子树上根节点的权值之和。(Huffman树并没有对左右权值的大小进行规定)在这里我们规定:左子树的权值小于右子树。
- 删除和加入:从森林中删除被选中的两棵二叉树,同时把新构成的二叉树加入到森林中。
- 判断:重复2,3操作,直到森林中只含有一棵二叉树。
下面画图说明:
文件压缩的步骤:
- 统计字符个数 (字符的个数对应权值)
- 生成Huffman树
- 生成Huffman编码
- 压缩
4.1:压缩之前写入字符及其出现的次数(结构体),方便解压缩时重建Huffman树。
4.2:进行压缩。
文件解压缩的步骤:
- 打开压缩文件
- 重建Huffman树
- 进行解压缩
压缩和解压缩过程中常见的问题:
- 文件在进行读写时都应该以二进制方式打开。文本模式下(例如在Windows上)换行符会被转换,读到0x1A(Ctrl+Z)字节时还可能被当作文件结束,使读写提前结束;二进制模式则按原样读写每一个字节。
- 为什么压缩之前要写入字符及其出现次数(结构体,次数>0)?因为压缩和解压缩是两个分开的操作,解压缩时无法使用压缩时构建的Huffman树,所以必须根据这些信息重建Huffman树。
- 写入字符及其出现次数(结构体)之后,还要写入一个出现次数为0的结构体,作为配置信息与压缩正文之间的分隔标记。
- 进行压缩时,如果最后剩余的编码位数不足8位、凑不满一个字节,仍然要把这个字节写出;由于value的初始值为0,未被用到的高位保持为0(这不是编译器自动补充的)。
- 进行解压缩时如何才能不将填充的0进行解压?定义一个变量filecount统计字符的个数。每解压一个字符进行filecount减一。
代码如下:
HuffmanTree.hpp
#pragma once
#include<iostream>
using namespace std;
#include<fstream>
#include<assert.h>
#include<queue>
#include<vector>
#include<string>
// A node of a binary Huffman tree. W is the weight type stored in the node.
template<class W>
struct HuffmanTreeNode
{
	HuffmanTreeNode<W>* _root;   // extra link; set to NULL here and never assigned by HuffmanTree
	HuffmanTreeNode<W>* _left;   // left child, NULL for a leaf
	HuffmanTreeNode<W>* _right;  // right child, NULL for a leaf
	W _w;                        // weight carried by this node

	// Creates a detached leaf holding a copy of w.
	HuffmanTreeNode(const W& w)
		:_root(NULL)
		,_left(NULL)
		,_right(NULL)
		,_w(w)
	{}
};

// Owns a Huffman tree built from an array of weights.
//
// W must provide operator!= (to filter out the "invalid" weight),
// a const operator> (to order the min-heap) and operator+ (to build the
// weight of a parent from its two children).
template<class W>
class HuffmanTree
{
	typedef HuffmanTreeNode<W> Node;
public:
	// Creates an empty tree; GetRoot() returns NULL.
	HuffmanTree()
		:_root(NULL)
	{}

	// Orders node pointers so that std::priority_queue behaves as a min-heap
	// on the node weights.
	struct NodeCompare
	{
		bool operator()(const Node* left, const Node* right) const
		{
			return left->_w > right->_w;
		}
	};

	// Builds the tree from the n weights in w, skipping every entry that
	// compares equal to `invalid` (i.e. for which `w[i] != invalid` is false).
	//
	// If no entry is valid the tree stays empty (GetRoot() == NULL); the
	// original code called top() on an empty queue in that case, which is
	// undefined behaviour. With exactly one valid entry the root is a leaf.
	HuffmanTree(W* w, size_t n, const W& invalid)
		:_root(NULL)
	{
		// Min-heap of the current forest roots.
		std::priority_queue<Node*, std::vector<Node*>, NodeCompare> minheap;
		for (size_t i = 0; i < n; ++i)
		{
			if (w[i] != invalid)
			{
				minheap.push(new Node(w[i]));
			}
		}

		// Repeatedly merge the two lightest trees; the lighter one becomes
		// the left child.
		while (minheap.size() > 1)
		{
			Node* left = minheap.top();
			minheap.pop();
			Node* right = minheap.top();
			minheap.pop();

			Node* parent = new Node(left->_w + right->_w);
			parent->_left = left;
			parent->_right = right;
			minheap.push(parent);
		}

		if (!minheap.empty())
		{
			_root = minheap.top();
		}
	}

	// Frees every node of the tree.
	~HuffmanTree()
	{
		Destory(_root);
		_root = NULL;
	}

	// Recursively deletes the subtree rooted at `root` (post-order).
	// NULL is accepted and ignored.
	void Destory(Node* root)
	{
		if (root == NULL)
		{
			return;
		}
		Destory(root->_left);
		Destory(root->_right);
		delete root;
	}

	// Returns the root node, or NULL for an empty tree. The tree keeps
	// ownership of the nodes.
	Node* GetRoot()
	{
		return _root;
	}

private:
	// Copying is disabled: the members are declared but intentionally left
	// undefined, so an accidental copy fails at compile or link time instead
	// of producing two trees that delete the same nodes.
	HuffmanTree(const HuffmanTree<W>& w);
	HuffmanTree<W>& operator=(const HuffmanTree<W>& w);

private:
	Node* _root;
};
FileCompress.hpp
#pragma once
#include"HuffmanTree.hpp"
typedef long long LongType;

// Per-byte record used as the Huffman tree weight: the byte value, how many
// times it occurs, and its Huffman code as a string of '0'/'1' characters.
// All comparisons and the sum look at _count only.
struct CharInfo
{
	char _ch;          // the byte value
	LongType _count;   // number of occurrences
	std::string _code; // Huffman code as text ('0' / '1')

	// True when the two occurrence counts differ.
	bool operator!=(const CharInfo& info) const
	{
		return _count != info._count;
	}

	// Returns a record whose count is the sum of both counts. The result
	// stands for an internal tree node, so _ch is set to '\0' (it used to be
	// left uninitialized) and _code is empty.
	CharInfo operator+(const CharInfo& info) const
	{
		CharInfo ret;
		ret._ch = '\0';
		ret._count = _count + info._count;
		return ret;
	}

	// True when this record occurs more often than `info`.
	bool operator>(const CharInfo& info) const
	{
		return _count > info._count;
	}
};
class FileCompress
{
typedef HuffmanTreeNode<CharInfo> Node;
public:
struct ConfigInfo
{
char _ch;
LongType _count;
};
FileCompress()
{
for (int i = 0; i < 256; ++i)
{
_hashInfos[i]._ch = i;
_hashInfos[i]._count = 0;
}
}
void Compress(const char* file)
{
//1.统计字符个数
ifstream ifs(file, ios_base::in | ios_base::binary);
char ch;
while (ifs.get(ch))
{
_hashInfos[(unsigned char)ch]._count++;
}
//2.生成Huffman树
CharInfo invalid;
invalid._count = 0;
HuffmanTree<CharInfo> tree(_hashInfos, 256, invalid);
//3.生成Huffman code
GenerateHuffmanCode(tree.GetRoot());
//4.压缩
string compressfile = file;
compressfile += ".compress";
ofstream ofs(compressfile.c_str(), ios_base::out | ios_base::binary);
//4.1压缩前写入字符次数,方便解压缩时构建Huffman树
for (int i = 0; i < 256; ++i)
{
if (_hashInfos[i]._count > 0)
{
ConfigInfo info;
info._ch = _hashInfos[i]._ch;
info._count = _hashInfos[i]._count;
ofs.write((char*)&info, sizeof(ConfigInfo));
}
}
ConfigInfo end;
end._count = 0;
ofs.write((char*)&end, sizeof(ConfigInfo));
//4.2进行压缩
char value = 0;
int pos = 0;
ifs.clear();
ifs.seekg(0);
while (ifs.get(ch))
{
string& code = _hashInfos[(unsigned char)ch]._code;
for (size_t i = 0; i < code.size(); ++i)
{
if (code[i] == '0')
{
value &= (~(1 << pos));
}
else if (code[i] == '1')
{
value |= (1 << pos);
}
else
{
assert(false);
}
++pos;
if (pos == 8)
{
ofs.put(value);
//printf("%x ", value);
pos = 0;
value = 0;
}
}
}
if (pos > 0)
{
//printf("%x ", value);
ofs.put(value);
}
}
void GenerateHuffmanCode(Node* root)
{
if (root == NULL)
return;
if (root->_left == NULL&&root->_right == NULL)
{
_hashInfos[(unsigned char)root->_w._ch]._code = root->_w._code;
}
if (root->_left != NULL)
{
root->_left->_w._code = root->_w._code + '0';
GenerateHuffmanCode(root->_left);
}
if (root->_right != NULL)
{
root->_right->_w._code = root->_w._code + '1';
GenerateHuffmanCode(root->_right);
}
}
void UnCompress(const char* file)
{
//1.打开压缩文件
ifstream ifs(file, ios_base::in | ios_base::binary);
string uncompressfile = file;
size_t pos = uncompressfile.rfind('.');
assert(pos != string::npos);
uncompressfile.erase(pos);//缺省值为npos
#ifdef _DEBUG
uncompressfile += ".uncompress";
#endif
ofstream ofs(uncompressfile.c_str(), ios_base::out | ios_base::binary);
//2.重建Huffman树
while (1)
{
ConfigInfo info;
ifs.read((char*)&info, sizeof(ConfigInfo));
if (info._count > 0)
{
_hashInfos[(unsigned char)info._ch]._count = info._count;
}
else
{
break;
}
}
CharInfo invalid;
invalid._count = 0;
HuffmanTree<CharInfo> tree(_hashInfos, 256, invalid);
//3.解压缩
Node* root = tree.GetRoot();
LongType filecount = root->_w._count;
Node* cur = root;
char ch;
while (ifs.get(ch))
{
for (size_t pos = 0; pos < 8; ++pos)
{
if (ch&(1 << pos))//1
{
cur = cur->_right;
}
else//0
{
cur = cur->_left;
}
if (cur->_left == NULL&&cur->_right == NULL)
{
ofs.put(cur->_w._ch);
cur = root;
if (--filecount == 0)
{
break;
}
}
}
}
}
private:
CharInfo _hashInfos[256];
};
// Manual test: compresses "input.txt" from the working directory with a
// fresh FileCompress object.
// Marked inline because the listing places this definition in a header
// (FileCompress.hpp); a non-inline definition would be a duplicate symbol as
// soon as the header is included from a second translation unit.
inline void TestCompress()
{
	FileCompress fc;
	fc.Compress("input.txt");
}
// Manual test: decompresses "input.txt.compress" from the working directory
// with a fresh FileCompress object.
// Marked inline because the listing places this definition in a header
// (FileCompress.hpp); a non-inline definition would be a duplicate symbol as
// soon as the header is included from a second translation unit.
inline void TestUncompress()
{
	FileCompress fc;
	fc.UnCompress("input.txt.compress");
}
test.cpp
#include"FileCompress.hpp"
// Entry point of the demo: runs the compression test and then the
// decompression test, in that order.
int main()
{
// Compresses "input.txt".
TestCompress();
// Decompresses "input.txt.compress".
TestUncompress();
return 0;
}
最后可以根据UltraCompare工具进行压缩前后文件的比较。