HuffmanTree哈夫曼树:
又称最优二叉树,是带权路径长度(WPL)最短的二叉树。
如何建树,思想如下:每次从所有节点中取出权值最小的两个节点,以它们作为左右孩子创建一个新的父节点(父节点的权值为两个孩子的权值之和),再把父节点放回节点集合;重复这一过程,直到集合中只剩下一个节点,它就是哈夫曼树的根。
以此类推最终将其构建成以下的样子:
代码如下:
#include"heap_review.h"//自己实现的堆模板类,见上篇堆文章
#include<queue>
// Node of a Huffman (optimal binary) tree.
//   _data   - weight / payload stored in the node
//   _left   - left child  (NULL for a leaf)
//   _right  - right child (NULL for a leaf)
//   _parent - parent node (NULL for the root); lets a caller walk from a
//             leaf back up to the root, e.g. to build a code bottom-up
template <class T>
struct HuffmanTreeNode
{
    T _data;
    HuffmanTreeNode<T>* _left;
    HuffmanTreeNode<T>* _right;
    HuffmanTreeNode<T>* _parent;

    HuffmanTreeNode(const T& data)
        :_data(data),_left(NULL),_right(NULL),_parent(NULL)
    {}
};

// Huffman tree built from an array of weights.
//
// T must support operator<, operator+ and operator!=.
// The tree owns every node it allocates and frees them in its destructor,
// so copying is disabled.
template <class T>
class HuffmanTree
{
    typedef HuffmanTreeNode<T> Node;
protected:
    Node* _root;    // NULL when the tree is empty
public:
    // Creates an empty tree.
    HuffmanTree()
        :_root(NULL)
    {}

    // Orders node pointers by the weight they carry.
    struct NodeCompare
    {
        bool operator()(const Node* left, const Node* right) const
        {
            return left->_data < right->_data;
        }
    };

    // Builds the tree from arr[0..n).
    //   arr     - candidate weights
    //   n       - number of elements in arr
    //   invalid - elements equal to this value are skipped
    // If no element survives the filter the tree stays empty (_root == NULL).
    HuffmanTree(T* arr,int n,T invalid)
        :_root(NULL)
    {
        // Heap is the author's own heap template (heap_review.h); with
        // NodeCompare it is expected to expose the smallest weight at Top().
        Heap<Node*,NodeCompare> minHeap;
        for(int i = 0; i < n; i++)
        {
            if(arr[i] != invalid)
            {
                minHeap.Push(new Node(arr[i]));
            }
        }

        // Repeatedly merge the two lightest subtrees under a new parent
        // whose weight is the sum of its children.
        while(minHeap.Size() > 1)
        {
            Node* left = minHeap.Top();
            minHeap.Pop();
            Node* right = minHeap.Top();
            minHeap.Pop();

            Node* parent = new Node(left->_data + right->_data);
            parent->_left = left;
            parent->_right = right;
            left->_parent = parent;
            right->_parent = parent;
            minHeap.Push(parent);
        }

        // Top() must not be called on an empty heap.
        if(minHeap.Size() > 0)
        {
            _root = minHeap.Top();
        }
    }

    // Releases every node of the tree.
    ~HuffmanTree()
    {
        _destroy(_root);
        _root = NULL;
    }

    // Returns the root node, or NULL for an empty tree.
    // The tree keeps ownership of the returned node.
    Node* GetRoot() const
    {
        return _root;
    }

    // Prints the weights of the tree in level order to standard output.
    void showHuffman()
    {
        std::cout<<"层序遍历哈弗曼树:"<<std::endl;
        _showHuffman(_root);
        std::cout<<std::endl;
    }

    // Level-order traversal starting at root; prints each weight followed
    // by a space. Does nothing for a NULL root.
    void _showHuffman(Node* root)
    {
        if(NULL == root)
            return;
        std::queue<Node*> q;
        q.push(root);
        while(!q.empty())
        {
            Node* cur = q.front();
            q.pop();
            std::cout<<cur->_data<<" ";
            if(cur->_left != NULL)
            {
                q.push(cur->_left);
            }
            if(cur->_right != NULL)
            {
                q.push(cur->_right);
            }
        }
    }

private:
    // Deletes the subtree rooted at root. Iterative, so the depth of the
    // tree does not translate into call-stack depth.
    static void _destroy(Node* root)
    {
        if(NULL == root)
            return;
        std::queue<Node*> pending;
        pending.push(root);
        while(!pending.empty())
        {
            Node* cur = pending.front();
            pending.pop();
            if(cur->_left != NULL)
            {
                pending.push(cur->_left);
            }
            if(cur->_right != NULL)
            {
                pending.push(cur->_right);
            }
            delete cur;
        }
    }

    // Not copyable: the tree owns its nodes (declared, never defined).
    HuffmanTree(const HuffmanTree&);
    HuffmanTree& operator=(const HuffmanTree&);
};
void TestHuffmanTree()
{
int arr[] = {0,1,2,3,'*','*',4,5,6,7,8,9,'*'};
size_t len = sizeof(arr)/sizeof(arr[0]);
HuffmanTree<int> hf(arr,len,'*');
hf.showHuffman();
}
然后将_root指针指向根节点45即可,这棵哈夫曼树就已经建好了,实现结果如下:
**Huffman树项目:文件压缩**
**文件压缩步骤:**
1. 统计字符出现的次数;
2. 构建HuffmanTree哈夫曼树;
3. 生成Huffman编码;(自由定义,这里我们定义为左0,右1)
4. 压缩文件;
#include"HuffmanTree_review.h"
typedef long LongType;

// Per-byte record used by the Huffman compressor.
// Ordering, inequality and addition look at _count only, so a record can be
// used directly as the weight type of the Huffman tree.
struct charInfo
{
    // Byte value. Unsigned so that bytes >= 0x80 stay in 0..255 and can be
    // used to index a 256-entry table (a plain char may be negative).
    unsigned char _ch;
    LongType _count;       // number of occurrences of the byte
    std::string _code;     // Huffman code of the byte as a string of '0'/'1'

    // Starts with byte 0, zero occurrences and an empty code so that no
    // member is ever read uninitialized.
    charInfo()
        :_ch(0),_count(0)
    {}

    // Weight of a merged node: the counts are added; _ch and _code of the
    // result carry no meaning and keep their default values.
    charInfo operator+(const charInfo& t)const
    {
        charInfo ret;
        ret._count = _count + t._count;
        return ret;
    }

    // Orders records by occurrence count.
    bool operator<(const charInfo& t)const
    {
        return this->_count < t._count;
    }

    // Two records differ when their occurrence counts differ.
    bool operator!=(const charInfo& t)const
    {
        return this->_count != t._count;
    }
};
//压缩文件类
class FileCompress
{
typedef HuffmanTreeNode<charInfo> Node;
protected:
charInfo _infor[256];
public:
FileCompress()
{
//初始化数组
for(size_t i = 0; i < 256; ++i)
{
_infor[i]._ch = i;
_infor[i]._count = 0;
}
}
//统计字符出现的次数
//创建Huffman树
void CountCharCreateTree(const char* filename)
{
//统计所有字符出现的次数
assert(filename);
FILE* fd = fopen(filename,"r");
if(NULL == fd)
{
perror("open file is error");
return;
}
int ch = fgetc(fd);//**获取第一个字符,如果把ch设置成char那将无法压缩汉字,unsigned char也不行,这样会陷入死循环**
while(ch != EOF)
{
_infor[ch]._count++;
ch = fgetc(fd);
}
//构建Huffman树
charInfo invalid;
invalid._count = 0;//出现次数<=0的直接丢弃
HuffmanTree<charInfo> heap(_infor,256,invalid);
//生成Huffman编码
string code;
CreateHuffmanCodeR(heap.GetRoot(),code);
//压缩
//生成压缩文件,将压缩信息存储在压缩文件中
string compressFile = filename;
compressFile += "huffman.txt";
//将压缩文件名转化成const char* 类型
FILE* fw = fopen(compressFile.c_str(),"w");
if(fw == NULL)
{
perror("fopen compressFile is error");
return;
}
//将原文件指针移动到文件开头的位置
fseek(fd, 0, SEEK_SET);
ch = fgetc(fd);
unsigned char value = 0;
int count = 0;
/*fputc('a',fw);*/
while(ch != EOF)
{
code = _infor[ch]._code;//获取读取的字符的编码
for(size_t i = 0; i < code.size(); i++)
{
value <<= 1;
value |= (code[i]-'0');
if(++count == 8)//一个字节8个比特位
{
fputc(value,fw);
value = 0;
count = 0;
}
}
ch = fgetc(fd);
}
if(count < 8)
{
value <<= 8 - count;
fputc(value,fw);
}
fclose(fw);
fclose(fd);
//解压缩
Uncompress(compressFile.c_str());
}
//生成Huffman编码
//两种方法:递归法一(从下往上,最后逆置)和递归法二(从上往下直接生成)
//递归法一
void CreateHuffmanCode(Node* root)
{
if(NULL == root)
return;
if(root->_left == NULL && root->_right == NULL)//叶子节点
{
Node* cur = root;
Node* parent = cur->_parent;
string& code = _infor[cur->_data._ch]._code;
while(parent)
{
if(parent->_left == cur)
{
code.push_back('0');
}
else
{
code.push_back('1');
}
cur = parent;
parent = cur->_parent;
}
//逆置
reverse(code.begin(),code.end());
_infor[root->_data._ch]._code = code;
return;
}
CreateHuffmanCode(root->_left);
CreateHuffmanCode(root->_right);
}
//递归法二
void CreateHuffmanCodeR(Node* root,string& code)//递归法
{
if(NULL == root)
return;
if(root->_left == NULL && root->_right == NULL)
{
_infor[root->_data._ch]._code = code;
return;
}
CreateHuffmanCodeR(root->_left,code+'0');
CreateHuffmanCodeR(root->_right,code+'1');
}
//解压缩
void Uncompress(const char* filename)//传入的文件是压缩文件
{
assert(filename);
string uncompressFile = filename;//生成解压缩文件
size_t point = uncompressFile.rfind('.');
assert(point != string::npos);
uncompressFile = uncompressFile.substr(0,point);
uncompressFile += "uncompressFile.txt";
FILE* fw = fopen(uncompressFile.c_str(),"w");
if(NULL == fw)
{
perror("open uncompressFile is error");
return;
}
//重构Huffman树
charInfo invalid;
invalid._count = 0;
HuffmanTree<charInfo> hufftree(_infor,256,invalid);
Node* root = hufftree.GetRoot();
LongType count = root->_data._count;//求出所有字符出现的次数之和
FILE* fr = fopen(filename,"r");//以只读的方式打开压缩文件
if(NULL == fr)
{
perror("fopen uncompressFile is error");
return;
}
unsigned char ch = fgetc(fr);
Node* cur = root;
while(ch != EOF)
{
for(int pos = 7; pos >= 0; --pos)
{
if((1<<pos) & ch)//说明是1,右边
{
cur = cur->_right;
}
else//说明是0,左边
{
cur = cur->_left;
}
if(cur->_left == NULL && cur->_right == NULL)//到叶子节点
{
fputc(cur->_data._ch,fw);
if(--count == 0)//字符已经读完了
{
break;
}
cur = root;
}
}
if(count == 0)
break;
ch = fgetc(fr);
}
fclose(fr);
fclose(fw);
}
};
不过这样写有一个很尴尬的问题:压缩和解压缩必须在同一次运行中一起执行,这显然不实用。因此我们需要把字符及其出现次数这些配置信息一并写入压缩文件;解压时先从压缩文件中读出这些信息,重建Huffman树,然后再解压缩,这样压缩和解压缩就可以分开进行了。
见如下改进版本:
#include<assert.h>
#include<string>
#include"HuffmanTree_review.h"
typedef long LongType;

// Per-byte record used by the Huffman compressor.
// Ordering, inequality and addition look at _count only, so a record can be
// used directly as the weight type of the Huffman tree.
struct charInfo
{
    unsigned char _ch;     // byte value (0..255), usable as a table index
    LongType _count;       // number of occurrences of the byte
    std::string _code;     // Huffman code of the byte as a string of '0'/'1'

    // Starts with byte 0, zero occurrences and an empty code so that no
    // member is ever read uninitialized.
    charInfo()
        :_ch(0),_count(0)
    {}

    // Weight of a merged node: the counts are added; _ch and _code of the
    // result carry no meaning and keep their default values.
    charInfo operator+(const charInfo& t)const
    {
        charInfo ret;
        ret._count = _count + t._count;
        return ret;
    }

    // Orders records by occurrence count.
    bool operator<(const charInfo& t)const
    {
        return this->_count < t._count;
    }

    // Two records differ when their occurrence counts differ.
    bool operator!=(const charInfo& t)const
    {
        return this->_count != t._count;
    }
};
//压缩文件类
class FileCompress
{
typedef HuffmanTreeNode<charInfo> Node;
protected:
//字符读写
charInfo _infor[256];
//二进制读写
struct _NodeInfo
{
unsigned char _ch;
LongType _count;
};
public:
FileCompress()
{
//初始化数组
for(size_t i = 0; i < 256; ++i)
{
_infor[i]._ch = i;
_infor[i]._count = 0;
}
}
//统计字符出现的次数
//创建Huffman树
void Compress(const char* filename)
{
//统计所有字符出现的次数
assert(filename);
FILE* fd = fopen(filename,"rb");
if(NULL == fd)
{
perror("open file is error");
return;
}
int ch = fgetc(fd);//获取第一个字符
while(ch != EOF)
{
_infor[ch]._count++;
ch = fgetc(fd);
}
//构建Huffman树
charInfo invalid;
invalid._count = 0;//出现次数<=0的直接丢弃
HuffmanTree<charInfo> heap(_infor,256,invalid);
//生成Huffman编码
/*string code;
CreateHuffmanCodeR(heap.GetRoot(),code);*/
CreateHuffmanCode(heap.GetRoot());
//压缩
//生成压缩文件,将压缩信息存储在压缩文件中
string compressFile = filename;
compressFile += "huffman.txt";
//将压缩文件名转化成const char* 类型
FILE* fwb = fopen(compressFile.c_str(),"wb");//以二进制的读写方式打开
if(fwb == NULL)
{
perror("fopen compressFile is error");
return;
}
//配置文本信息,将字符信息重新打包封装好放入压缩文件中
for(size_t i = 0; i < 256; i++)
{
if(_infor[i]._count != 0)
{
_NodeInfo info;
info._ch = _infor[i]._ch;
info._count = _infor[i]._count;
size_t size = fwrite(&info,sizeof(_NodeInfo),1,fwb);
assert(size != sizeof(_NodeInfo));
}
}
//这个类似于设置一个结尾信息,以便于后期提取信息
_NodeInfo info;
info._count = 0;
size_t size = fwrite(&info,sizeof(_NodeInfo),1,fwb);
assert(size != sizeof(_NodeInfo));
////将压缩文件名转化成const char* 类型
//FILE* fw = fopen(compressFile.c_str(),"w");
//if(fw == NULL)
//{
// perror("fopen compressFile is error");
// return;
//}
//将原文件指针移动到文件开头的位置
fseek(fd, 0, SEEK_SET);
ch = fgetc(fd);
unsigned char value = 0;
int count = 0;
while(ch != EOF)
{
string& code = _infor[ch]._code;//获取读取的字符的编码
for(size_t i = 0; i < code.size(); i++)
{
value <<= 1;
value |= (code[i]-'0');
if(++count == 8)//一个字节8个比特位
{
fputc(value,fwb);
value = 0;
count = 0;
}
}
ch = fgetc(fd);
}
if(count < 8)
{
value <<= 8 - count;
fputc(value,fwb);
}
//解压缩
/*fseek(fw,0,SEEK_SET);*/
fclose(fwb);
fclose(fd);
/*Uncompress(compressFile.c_str());*/
}
//生成Huffman编码
//两种方法:递归法一(从下往上,最后逆置)和递归法二(从上往下直接生成)
//递归法一
void CreateHuffmanCode(Node* root)//非递归法
{
if(NULL == root)
return;
if(root->_left == NULL && root->_right == NULL)//叶子节点
{
Node* cur = root;
Node* parent = cur->_parent;
string& code = _infor[cur->_data._ch]._code;
while(parent)
{
if(parent->_left == cur)
{
code.push_back('0');
}
else
{
code.push_back('1');
}
cur = parent;
parent = cur->_parent;
}
//逆置
reverse(code.begin(),code.end());
_infor[root->_data._ch]._code = code;
return;
}
CreateHuffmanCode(root->_left);
CreateHuffmanCode(root->_right);
}
//递归法二
void CreateHuffmanCodeR(Node* root,string& code)//递归法
{
if(NULL == root)
return;
if(root->_left == NULL && root->_right == NULL)
{
_infor[root->_data._ch]._code = code;
return;
}
CreateHuffmanCodeR(root->_left,code+'0');
CreateHuffmanCodeR(root->_right,code+'1');
}
//解压缩
void Uncompress(const char* filename)//传入的文件是压缩文件
{
assert(filename);
string uncompressFile = filename;//生成解压缩文件
size_t point = uncompressFile.rfind('.');
assert(point != string::npos);
uncompressFile = uncompressFile.substr(0,point);
uncompressFile += "uncompressFile.txt";
FILE* fw = fopen(uncompressFile.c_str(),"w");
if(NULL == fw)
{
perror("open uncompressFile is error");
return;
}
////以文本读写的方式打开压缩文件
//FILE* fr = fopen(filename,"r");//以只读的方式打开压缩文件
//if(NULL == fr)
//{
// perror("fopen uncompressFile is error");
// return;
//}
//以二进制读写的方式打开压缩文件
FILE* frb = fopen(filename,"rb");//以只读的方式打开压缩文件
if(NULL == frb)
{
perror("fopen uncompressFile is error");
return;
}
//恢复数组的配置信息
while(1)
{
_NodeInfo info;
size_t size = fread(&info,sizeof(_NodeInfo),1,frb);
assert(size != sizeof(_NodeInfo));
if(info._count > 0)
{
_infor[info._ch]._ch = info._ch;
_infor[info._ch]._count = info._count;
}
else
break;//退出
}
//重构Huffman树
charInfo invalid;
invalid._count = 0;
HuffmanTree<charInfo> hufftree(_infor,256,invalid);
Node* root = hufftree.GetRoot();
LongType count = root->_data._count;//求出所有字符出现的次数之和
int ch = fgetc(frb);
Node* cur = root;
while(ch != EOF)
{
for(int pos = 7; pos >= 0; --pos)
{
if((1<<pos) & ch)//说明是1,右边
{
cur = cur->_right;
}
else//说明是0,左边
{
cur = cur->_left;
}
if(cur->_left == NULL && cur->_right == NULL)//到叶子节点
{
fputc(cur->_data._ch,fw);
if(--count == 0)//字符已经读完了
{
break;
}
cur = root;
}
}
if(count == 0)
break;
ch = fgetc(frb);
}
/*fclose(fr);*/
fclose(frb);
fclose(fw);
}
};
void TestFileCompress()
{
/*const char* filename = "filecompresstest.txt";*/
FileCompress compress;
/*compress.Compress("filecompresstest.txt");*/
compress.Uncompress("filecompresstest.txthuffman.txt");
}
项目补充:
问:文件压缩一定会压小吗?
答:不一定。当字符的编码长度小于8位时,压缩可以节省空间;当编码长度大于8位时,压缩后反而会变大。当文件中各字符出现的频率都差不多时,基本起不到压缩效果;而字符出现次数相差较大时,才会有明显的压缩效果。
问:一个文件是否可以多次压缩,越压越小?
答:不可以。一般第一次压缩效果最好,对已经压缩过的文件再次压缩,通常不会继续变小。
问:Huffman压缩的意义是什么?
答:实际应用其实是zip压缩。