注:本演示代码在Huffman二叉树上采用自上而下(从根到叶)的方式得到各字符的Huffman编码
关于huffman树及相关算法这里就略过,这里探讨的是如何进行编码和解压缩。先说一下大致步骤
1.首先,读取文档(txt格式),将其存入string类型的变量pretext里
2.进行词频统计
3.创建Huffman树,并以此得到各字符的二进制编码
4.对pretext进行遍历,通过上面得到的编码表,将每个字符替换为对应的编码,拼接成二进制字符串code(这个二进制串可能十分长)
5.将二进制串进行八位一编码,得到压缩后的compresstext文本
至此,压缩完成
还原步骤如下:
1.将compresstext逐字符还原为二进制串newcode,并按首部记录的补位长度去掉末尾的补位
2.将二进制串newcode对应如上得到的huffman树,根据二进制串依次遍历该树,得到对应原始字符,这些字符连接起来,即得到原文本newtext(也可对二进制串进行打表,注:打表也需要逐个遍历,查看对应二进制字符是否在表内,经博主测试,大量数据下,根据huffman树解码快一些)
需要注意的问题:
1.将二进制八位一编码,可能最后剩余并非是整八位,因此需要对最后几位进行补位,使其成为整八位,并记录补位长度(补了几位,记录有效位也可),这里代码为了简便,将补位长度直接放在compresstext的首部
2.一些数字和二进制之间的转换可能需要自己写函数实现
3.创建Huffman树时,采用stl的优先队列注意排序写法
4.中文字符为多字节编码(GBK下占2个字节,UTF-8下通常占3个字节),程序按字节处理,把每个字节看做一个独立字符即可
#include <iostream>
#include <vector>
#include <queue>
#include <string>
#include <map>
#include <stack>
#include <unordered_map>
#include <sstream>
#include <algorithm>
#include <ctime>
#include <fstream>
using namespace std;
// Constant 256 (presumably the number of distinct byte values; it is not referenced
// by the visible code). The definition must not end with ';' -- a trailing semicolon
// would be pasted into every expression that uses the macro and break it.
#define MaxBitL 256
// Node of the Huffman tree. A node without children is a leaf and carries a byte
// value in `leafchar`; a node with children only links its two subtrees.
struct HuffmanTreeNode
{
    unsigned char leafchar;        // byte value stored in the node
    bool bit;                      // edge label of this node; starts at 0, the tree builder sets it to 1 on right children
    unsigned int weight;           // occurrence count; unsigned so it matches the constructor argument and cannot turn negative
    HuffmanTreeNode *left,*right;  // children (both null for a leaf); the node does not free them

    // c: byte value, w: weight, l/r: left and right child (may be null).
    HuffmanTreeNode(unsigned char c,unsigned int w,HuffmanTreeNode *l,HuffmanTreeNode *r)
        :leafchar(c),bit(0),weight(w),left(l),right(r){}
};
struct cmp
{
bool operator ()(HuffmanTreeNode* &a,HuffmanTreeNode* &b) const
{
return a->weight>b->weight;
}
};
unordered_map<unsigned char,unsigned int> freqTable;// byte value -> number of occurrences in the input text
unordered_map<unsigned char,string> dictTable;// byte value -> its code as a string of '0'/'1' characters
map<string,unsigned char> RedictTable;// code string -> byte value (reverse of dictTable)
unordered_map<string,char> codeToCharTable;// 8-character bit string -> packed byte; filled by CreatcodeToCharTable
unordered_map<char,string> charToCodeTable;// packed byte -> 8-character bit string; filled by CreatcodeToCharTable
priority_queue<HuffmanTreeNode*,vector<HuffmanTreeNode*>,cmp> pq;// priority queue of tree nodes ordered by cmp (lightest node on top)
// Counts how often every byte of `text` occurs and adds the counts to the global
// freqTable (existing entries are incremented, not reset). Prints a progress line.
void ComputeFreqTable(const string &text)
{
    for(string::size_type pos=0;pos<text.size();++pos)
    {
        // Index by unsigned byte value so bytes >= 0x80 are not treated as negative.
        const unsigned char byteValue=text[pos];
        ++freqTable[byteValue];
    }
    cout<<"ComputeFreqTable Finished"<<endl;
}
// Builds the Huffman tree from the global freqTable using the global queue pq.
// Returns the root, or NULL when freqTable is empty. The nodes are allocated with
// new and are not freed by this function. pq is empty again on return.
HuffmanTreeNode * CreatHuffmanTree()
{
    // One leaf per distinct byte value.
    for(auto &entry:freqTable)
        pq.push(new HuffmanTreeNode(entry.first,entry.second,NULL,NULL));

    // Repeatedly merge the two lightest nodes until a single node remains.
    while(pq.size()>1)
    {
        HuffmanTreeNode *lighter=pq.top();
        pq.pop();
        HuffmanTreeNode *heavier=pq.top();
        pq.pop();
        // Keep the lighter subtree on the left.
        if(lighter->weight>heavier->weight) swap(lighter,heavier);
        HuffmanTreeNode *parent=new HuffmanTreeNode('\0',lighter->weight+heavier->weight,lighter,heavier);
        parent->right->bit=1;// right edges are labelled 1, left edges keep the default 0
        pq.push(parent);
    }

    // Whatever is left (if anything) is the root.
    HuffmanTreeNode *root=NULL;
    if(!pq.empty())
    {
        root=pq.top();
        pq.pop();
    }
    cout<<"CreatHuffmanTree Finished"<<endl;
    return root;
}
// Depth-first walk that records the code of every leaf below `r`.
// `bin` holds the edge labels of the path walked so far; each visited node appends
// its own `bit` (the root included, so every recorded code starts with the root's
// bit) and removes it again before returning, leaving `bin` unchanged for the caller.
void dictHelp(HuffmanTreeNode *r,string &bin)
{
    if(!r) return;

    bin.push_back(r->bit+'0');
    const bool isLeaf=(r->left==NULL&&r->right==NULL);
    if(isLeaf)
    {
        // Store both directions: byte -> code and code -> byte.
        dictTable[r->leafchar]=bin;
        RedictTable[bin]=r->leafchar;
    }
    dictHelp(r->left,bin);
    dictHelp(r->right,bin);
    bin.pop_back();
}
// Fills dictTable and RedictTable for the tree rooted at `r` by starting the
// recursive walk with an empty path, then prints a progress line.
void ComputeDictTable(HuffmanTreeNode *r)
{
    string pathSoFar="";
    dictHelp(r,pathSoFar);
    cout<<"ComputeDictTable Finished"<<endl;
}
// Translates `Text` into one long string of '0'/'1' characters by concatenating
// the dictTable code of every byte, in order. Prints a progress line.
string enCode(const string &Text)
{
    string bits;
    for(string::const_iterator it=Text.begin();it!=Text.end();++it)
    {
        // operator[] is used on purpose: a byte without an entry contributes an empty code.
        bits.append(dictTable[*it]);
    }
    cout<<"encode Finished"<<endl;
    return bits;
}
// Converts the non-negative integer `n` to a string of '0'/'1' characters with the
// least significant bit FIRST, padded on the right with '0' up to `bitnum` characters.
//   ConvertDecToBinStr(6,8) == "01100000"
// If `n` needs more than `bitnum` digits (or `bitnum` is not positive) the digits are
// returned unpadded and untruncated; previously the padding length underflowed and
// the string constructor threw. Negative `n` is not supported.
string ConvertDecToBinStr(int n,int bitnum)
{
    string bin;
    while(n)
    {
        bin.push_back(static_cast<char>('0'+n%2));
        n/=2;
    }
    const string::size_type width=bitnum>0?static_cast<string::size_type>(bitnum):0;
    // Pad only when there is room; never compute a negative length.
    if(bin.size()<width) bin.append(width-bin.size(),'0');
    return bin;
}
// Fills the two byte <-> 8-character bit-string tables for all 256 byte values,
// using ConvertDecToBinStr for the textual form, then prints a progress line.
void CreatcodeToCharTable()
{
    int value=0;
    while(value<256)
    {
        const string bits=ConvertDecToBinStr(value,8);
        const char packed=value+'\0';// the byte whose numeric value is `value`
        charToCodeTable[packed]=bits;
        codeToCharTable[bits]=packed;
        ++value;
    }
    cout<<"CreatcodeToCharTable Finished"<<endl;
}
// Packs a string of '0'/'1' characters into bytes, eight characters per byte.
//
// Layout of the result:
//   [0]    header character '0'..'7': how many padding zeros were appended so the
//          last group is a full eight bits
//   [1..]  one byte per group of eight characters
//
// Bit order inside a byte: the k-th character of a group becomes bit k of the
// byte (least significant bit first). This is the same mapping that the table
// built by CreatcodeToCharTable/ConvertDecToBinStr describes. Any character
// other than '1' is treated as a 0 bit.
//
// The input is walked once by index. The earlier version re-copied the whole
// remaining string after every byte, which made the running time quadratic.
string compressCode(const string &code)
{
    const string::size_type total=code.size();
    const int len=static_cast<int>((8-total%8)%8);// number of padding bits, 0 when total is a multiple of 8

    string compressText;
    compressText.reserve(1+(total+7)/8);
    compressText.push_back(static_cast<char>('0'+len));

    cout<<"try! : ";
    for(string::size_type pos=0;pos<total;pos+=8)
    {
        unsigned int packed=0;
        // Bits past the end of the input stay 0, which is exactly the padding.
        for(string::size_type k=0;k<8&&pos+k<total;++k)
        {
            if(code[pos+k]=='1') packed|=1u<<k;
        }
        compressText.push_back(static_cast<char>(static_cast<unsigned char>(packed)));
    }
    cout<<"compressCode Finished"<<endl;
    return compressText;
}
// Inverse of compressCode: expands the packed bytes back into a string of
// '0'/'1' characters and removes the padding announced by the header.
//
// compressText[0] is the header character ('0' + number of padding bits); every
// following byte yields eight characters, bit 0 first (the same mapping that
// the table built by CreatcodeToCharTable/ConvertDecToBinStr describes).
//
// An empty input yields an empty result (it used to throw std::out_of_range).
// A header whose pad count is negative or larger than the number of decoded
// bits is ignored and all decoded bits are returned.
string decompressCode(const string &compressText)
{
    string newcode;
    if(compressText.empty())
    {
        cout<<"decompressCode Finished"<<endl;
        return newcode;
    }

    const int len=compressText[0]-'0';
    newcode.reserve((compressText.size()-1)*8);
    for(string::size_type pos=1;pos<compressText.size();++pos)
    {
        const unsigned char packed=static_cast<unsigned char>(compressText[pos]);
        for(int k=0;k<8;++k)
        {
            newcode.push_back(((packed>>k)&1u)?'1':'0');
        }
    }
    cout<<"decompressCode Finished"<<endl;

    // Strip the padding only when the header is plausible.
    if(len>0&&static_cast<string::size_type>(len)<=newcode.size())
        newcode.resize(newcode.size()-len);
    return newcode;
}
// Table-driven decoder: reads `code` one character at a time, growing a pending
// prefix until it matches a key of RedictTable, then emits the mapped byte and
// starts a new prefix. A trailing prefix that never matches is dropped.
string ConvertCodeToText(const string &code)
{
    string newtext,pending;
    for(string::size_type pos=0;pos<code.size();++pos)
    {
        pending.push_back(code[pos]);
        const map<string,unsigned char>::const_iterator hit=RedictTable.find(pending);
        if(hit==RedictTable.end()) continue;// not a complete code yet
        newtext+=hit->second;
        pending.clear();
    }
    cout<<"ConvertCodeToText Finished"<<endl;
    return newtext;
}
// Prints `Text` to standard output framed by a dashed line above and below.
void disStr(const string &Text)
{
    const char *const rule="------------------------";
    cout<<rule<<endl<<Text<<endl<<rule<<endl;
}
// Appends `size` pseudo-random characters with codes 33..125 to `Text` and echoes
// each one to standard output. The generator is re-seeded from the current time
// on every call.
void WriteTextByRand(string &Text,unsigned int size)
{
    const int lowest=33;
    const int span=126-33;// rand()%span yields 0..92, so the highest code is 125
    srand((unsigned)time(NULL));
    for(unsigned int produced=0;produced!=size;++produced)
    {
        const char ch=(char)(rand()%span+lowest);
        cout<<ch;
        Text.push_back(ch);
    }
}
// Replaces `Text` with the complete contents of the file "1.txt" (opened in the
// stream's default mode). When the file cannot be opened a message is printed and
// `Text` is left unchanged. The "Finished" line is printed in both cases.
void WriteTextByFile(string &Text)
{
    ifstream in("1.txt");// file name
    if(!in.is_open())
    {
        cout<<"can not find this file"<<endl;
    }
    else
    {
        // Slurp the whole stream through a string buffer.
        std::stringstream buffer;
        buffer<<in.rdbuf();
        Text=buffer.str();
    }
    in.close();
    cout<<"WriteTextByFile Finished"<<endl;
}
// Prints the processor time elapsed since `s` (a value previously obtained from
// clock()) in whole milliseconds. clock() counts ticks of 1/CLOCKS_PER_SEC
// seconds, so the difference is converted before it is labelled "ms"; printing
// the raw tick count is only correct where CLOCKS_PER_SEC happens to be 1000.
void prtime(clock_t s)
{
    const clock_t ticks=clock()-s;
    const long long elapsedMs=static_cast<long long>(ticks)*1000/static_cast<long long>(CLOCKS_PER_SEC);
    cout<<"usetime : "<<elapsedMs<<" ms"<<endl;
}
char HelpPlus(const string &str,unsigned int &i,HuffmanTreeNode *r)
{
unsigned int len=str.size();
while(i<len&&r)
{
if(!r->left&&!r->right) {return r->leafchar;}
int bit=str[++i]-'0';
if(bit==1) r=r->right;
else r=r->left;
}
return '\0';
}
// Tree-driven decoder: repeatedly asks HelpPlus for the next symbol of
// `decompressText` until the whole bit string has been consumed, and returns the
// decoded bytes. HelpPlus moves the cursor to the last character of each symbol;
// the loop increment then steps to the first character of the next one.
string DeCodeToText(string &decompressText,HuffmanTreeNode *r)
{
    string NewText="";
    for(unsigned int cursor=0;cursor<decompressText.size();++cursor)
    {
        NewText.push_back(HelpPlus(decompressText,cursor,r));
    }
    return NewText;
}
// Demo driver: loads the input text, builds the Huffman code, encodes and packs
// the text, unpacks and decodes it again, and reports whether the round trip
// reproduced both the bit string and the original text. After each stage prtime
// prints the time elapsed since the start of the program (cumulative, not per stage).
int main()
{
clock_t start_time=clock();
string PreText="";// original text
//WriteTextByRand(PreText,10000);
WriteTextByFile(PreText);
prtime(start_time);
//disStr(PreText);
ComputeFreqTable(PreText);prtime(start_time);
HuffmanTreeNode *huff=CreatHuffmanTree();prtime(start_time);
ComputeDictTable(huff);prtime(start_time);
CreatcodeToCharTable();prtime(start_time);
// Text -> '0'/'1' string -> packed bytes.
string code=enCode(PreText);prtime(start_time);
//cout<<code<<endl;
string compressText=compressCode(code);prtime(start_time);
//disStr(compressText);
// Packed bytes -> '0'/'1' string -> text.
string decode=decompressCode(compressText);prtime(start_time);
//cout<<decode<<endl;
//string newtext=ConvertCodeToText(decode);// alternative: decode via table lookup
string newtext=DeCodeToText(decode,huff);// decode by walking the Huffman tree
prtime(start_time);
//disStr(newtext);
// Round-trip checks on the bit string and on the text.
if(code==decode) cout<<"Same Code"<<endl;
else cout<<"Different Code"<<endl;
if(PreText==newtext) cout<<"Same Text"<<endl;
else cout<<"Different Text"<<endl;
cout<<"PreText Length : "<<PreText.size()<<" ComText Length : "<<compressText.size()<<endl;
// Compressed size as a percentage of the original size (smaller is better).
cout<<"Rate : "<<100.0*compressText.size()/PreText.size()<<"%"<<endl;
// Both files are opened in the stream's default mode and get a trailing newline from endl.
// NOTE(review): compressText can contain arbitrary byte values, so on platforms whose
// default mode translates line endings the file may not be byte-exact -- confirm before
// relying on compressText.txt as the compressed artifact.
ofstream o("compressText.txt");
o<<compressText<<endl;
o.close();
ofstream out("newtext.txt");
out<<newtext<<endl;
out.close();
clock_t end_time=clock();
// Total run time in seconds.
cout<<"rt : "<<(end_time-start_time)*1.0/CLOCKS_PER_SEC<<" s"<<endl;
return 0;
}
运行结果:
1.对随机字符:随机生成10000个字符,进行huffman编码,压缩率仅仅为95%左右
ComputeFreqTable Finished
usetime : 488 ms
CreatHuffmanTree Finished
usetime : 493 ms
ComputeDictTable Finished
usetime : 497 ms
CreatcodeToCharTable Finished
usetime : 504 ms
encode Finished
usetime : 508 ms
try! : compressCode Finished
usetime : 573 ms
decompressCode Finished
usetime : 577 ms
usetime : 581 ms
Same Code
Same Text
PreText Length : 10000 ComText Length : 9481
Rate : 94.81%
rt : 0.591 s
2.对英语文章:文章选自china daily,题材来源不同,共计12199字,多次测试,压缩率稳定在70%左右
WriteTextByFile Finished
usetime : 1 ms
ComputeFreqTable Finished
usetime : 3 ms
CreatHuffmanTree Finished
usetime : 3 ms
ComputeDictTable Finished
usetime : 3 ms
CreatcodeToCharTable Finished
usetime : 4 ms
encode Finished
usetime : 7 ms
try! : compressCode Finished
usetime : 61 ms
decompressCode Finished
usetime : 64 ms
usetime : 66 ms
Same Code
Same Text
PreText Length : 12199 ComText Length : 8489
Rate : 69.5877%
rt : 0.068 s
3.中文字符:文章选自《福尔摩斯探案集》东方探案部分,文字总计437700字,压缩率89%左右,花费时间较长,为342.452 s
WriteTextByFile Finished
usetime : 4 ms
ComputeFreqTable Finished
usetime : 59 ms
CreatHuffmanTree Finished
usetime : 60 ms
ComputeDictTable Finished
usetime : 60 ms
CreatcodeToCharTable Finished
usetime : 61 ms
encode Finished
usetime : 168 ms
try! : compressCode Finished
usetime : 342272 ms
decompressCode Finished
usetime : 342356 ms
usetime : 342420 ms
Same Code
Same Text
PreText Length : 437700 ComText Length : 389238
Rate : 88.928%
rt : 342.452 s
4.时间对比:将英文累积到432673,花费时间为193.675s,远远小于编码中文时间,压缩率稳定在70%左右
WriteTextByFile Finished
usetime : 7 ms
ComputeFreqTable Finished
usetime : 80 ms
CreatHuffmanTree Finished
usetime : 81 ms
ComputeDictTable Finished
usetime : 81 ms
CreatcodeToCharTable Finished
usetime : 82 ms
encode Finished
usetime : 183 ms
try! : compressCode Finished
usetime : 193529 ms
decompressCode Finished
usetime : 193597 ms
usetime : 193652 ms
Same Code
Same Text
PreText Length : 432673 ComText Length : 300885
Rate : 69.541%
rt : 193.675 s
总结:Huffman编码对英文文本的压缩效果较好;本程序的时间主要花费在把二进制字符串每八位打包成一个字符这一步上。在实际实现中,编码位是直接按位写入字节的,不需要先生成由'0'/'1'组成的字符串再做转换,这里仅仅是为了模拟这一过程。因此,Huffman编码在通信中还是有较大的应用价值