Huffman编码压缩和解压文档,C++实现

注:本演示代码采用自上而下得到huffman编码(二叉树)
关于huffman树及相关算法这里就略过,这里探讨的是如何进行编码和解压缩。先说一下大致步骤
1.首先,读取文档(txt格式),将其存入string类型的变量pretext里
2.进行词频统计
3.创建Huffman树,并以此得到各字符的二进制编码
4.对pretext进行遍历,通过上面得到编码表,将其转化为二进制字符串code,这个二进制串可能十分长
5.将二进制串进行八位一编码,得到压缩后的compresstext文本
至此,压缩完成
还原步骤如下:
1.将压缩文本compresstext还原为二进制串newcode
2.将二进制串newcode对应如上得到的huffman树,根据二进制串依次遍历该树,得到对应原始字符,这些字符连接起来,即得到原文本newtext(也可对二进制串进行打表,注:打表也需要逐个遍历,查看对应二进制字符是否在表内,经博主测试,大量数据下,根据huffman树解码快一些)

需要注意的问题:
1.将二进制八位一编码,可能最后剩余并非是整八位,因此需要对最后几位进行补位,使其成为整八位,并记录补位长度(补了几位,记录有效位也可),这里代码为了简便,将补位长度直接放在compresstext的首部
2.一些数字和二进制之间的转换可能需要自己写函数实现
3.创建Huffman树时,采用stl的优先队列注意排序写法
4.中文字符占多个字节(例如GBK编码下为16位,即两个字节),处理时按字节逐个看做字符即可

#include <iostream>
#include <vector>
#include <queue>
#include <string>
#include <map>
#include <stack>
#include <unordered_map>
#include <sstream>
#include <algorithm>
#include <ctime>
#include <fstream>
using namespace std;
// NOTE(review): presumably an upper bound on a code length in bits; it is not
// referenced by any code visible in this file -- confirm before relying on it.
// The macro body must not end in ';', otherwise it cannot be used inside an
// expression or as an array bound.
#define MaxBitL 256
// A node of the Huffman tree. Leaf nodes carry a byte value in `leafchar`;
// merged (internal) nodes only combine the weights of their two children.
struct HuffmanTreeNode
{
    unsigned char leafchar;      // byte value represented by this node
    bool bit = false;            // label of the edge from the parent; starts as 0
    int weight;                  // occurrence count (sum of children for merged nodes)
    HuffmanTreeNode *left;       // child reached with bit 0 (null for leaves)
    HuffmanTreeNode *right;      // child reached with bit 1 (null for leaves)

    // Builds a node for byte `c` with weight `w` and the given children.
    HuffmanTreeNode(unsigned char c, unsigned int w, HuffmanTreeNode *l, HuffmanTreeNode *r)
        : leafchar(c), weight(w), left(l), right(r)
    {
    }
};
struct cmp
{
    bool operator ()(HuffmanTreeNode* &a,HuffmanTreeNode* &b) const
    {
        return a->weight>b->weight;
    }
};
// Global tables shared by the encoding and decoding steps below.
unordered_map<unsigned char,unsigned int> freqTable;// byte -> number of occurrences in the input text
unordered_map<unsigned char,string> dictTable;// byte -> its Huffman code as a string of '0'/'1'
map<string,unsigned char> RedictTable;// Huffman code string -> byte (inverse of dictTable)
unordered_map<string,char> codeToCharTable;// 8-character bit string -> packed byte (used when compressing)
unordered_map<char,string> charToCodeTable;// packed byte -> 8-character bit string (used when decompressing)
priority_queue<HuffmanTreeNode*,vector<HuffmanTreeNode*>,cmp> pq;// priority queue of tree nodes; with cmp the smallest weight is on top
// Counts how often each byte of `text` occurs and adds the counts to the
// global freqTable (existing counts are kept, not reset).
void ComputeFreqTable(const string &text)
{
    for(size_t k=0;k<text.size();++k)
    {
        // Index by unsigned byte value so bytes >= 0x80 are not treated as negative.
        const unsigned char byte=text[k];
        ++freqTable[byte];
    }
    cout<<"ComputeFreqTable Finished"<<endl;
}
// Builds the Huffman tree from the global freqTable using the global queue pq.
// Returns the root, or a null pointer when freqTable is empty. The nodes are
// allocated with new and ownership passes to the caller; pq is empty afterwards.
HuffmanTreeNode * CreatHuffmanTree()
{
    // One leaf per distinct byte.
    for(const auto &entry:freqTable)
        pq.push(new HuffmanTreeNode(entry.first,entry.second,nullptr,nullptr));

    // Repeatedly merge the two lightest nodes until a single node is left.
    while(pq.size()>1)
    {
        HuffmanTreeNode *lighter=pq.top();
        pq.pop();
        HuffmanTreeNode *heavier=pq.top();
        pq.pop();
        if(heavier->weight<lighter->weight) swap(lighter,heavier);
        HuffmanTreeNode *parent=new HuffmanTreeNode('\0',lighter->weight+heavier->weight,lighter,heavier);
        heavier->bit=1;// the right child is reached with bit 1, the left keeps bit 0
        pq.push(parent);
    }

    HuffmanTreeNode *root=nullptr;
    if(!pq.empty())
    {
        root=pq.top();
        pq.pop();
    }
    cout<<"CreatHuffmanTree Finished"<<endl;
    return root;
}
// Depth-first walk that records the code of every leaf below `r`.
// `bin` holds the bits of the path walked so far and is restored before
// returning. The bit of the node where the walk starts is part of the path,
// so every code begins with that node's own bit.
void dictHelp(HuffmanTreeNode *r,string &bin)
{
    if(r==nullptr) return;

    bin.push_back(r->bit?'1':'0');
    const bool isLeaf=(r->left==nullptr&&r->right==nullptr);
    if(isLeaf)
    {
        // Fill both directions of the mapping at once.
        dictTable[r->leafchar]=bin;
        RedictTable[bin]=r->leafchar;
    }
    dictHelp(r->left,bin);
    dictHelp(r->right,bin);
    bin.pop_back();
}
// Fills the code tables for the tree rooted at `r` by walking it with
// dictHelp, starting from an empty path. A null `r` leaves the tables as is.
void ComputeDictTable(HuffmanTreeNode *r)
{
    string path;
    dictHelp(r,path);
    cout<<"ComputeDictTable Finished"<<endl;
}
// Translates `Text` byte by byte into one long string of '0'/'1' characters
// using dictTable. A byte with no entry contributes nothing (operator[]
// default-inserts an empty code for it).
string enCode(const string &Text)
{
    string bits;
    for(size_t k=0;k<Text.size();++k)
        bits.append(dictTable[Text[k]]);
    cout<<"encode Finished"<<endl;
    return bits;
}
// Converts `n` to a string of binary digits, least-significant bit first,
// padded on the right with '0' up to `bitnum` characters
// (e.g. n=5, bitnum=8 -> "10100000").
// `n` is expected to be non-negative. If the value needs more than `bitnum`
// digits, or `bitnum` is not positive, the digits are returned unpadded; the
// unsigned subtraction used before wrapped around in that case and requested
// an enormous pad string.
string ConvertDecToBinStr(int n,int bitnum)
{
    string bin;
    while(n)
    {
        bin.push_back(static_cast<char>('0'+n%2));
        n/=2;
    }
    if(bitnum>0&&bin.size()<static_cast<size_t>(bitnum))
        bin.append(static_cast<size_t>(bitnum)-bin.size(),'0');
    return bin;
}
// Fills the two global lookup tables that map each of the 256 byte values to
// its 8-character bit string (as produced by ConvertDecToBinStr) and back.
void CreatcodeToCharTable()
{
    int value=0;
    while(value<256)
    {
        const string bits=ConvertDecToBinStr(value,8);
        const char byte=value+'\0';// same int -> char conversion for both tables
        charToCodeTable[byte]=bits;
        codeToCharTable[bits]=byte;
        ++value;
    }
    cout<<"CreatcodeToCharTable Finished"<<endl;
}
// Packs a string of '0'/'1' characters into bytes, eight bits per byte.
//
// Output layout: the first character is '0' + (number of padding bits added to
// the final group, 0..7); every following character holds one group of eight
// bits. Inside a group the first bit is the least-significant bit of the byte,
// which is the same convention as the 8-character strings produced by
// ConvertDecToBinStr(value,8). Any character other than '1' counts as a 0 bit.
//
// The input is walked once by index. The earlier version re-copied the whole
// remaining bit string after every byte, which made packing quadratic in the
// input length.
string compressCode(const string &code)
{
    cout<<"try! : ";
    const size_t total=code.size();
    string compressText;
    compressText.reserve(total/8+2);
    compressText.push_back('0');// header placeholder, patched once the padding is known
    int len=0;
    for(size_t pos=0;pos<total;pos+=8)
    {
        const size_t remaining=total-pos;
        const size_t avail=remaining<8?remaining:8;
        if(avail<8) len=static_cast<int>(8-avail);// only the last group can be short
        unsigned char byte=0;
        for(size_t j=0;j<avail;++j)
        {
            if(code[pos+j]=='1') byte|=static_cast<unsigned char>(1u<<j);
        }
        compressText.push_back(static_cast<char>(byte));
    }
    compressText[0]=static_cast<char>('0'+len);
    cout<<"compressCode Finished"<<endl;
    return compressText;
}
// Expands packed text produced by compressCode back into a '0'/'1' string.
//
// The first character encodes the number of padding bits as '0' + count; each
// following byte expands to eight characters, least-significant bit first
// (the same convention as ConvertDecToBinStr(value,8)). The padding bits are
// then removed from the end. A count that is negative or larger than the
// number of expanded bits removes nothing.
//
// An empty input yields an empty string; previously it threw
// std::out_of_range from substr(1,...). The input is read in place instead of
// being copied first.
string decompressCode(const string &compressText)
{
    string newcode;
    if(!compressText.empty())
    {
        const int len=compressText[0]-'0';
        newcode.reserve((compressText.size()-1)*8);
        for(size_t k=1;k<compressText.size();++k)
        {
            const unsigned char byte=compressText[k];
            for(int j=0;j<8;++j)
                newcode.push_back(((byte>>j)&1)?'1':'0');
        }
        if(len>0&&static_cast<size_t>(len)<=newcode.size())
            newcode.resize(newcode.size()-len);
    }
    cout<<"decompressCode Finished"<<endl;
    return newcode;
}
// Table-driven decoder: grows a pending bit string one character at a time and
// emits a byte whenever the pending bits match an entry of RedictTable.
// Bits left over at the end that match no entry are dropped.
string ConvertCodeToText(const string &code)
{
    string decoded,pending;
    for(size_t k=0;k<code.size();++k)
    {
        pending.push_back(code[k]);
        const auto hit=RedictTable.find(pending);
        if(hit==RedictTable.end()) continue;
        decoded.push_back(hit->second);
        pending.clear();
    }
    cout<<"ConvertCodeToText Finished"<<endl;
    return decoded;
}
// Prints `Text` to standard output between two horizontal rules.
void disStr(const string &Text)
{
    const char *rule="------------------------";
    cout<<rule<<endl<<Text<<endl<<rule<<endl;
}
// Appends `size` pseudo-random characters to `Text` and echoes each one to
// standard output. Characters are drawn from the codes 33..125. The generator
// is re-seeded from the current time on every call.
void WriteTextByRand(string &Text,unsigned int size)
{
    srand(static_cast<unsigned>(time(nullptr)));
    const int low=33;
    const int span=126-33;
    for(unsigned int made=0;made!=size;++made)
    {
        const char ch=static_cast<char>(rand()%span+low);
        cout<<ch;
        Text.push_back(ch);
    }
}
// Reads the whole of "1.txt" (relative to the working directory) into `Text`.
// If the file cannot be opened a message is printed and `Text` is left
// untouched; the "Finished" line is printed in either case.
void WriteTextByFile(string &Text)
{
    ifstream in("1.txt");// input file name
    if(!in.is_open())
    {
        cout<<"can not find this file"<<endl;
    }
    else
    {
        // Slurp the file through a string stream in one go.
        stringstream contents;
        contents<<in.rdbuf();
        Text=contents.str();
    }
    in.close();
    cout<<"WriteTextByFile Finished"<<endl;
}
// Prints the processor time elapsed since `s` (a value previously returned by
// clock()) in whole milliseconds.
// clock() counts in units of 1/CLOCKS_PER_SEC seconds, so the tick difference
// has to be scaled; printing raw ticks as "ms" is only right on platforms
// where CLOCKS_PER_SEC happens to be 1000.
void prtime(clock_t s)
{
    const clock_t ticks=clock()-s;
    const long long ms=static_cast<long long>(ticks)*1000/CLOCKS_PER_SEC;
    cout<<"usetime : "<<ms<<" ms"<<endl;
}
char HelpPlus(const string &str,unsigned int &i,HuffmanTreeNode *r)
{
    unsigned int len=str.size();
    while(i<len&&r)
    {
        if(!r->left&&!r->right) {return r->leafchar;}
        int bit=str[++i]-'0';
        if(bit==1) r=r->right;
        else r=r->left;
    }
    return '\0';
}
// Tree-driven decoder: decodes `decompressText` symbol by symbol with HelpPlus,
// always starting from the tree root `r`, and returns the recovered text.
string DeCodeToText(string &decompressText,HuffmanTreeNode *r)
{
    string restored;
    // HelpPlus leaves `pos` on the last bit of the symbol it decoded; the loop
    // increment then moves on to the first bit of the next symbol.
    for(unsigned int pos=0;pos<decompressText.size();++pos)
        restored.push_back(HelpPlus(decompressText,pos,r));
    return restored;
}
// Demo driver: reads the input text, builds the Huffman tree and code tables,
// encodes and packs the text, unpacks and decodes it again, checks that the
// round trip reproduces the input, prints sizes and timings, and writes the
// packed text and the decoded text to files.
int main()
{
    clock_t start_time=clock();
    string PreText="";// original text
    //WriteTextByRand(PreText,10000);
    WriteTextByFile(PreText);
    prtime(start_time);
    //disStr(PreText);
    // Each step below is followed by a cumulative timing print.
    ComputeFreqTable(PreText);prtime(start_time);
    HuffmanTreeNode *huff=CreatHuffmanTree();prtime(start_time);
    ComputeDictTable(huff);prtime(start_time);
    CreatcodeToCharTable();prtime(start_time);
    string code=enCode(PreText);prtime(start_time);
    //cout<<code<<endl;
    string compressText=compressCode(code);prtime(start_time);
    //disStr(compressText);
    string decode=decompressCode(compressText);prtime(start_time);
    //cout<<decode<<endl;
    //string newtext=ConvertCodeToText(decode);// decode via the lookup table
    string newtext=DeCodeToText(decode,huff);// decode by walking the Huffman tree
    prtime(start_time);
    //disStr(newtext);
    // Round-trip checks: the unpacked bits and the decoded text must match the originals.
    if(code==decode) cout<<"Same Code"<<endl;
    else cout<<"Different Code"<<endl;
    if(PreText==newtext) cout<<"Same Text"<<endl;
    else cout<<"Different Text"<<endl;
    cout<<"PreText Length : "<<PreText.size()<<"   ComText Length : "<<compressText.size()<<endl;
    // Packed size as a percentage of the original size (not finite when the input is empty).
    cout<<"Rate : "<<100.0*compressText.size()/PreText.size()<<"%"<<endl;
    // Both files are opened in the default (text) mode and get a trailing newline.
    ofstream o("compressText.txt");
    o<<compressText<<endl;
    o.close();
    ofstream out("newtext.txt");
    out<<newtext<<endl;
    out.close();
    clock_t end_time=clock();
    cout<<"rt : "<<(end_time-start_time)*1.0/CLOCKS_PER_SEC<<" s"<<endl;
    return 0;
}

运行结果:
1.对随机字符:随机生成10000个字符,进行huffman编码,压缩率仅仅为95%左右

ComputeFreqTable Finished
usetime : 488 ms
CreatHuffmanTree Finished
usetime : 493 ms
ComputeDictTable Finished
usetime : 497 ms
CreatcodeToCharTable Finished
usetime : 504 ms
encode Finished
usetime : 508 ms
try! : compressCode Finished
usetime : 573 ms
decompressCode Finished
usetime : 577 ms
usetime : 581 ms
Same Code
Same Text
PreText Length : 10000   ComText Length : 9481
Rate : 94.81%
rt : 0.591 s

2.对英语文章:文章选自china daily,题材来源不同,共计12199字,多次测试,压缩率稳定在70%左右

WriteTextByFile Finished
usetime : 1 ms
ComputeFreqTable Finished
usetime : 3 ms
CreatHuffmanTree Finished
usetime : 3 ms
ComputeDictTable Finished
usetime : 3 ms
CreatcodeToCharTable Finished
usetime : 4 ms
encode Finished
usetime : 7 ms
try! : compressCode Finished
usetime : 61 ms
decompressCode Finished
usetime : 64 ms
usetime : 66 ms
Same Code
Same Text
PreText Length : 12199   ComText Length : 8489
Rate : 69.5877%
rt : 0.068 s

3.中文字符:文章选自《福尔摩斯探案集》东方探案部分,文字总计437700字,压缩率89%左右,花费时间较长,为342.452 s

WriteTextByFile Finished
usetime : 4 ms
ComputeFreqTable Finished
usetime : 59 ms
CreatHuffmanTree Finished
usetime : 60 ms
ComputeDictTable Finished
usetime : 60 ms
CreatcodeToCharTable Finished
usetime : 61 ms
encode Finished
usetime : 168 ms
try! : compressCode Finished
usetime : 342272 ms
decompressCode Finished
usetime : 342356 ms
usetime : 342420 ms
Same Code
Same Text
PreText Length : 437700   ComText Length : 389238
Rate : 88.928%
rt : 342.452 s

4.时间对比:将英文累积到432673,花费时间为193.675s,远远小于编码中文时间,压缩率稳定在70%左右

WriteTextByFile Finished
usetime : 7 ms
ComputeFreqTable Finished
usetime : 80 ms
CreatHuffmanTree Finished
usetime : 81 ms
ComputeDictTable Finished
usetime : 81 ms
CreatcodeToCharTable Finished
usetime : 82 ms
encode Finished
usetime : 183 ms
try! : compressCode Finished
usetime : 193529 ms
decompressCode Finished
usetime : 193597 ms
usetime : 193652 ms
Same Code
Same Text
PreText Length : 432673   ComText Length : 300885
Rate : 69.541%
rt : 193.675 s

总结:Huffman编码对英文文本的压缩效果较好。本程序的时间主要花费在将二进制字符串按八位一组转化为字符这一过程上(即compressCode这一步);但在实际的计算机中,数据本身就是以二进制位存储的,并不需要用'0'/'1'字符串来模拟这一步,这里仅仅是为了演示该过程。因此,Huffman编码在通信中还是有较大的应用价值

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
综合实验: 1. 问题描述 利用哈夫曼编码进行通信可以大大提高信道利用率,缩短信息传输时间,降低传输成本。这要求在发送端通过一个编码系统对待传输数据预先编码,在接收端将传来的数据进行译码(复原)。对于双工信道(即可以双向传输信息的信道),每端都需要一个完整的编/译码系统。试为这样的信息收发站编写一个哈夫曼码的编/译码系统。 2. 基本要求 一个完整的系统应具有以下功能: (1) I:初始化(Initialization)。从终端读入字符集大小n,以及n个字符和n个权值,建立哈夫曼树,并将它存于文件hfmTree中。 (2) E:编码(Encoding)。利用已建好的哈夫曼树(如不在内存,则从文件hfmTree中读入),对文件ToBeTran中的正文进行编码,然后将结果存入文件CodeFile中。 (3) D:译码(Decoding)。利用已建好的哈夫曼树将文件CodeFile中的代码进行译码,结果存入文件Textfile中。 (4) P:印代码文件(Print)。将文件CodeFile以紧凑格式显示在终端上,每行50个代码。同时将此字符形式的编码文件写入文件CodePrin中。 (5) T:印哈夫曼树(Tree printing)。将已在内存中的哈夫曼树以直观的方式(比如树)显示在终端上,同时将此字符形式的哈夫曼树写入文件TreePrint 中。 3. 测试数据 用下表给出的字符集和频度的实际统计数据建立哈夫曼树,并实现以下报文的编码和译码:“THIS PROGRAME IS MY FAVORITE”。 字符 (空格) A B C D E F G H I J K L M 频度 186 64 13 22 32 103 21 15 47 57 1 5 32 20 字符 N O P Q R S T U V W X Y Z 频度 57 63 15 1 48 51 80 23 8 18 1 16 1
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值