//============================================================================ //huffmanCompress.cpp //huffman compress uncompress //by leo //5.03.2011 //============================================================================ #include <iostream> #include <fstream> #include <string> #include <cmath> #include <iomanip> using namespace std; //---------------------------------------------------------------------------- struct node { unsigned char b; //记录字符 long count; //权重 int parent,lch,rch; //定义双亲,左孩子,右孩子 char bits[256]; //存放哈夫曼编码的数组 }nodes[512],tmp; //因为结点最多可达256,所有结点数最多可达511 //---------------------------------------------------------------------------- unsigned char ctoa(char a[]); //将数组的前八位转成二进制形式比特位 char *code(unsigned char temp,int leafnum); //寻找对应字符的编码串,并返回 void initCount(char* infile, long & flength);//统计文件中字符出现的频率,计算文件长度 void sortNode();//对结点进行冒泡排序,权重大的放在上面,编码时效率高 void creatHuffTree(int& leafNum); //根据频度建立huffman Tree void huffCode(int leafNum); //根据哈夫曼树编码 //对文件进行编码,写入新文件 void codeFile(char* infilename, char* outfilename,long flength,int leafnum,long& clength); //将字符编码对照表写入文件 void writeRefTable(char* outfilename,long flength,int leafnum,long& clength); void compress(char *infilename,char *outfilename);//将infilename压缩至outfilename void ctoa(unsigned char a,char code[]); //字符转a为二进制形式存入8位数组code中 //将buf字符串与nodes[i].bits[]中匹配,成功后对应的字符由c带回 int strcmp1(char buf[],struct node node[],int n,unsigned char &c); void strcpy1(char buf[],char a[],int j) ; //将字符串a中长度为j的部分复制到buf数组中 //从压缩文件infilename中读取压缩文件的相关信息 void readFileInfo(char* infilename,long &flength,long& clength,int& leafNum); void sortNode(int n); //对读入的编码对照表进行排序,长度短的排在前面 void readRefTable(char* infilename, long& clength,int n);//读入编码对照表,放入nodes[i].bits[]数组中 //将编码读入内容,进行解码工作 void uncompFile(char *infilename,char *outfilename,long& flength,long& clength,int n); void uncompress(char *infilename,char *outfilename);//将infilename文件解压到outfilename中 //---------------------------------------------------------------------------- //将数组的前八位转成二进制形式比特位 //---------------------------------------------------------------------------- unsigned char ctoa(char a[]) { unsigned char c=0; for(int i=0;i<8;i++) if(a[i]!=0) { c=c+(int)(a[i]-'0')*pow(2,8-1-i); } return c; } //---------------------------------------------------------------------------- //寻找对应字符的编码串,并返回 //---------------------------------------------------------------------------- char *code(unsigned char temp,int leafnum) { for(int i=0;i<leafnum;i++) if(temp==nodes[i].b) return nodes[i].bits; return NULL; } //---------------------------------------------------------------------------- //统计文件中字符出现的频率,计算文件长度 //---------------------------------------------------------------------------- void initCount(char* infile, long & flength) { ifstream ifsInfile(infile,ios::in|ios::binary); unsigned char temp; while(ifsInfile.peek()!=EOF) { ifsInfile.read((char *)&temp,sizeof(unsigned char)); //读入一个字符 nodes[temp].count++; //统计对应结点字符权重 flength++; //统计文件长度 } ifsInfile.close(); //关闭文件 } //---------------------------------------------------------------------------- //对结点进行冒泡排序,权重大的放在上面,编码时效率高 //---------------------------------------------------------------------------- void sortNode() { node tmp; for(int i=0; i<256-1; i++) for(int j=0;j<256-1-i;j++) if(nodes[j].count < nodes[j+1].count) { tmp=nodes[j]; nodes[j]=nodes[j+1]; nodes[j+1]=tmp; } } //---------------------------------------------------------------------------- //根据频度建立huffman Tree //---------------------------------------------------------------------------- void creatHuffTree(int& leafNum) { int i; int leafnum; //定义叶子结点 int pointnum; //定义总结点 for(i=0;i<256;i++) if(nodes[i].count==0) break; leafNum = leafnum=i; //哈夫曼树中叶子结点数 pointnum=2*leafnum-1; //哈夫曼树中总结点数目 long min; //尽量用long,如果文件过大,这里建树可能不是最优树了 int s1,s2; for(i=leafnum;i<pointnum;i++) { min=999999999; for(int j=0;j<i;j++) //挑权重最小的结点 if(nodes[j].parent==0 && nodes[j].count<min) { min=nodes[j].count; s1=j; } nodes[s1].parent=i; //填写第一个叶子结点信息 min=999999999; for(j=0;j<i;j++) //挑权重第二最小的结点 if(nodes[j].parent==0 && nodes[j].count < min) { min=nodes[j].count; s2=j; } nodes[s2].parent=i; nodes[i].count=nodes[s1].count+nodes[s2].count;//填写父结点信息 nodes[i].lch=s1; nodes[i].rch=s2; } } //---------------------------------------------------------------------------- //根据哈夫曼树编码 //---------------------------------------------------------------------------- void huffCode(int leafNum) { char tmp[256]; //定义临时变量,暂存编码 tmp[255]='/0'; //添加结束标志 int start; int c; //记录当前叶结点下标 int f; //存储父结点的下标 for(int i=0;i<leafNum;i++) { start=255; //另开始等于数组最后位 for(c=i,f=nodes[i].parent;f!=0;c=f,f=nodes[f].parent) //对叶结点进行编码 if(nodes[f].lch==c) tmp[--start]='0'; else tmp[--start]='1'; strcpy(nodes[i].bits,&tmp[start]); } } //---------------------------------------------------------------------------- //对文件进行编码,写入新文件 //---------------------------------------------------------------------------- void codeFile(char* infilename, char* outfilename,long flength,int leafnum,long& clength) { unsigned char temp; // long clength=8; //编码从偏移量8记录,统计压缩后编码长度加8 ifstream infile(infilename,ios::in|ios::binary); //打开待压缩的文件,以二进制形式io if (!infile) { cerr << "error: unable to open input file: " << infile << endl; exit(0); } infile.clear(); //清除流状态,防止以前用过此流,有残留 infile.seekg(0); //设置开始读时的位置 ofstream outfile(outfilename,ios::out|ios::binary);//打开压缩后将生成的文件 outfile.write((char *)&flength,sizeof(long)); //写入原文件长度 char buf[513]="/0"; //初始化编码缓冲区 outfile.seekp(8); //指针定向偏移量8 while(infile.peek()!=EOF) { infile.read((char *)&temp,sizeof(unsigned char));//读入字符 strcat(buf,code(temp,leafnum)); //检索出字符对应编码,连到buf[]中 while(strlen(buf)>=8) //当buf中字符长度大于8时,一直处理写入,直至小于8 { temp=ctoa(buf); //上面临时变量已经完成使命,可以赋新值了 outfile.write((char *)&temp,sizeof(unsigned char));//转成二进制写入 clength++; //统计代码结尾偏移加1,用于找到叶子结点位置 strcpy(buf,buf+8); //字符串前移八位 }//当此循环结束时,表示buf[]中已经小于8了,没到文件末尾,读下一个,继续,否则退出 }//while 此层循环退出时,表示已到末尾,再判断buf中是否写完,没写完,连满至少8个字符,再写一个字节,就够了 if(strlen(buf)>0) { strcat(buf,"0000000"); temp=ctoa(buf); //前八位转成二进制形式 outfile.write((char *)&temp,sizeof(unsigned char)); clength++; //统计代码结尾偏移加1,用于找到叶子结点位置 } outfile.seekp(4); /// outfile.write((char *)&clength,sizeof(long)); //写入文件中将记录叶子结点位置 infile.close(); } //---------------------------------------------------------------------------- //将字符编码对照表写入文件 //---------------------------------------------------------------------------- void writeRefTable(char* outfilename,long flength,int leafnum,long& clength) { ofstream outfile(outfilename,ios::app|ios::binary);//以追加模式打开压缩后将生成的文件 unsigned char temp; long bytelen; //记录编码以二进制存储时需要占多少个字节 outfile.clear(); outfile.seekp(clength); //将文件指针移到编码后面的第一位置,在此处记录叶子结点数 outfile.write((char *)&leafnum,sizeof(long)); //写入叶子结点数 for(int i=0;i<leafnum;i++) { outfile.write((char *)&nodes[i].b,sizeof(unsigned char));//写入字符 nodes[i].count=strlen(nodes[i].bits); //不再设置其他变量,权值这时已无使用价值,可以用相应结点的权值变量记录长度 outfile.write((char *)&nodes[i].count,sizeof(unsigned char));//写入长度的ASCII码 if(nodes[i].count%8==0) bytelen=nodes[i].count/8; else { bytelen=nodes[i].count/8+1; strcat(nodes[i].bits,"0000000"); //在编码后面补0,使其最后凑满8的倍数, //超过无妨,可以用bytelen控制好写入字节的长度 } for(int j=0;j<bytelen;j++) { temp=ctoa(nodes[i].bits); //将数组的前8位转换成二进制bit outfile.write((char *)&temp,sizeof(unsigned char)); strcpy(nodes[i].bits,nodes[i].bits+8); } }//此循环结束后就完成了编码对照表的写入 outfile.close(); } //---------------------------------------------------------------------------- //将infilename压缩至outfilename //---------------------------------------------------------------------------- void compress(char *infilename,char *outfilename) { long flength=0; //记录压缩前文件长度 long clength=8; //编码从偏移量8记录,统计压缩后编码长度加8 int leafnum; //定义叶子结点 // int pointnum; //定义总结点 for(int i=0;i<256;i++) { nodes[i].count=0; //初始化权重 nodes[i].b=(unsigned char)i; //初始化字符 } initCount(infilename,flength); sortNode(); creatHuffTree(leafnum); huffCode(leafnum); ofstream outfile(outfilename,ios::out|ios::binary);//打开压缩后将生成的文件 codeFile(infilename,outfilename,flength,leafnum,clength); writeRefTable(outfilename,flength,leafnum,clength); cout<< "success to compress !" <<endl; }//compress //---------------------------------------------------------------------------- //字符转a为二进制形式存入8位数组code中 //---------------------------------------------------------------------------- void ctoa(unsigned char a,char code[]) { int n=9; for(int i=0;i<n;i++) code[i]='0'; //整个字符串初始化 code[n-1]='/0'; //添加结束标志 n=n-2; int c=(int)a; while(c>0) { code[n--]=c%2+'0'; c=c/2; } } //---------------------------------------------------------------------------- //将buf字符串与nodes[i].bits[]中匹配,成功后对应的字符由c带回 //---------------------------------------------------------------------------- int strcmp1(char buf[],struct node node[],int n,unsigned char &c) { for(int i=0;i<n;i++) if(strcmp(buf,node[i].bits)==0) { c=node[i].b; return 1; } return 0; } //---------------------------------------------------------------------------- //将字符串a中长度为j的部分复制到buf数组中 //---------------------------------------------------------------------------- void strcpy1(char buf[],char a[],int j) { for(int i=0;i<j;i++) buf[i]=a[i]; buf[i]='/0'; } //---------------------------------------------------------------------------- //从压缩文件infilename中读取压缩文件的相关信息 //---------------------------------------------------------------------------- void readFileInfo(char* infilename,long &flength,long& clength,int& leafNum) { void ctoa(unsigned char a,char code[]); //需要调用的函数的声明 ifstream infile(infilename,ios::binary); if(!infile) { cerr<<"文件打开失败"<<endl; return; } infile.read((char *)&flength,sizeof(long)); //读入原始文件长度,用于解码时判断 infile.read((char *)&clength,sizeof(long)); //读入叶子结点起始位置 infile.seekg(clength); infile.read((char *)&leafNum,sizeof(int)); //读入叶子结点数 infile.close(); } //---------------------------------------------------------------------------- //对读入的编码对照表进行排序,长度短的排在前面 //---------------------------------------------------------------------------- void sortNode(int n) { for(int i=0;i<n;i++) { for(int j=0;j<n-i-1;j++) { if(nodes[j].count>nodes[j+1].count) { tmp=nodes[j]; nodes[j]=nodes[j+1]; nodes[j+1]=tmp; } } } } //---------------------------------------------------------------------------- //读入编码对照表,放入nodes[i].bits[]数组中 //---------------------------------------------------------------------------- void readRefTable(char* infilename, long& clength,int n) { char code[9]; //将字符转为二进制数组形式暂存 long clen; //临时变量,读取字符编码对照表时使用 unsigned char temp; //读取字符存放此临时变量 ifstream infile(infilename,ios::binary); infile.seekg(clength+4); //文件指针定位到字符编码对照表的起始 for(int i=0;i<n;i++) //逐个读入叶子结点数,将编码进行读入 { infile.read((char *)&nodes[i].b,sizeof(unsigned char)); //读入字符 infile.read((char *)&nodes[i].count,sizeof(unsigned char));//读入编码长度 clen=(int)nodes[i].count; int diff=clen%8; if(0==clen%8) //计算需要读取多少个字节 clen=clen/8; else clen=clen/8+1; nodes[i].bits[0]='/0'; //初始化,方便后面进行连接 for(int j=0;j<clen;j++) //连接编码,使之存入nodes[i].bits[]中 { infile.read((char *)&temp,1); ctoa(temp,code); strcat(nodes[i].bits,code); //将转换过来的编码进行连接 } int bitlen=strlen(nodes[i].bits); if(0!=diff) nodes[i].bits[bitlen-8+diff]='/0'; }//for(int i=0;i<n;i++) infile.close(); } //---------------------------------------------------------------------------- //将编码读入内容,进行解码工作 //---------------------------------------------------------------------------- void uncompFile(char *infilename,char *outfilename,long& flength,long& clength,int n) { unsigned char temp; //读取字符存放此临时变量 char code[9]; //将字符转为二进制数组形式暂存 long readlen=0; //记录已经读取的长度(读文件解码时用) long writelen=0; //记录已经写入的字节 ifstream infile(infilename,ios::binary); ofstream outfile(outfilename,ios::binary|ios::out); //打开编码后文件 if(!outfile) { cerr<<"输出文件打开失败"<<endl; return; } char buf[513]="/0"; //读入编码缓冲区 char buf1[257]="/0"; infile.seekg(8); while(1) // 读取编码,解压连入缓冲区 { while(readlen<(clength-8)&&strlen(buf)<=256) //读满缓冲区 { infile.read((char *)&temp,sizeof(temp)); ctoa(temp,code); //将字节转为数组 strcat(buf,code); readlen++; }//while while(strlen(buf)>=256) //处理缓冲区,直到少于256位,再读满它 { for(int i=0;i<strlen(buf);i++) { strcpy1(buf1,buf,i+1); //逐渐增多取,放入buf1,进行匹配 if(strcmp1(buf1,nodes,n,temp)==1) { outfile.write((char *)&temp,sizeof(unsigned char)); writelen++; strcpy(buf,buf+i+1); //缓冲区前移 break; } }//for if(writelen>=flength) break; //如果写入达到原文件长度,退出 }//while if(readlen>=(clength-8)/*编码长度*/||writelen>=flength) break; //如果写入或者读入编码完毕,退出 }//退出此循环后,还有未解码完成的buf[] //对buf[]缓冲的善后处理 while(writelen<flength) { for(int i=0;i<strlen(buf);i++) { strcpy1(buf1,buf,i+1); if(strcmp1(buf1,nodes,n,temp)==1) { outfile.write((char *)&temp,sizeof(unsigned char)); writelen++; strcpy(buf,buf+i+1); break; } }//for } infile.close(); //关闭文件 outfile.close(); } //---------------------------------------------------------------------------- //将infilename文件解压到outfilename中 //---------------------------------------------------------------------------- void uncompress(char *infilename,char *outfilename) { long flength; //定义原文件长度,从压缩后文件前四字节获取值 long clength; //获取编码长度后的第一偏移量,从压缩文件第五字节开始获取值 int n; //叶子结点数,从编码尾端开始获取 string str; //读取编码到字符串,好进行统一解码 ifstream infile(infilename,ios::binary); readFileInfo(infilename,flength,clength,n); readRefTable(infilename,clength,n); sortNode(n); uncompFile(infilename,outfilename, flength,clength,n); cout<< "success to uncompress !" <<endl; }//uncompress() //---------------------------------------------------------------------------- int main() { char infilename[256],outfilename[256],select[255]; while(1) { cout<<"command: compress: c ; ucompress: u ; exit: e"<<endl; cin>>select; if(!strcmp(select,"e")) exit(0); if(!(strcmp(select,"c") && strcmp(select,"u"))) { cout<<"source file:"<<endl; cin>>infilename; cout<<"objet file:"<<endl; cin>>outfilename; } if(!strcmp(select,"c")) compress(infilename,outfilename); else if(!strcmp(select,"u")) uncompress(infilename,outfilename); else if(!strcmp(select,"e")) exit(0); else cout<<"bad command!"<<endl; } return 0; } //============================================================================