用Huffman树压缩字符文档,就是将8 bit的字符根据出现频率重新编码,使每个字符的编码在读入文件时都能被唯一确定。因此,任何一个字符的编码都不能是另一个字符编码的前缀(即前缀码)。Huffman编码是一种简单的压缩编码方式。
本文采用C语言风格实现,编译环境为gcc 4.9.2。为了使代码更加简单,代码中使用了C++的引用传递,因此源文件需要按C++(.cpp)编译。
首先,编写一个头文件“huffman.h”,实现Huffman树的基本操作,代码:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
/*
 * One node of a Huffman tree. Nodes live in a flat array and refer to each
 * other by array index; index 0 is used as the "no node" value.
 */
typedef struct {
    unsigned int weight;   /* weight (occurrence count) of this node */
    unsigned int parent;   /* index of the parent node, 0 while the node has none */
    unsigned int lchild;   /* index of the left child, 0 for a leaf */
    unsigned int rchild;   /* index of the right child, 0 for a leaf */
} HTNode, *HuffmanTree;    /* HuffmanTree: dynamically allocated array holding the tree */

/* Dynamically allocated table of NUL-terminated '0'/'1' code strings. */
typedef char **HuffmanCode;
/*
 * Select - pick the two lightest parentless nodes among HT[1..n].
 *
 * HT : node array; only entries 1..n are examined.
 * n  : highest index to examine.
 * s1 : receives the index of the lightest node whose parent is 0.
 * s2 : receives the index of the second-lightest such node.
 *
 * On equal weights the node with the lower index is preferred, so the result
 * is deterministic. If fewer than two parentless nodes exist, the missing
 * result(s) are set to 0 instead of being read from uninitialized storage.
 */
void Select(HuffmanTree HT,int n,int &s1,int &s2)
{
    int lightest = 0;   /* 0 means "none found yet" */
    int second = 0;
    int i;

    for (i = 1; i <= n; i++) {
        if (HT[i].parent != 0)
            continue;   /* already merged into a subtree */

        if (lightest == 0) {
            lightest = i;
        } else if (HT[i].weight < HT[lightest].weight) {
            /* New minimum: the previous minimum becomes the runner-up. */
            second = lightest;
            lightest = i;
        } else if (second == 0 || HT[i].weight < HT[second].weight) {
            second = i;
        }
    }

    s1 = lightest;
    s2 = second;
}
/*
 * HuffmanCoding - build a Huffman tree and the code table for n symbols.
 *
 * HT : receives a malloc'ed array of 2n nodes; slot 0 is unused, slots 1..n
 *      are the leaves and slot 2n-1 is the root.
 * HC : receives a malloc'ed array of n+1 pointers; HC[i] (1 <= i <= n) is a
 *      malloc'ed NUL-terminated string of '0'/'1' for symbol i, HC[0] is NULL.
 * w  : weights, read from w[1]..w[n] (1-based; w[0] is not read).
 * n  : number of symbols.
 *
 * A step to the left child is encoded as '0', a step to the right as '1'.
 * The caller owns HT, HC and every HC[i] and is responsible for freeing them.
 *
 * If n <= 1 or any allocation fails, nothing is leaked and both HT and HC are
 * set to NULL so the caller can detect the failure.
 */
void HuffmanCoding (HuffmanTree &HT,HuffmanCode &HC ,int *w,int n){
    int m, i;
    int s1, s2;
    int start;
    unsigned int c, f;
    char *cd;

    HT = NULL;
    HC = NULL;
    cd = NULL;
    if (n <= 1)
        return;

    m = 2 * n - 1;      /* total number of nodes in the finished tree */
    HT = (HuffmanTree)malloc((m + 1) * sizeof(HTNode));   /* slot 0 unused */
    if (HT == NULL)
        return;

    /* Leaves: one per symbol, no parent and no children yet. */
    for (i = 1; i <= n; i++) {
        HT[i].weight = (unsigned int)w[i];
        HT[i].parent = 0;
        HT[i].lchild = 0;
        HT[i].rchild = 0;
    }

    /* Internal nodes: repeatedly merge the two lightest roots of HT[1..i-1]. */
    for ( ; i <= m; ++i) {
        Select(HT, i - 1, s1, s2);
        HT[s1].parent = i;
        HT[s2].parent = i;
        HT[i].lchild = s1;
        HT[i].rchild = s2;
        HT[i].weight = HT[s1].weight + HT[s2].weight;
        HT[i].parent = 0;
    }

    /* Derive each code by walking from the leaf up to the root. */
    HC = (HuffmanCode)malloc((n + 1) * sizeof(char *));
    cd = (char *)malloc(n * sizeof(char));   /* a code has at most n-1 symbols */
    if (HC == NULL || cd == NULL)
        goto fail;
    for (i = 0; i <= n; i++)
        HC[i] = NULL;

    cd[n - 1] = '\0';
    for (i = 1; i <= n; ++i) {
        start = n - 1;   /* the code is built backwards from the terminator */
        for (c = (unsigned int)i, f = HT[i].parent; f != 0; c = f, f = HT[f].parent)
            cd[--start] = (HT[f].lchild == c) ? '0' : '1';
        HC[i] = (char *)malloc((n - start) * sizeof(char));
        if (HC[i] == NULL)
            goto fail;
        strcpy(HC[i], &cd[start]);
    }
    free(cd);
    return;

fail:
    if (HC != NULL) {
        for (i = 1; i <= n; i++)
            free(HC[i]);
    }
    free(HC);
    free(cd);
    free(HT);
    HC = NULL;
    HT = NULL;
}//HuffmanCoding
压缩文件的程序“压缩.cpp”,代码:
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#include"huffman.h"
int main()
{HuffmanTree HT;
HuffmanCode HC;
char te;
int w[257];//存放权值
int i;
unsigned int a;
for(a=0;a<=128;a++) w[a]=0;
FILE * f1;
FILE * f2;
char name[1004],name1[1004];
scanf("%s",name);
if((f1=fopen(name,"r"))==NULL){
printf("can't open this file.\n");
exit(0);
}
while(te=fgetc(f1)){
if(te==-1)break;
a=te;
w[a+1]++;
} //第一次读入计算权值
fseek(f1,0L,SEEK_SET);
HuffmanCoding(HT,HC,w,128);
for(i=1;i<=128;i++)
printf("%d %s\n",i-1,HC[i]);
printf("\n\n\n"); //输出Huffman字符和编码
strcpy(name1,name);
strcat(name1,".out");
if((f2=fopen(name1,"wb+"))==NULL){
printf("can't open this file.\n");
exit(0);
}
for(i=1;i<=128;i++)
fwrite(&w[i],4,1,f2);
int da[9];
int j=1;
unsigned char c1=0;
while(te=fgetc(f1)){
if(te==-1)break;
for(i=0;i<strlen(HC[te+1]);i++)
{if(j<9)
if(HC[te+1][i]=='1'){da[j]=1;j++;}
else {da[j]=0;j++;}
if(j==9)
{c1=da[1]*128+da[2]*64+da[3]*32+da[4]*16+da[5]*8+da[6]*4+da[7]*2+da[8];
//fwrite(&c1,1,1,f2);
a=0;
j=1;
}
}
}//粗略压缩
printf("%d\n",j-1);
for(i=1;i<j;i++)
printf("%d\n",da[i]);
j--;
fwrite(&j,4,1,f2);
for(i=1;i<=j;i++)
{fwrite(&da[i],4,1,f2);
printf("%d\n",da[i]);
}
j=1;
fseek(f1,0,SEEK_SET);
while(te=fgetc(f1)){
//printf("%c",te);
if(te==-1)break;
for(i=0;i<strlen(HC[te+1]);i++)
{if(j<9)
if(HC[te+1][i]=='1'){da[j]=1;j++;}
else {da[j]=0;j++;}
if(j==9)
{c1=da[1]*128+da[2]*64+da[3]*32+da[4]*16+da[5]*8+da[6]*4+da[7]*2+da[8];
fwrite(&c1,1,1,f2);
a=0;
j=1;
}
}
}//粗略压缩
fclose(f1);
fclose(f2);
return 0;
}
之后是解压
#include<stdio.h>
#include<stdlib.h>
#include"huffman.h"
int main()
{FILE * f1;
FILE * f2;
bool da[9];
HuffmanTree HT;
HuffmanCode HC;
char name[1004],name1[1004];
scanf("%s",name);
strcpy(name1,name);
strcat(name1,".out");
f1=fopen(name1,"rb");
int w[129];
int i,j;
for(i=1;i<=128;i++)
{fread(&w[i],4,1,f1);
printf("%d ",w[i]);
}
printf("\n");
fread(&j,4,1,f1);
printf("%d ",j);
for(i=1;i<=j;i++)
{fread(&da[i],4,1,f1);
printf("%d ",da[i]);
}
HuffmanCoding(HT,HC,w,128);
for(i=1;i<=128;i++)
{printf("%s\n",HC[i]);
}
//fclose(f1);
f2=fopen(name,"w");
//strcpy(name1,name);
//strcat(name1,".out");
// f2=fopen(name1,"rb");
int buf;
int a;
int *bu;
bu=&buf;
int end=0;
end=255;//вС0ср1
while(fread(bu,1,1,f1))
{a=buf/128;
buf=buf-a*128;
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
a=buf/64;
buf=buf-a*64;
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
a=buf/32;
buf=buf-a*32;
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
a=buf/16;
buf=buf-a*16;
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
a=buf/8;
buf=buf-a*8;
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
a=buf/4;
buf=buf-a*4;
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
a=buf/2;
buf=buf-a*2;
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
a=buf;
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
}
for(i=1;i<=j;i++)
{a=da[i];
if(a==0) end=HT[end].lchild;
else end=HT[end].rchild;
if(end<=128)
{//printf("%c",end-1);
fprintf(f2,"%c",end-1);
end=255;
}
}
}
P.S.
以上为算法实现,最终写完这篇博客居然用了一年;现在看以前的代码发现有很多不成熟的地方;
我最终实现了将压缩和解压合并为一个程序;源码已经上传,欢迎下载;