赫夫曼编码
通过赫夫曼编码可以节省存储空间,在计算机科学中有广泛的应用。本程序生成的文件也得到了有效的压缩,其中用到了大量的位操作。这些操作用C语言写多少有点不方便。
以下是hfmTree.h的内容,这是个公共的头文件,其余源文件都需要包含它。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//Node of the Huffman tree. Whole nodes are written to and read back from
//the tree file with fwrite/fread, so the field layout is also the file layout.
struct Node {
char ch;//symbol carried by a leaf
int weight;//weight entered for a leaf; sum of both children for a merged node
int filepoint,lf,rf;//file offset of this node and of its left/right child (0 = no child); strictly speaking this wastes space, but I did not want to define another struct to maintain it
struct Node* lchild;
struct Node* rchild;
};
//One entry of the code table used while encoding
struct Code {
char ch;//symbol this entry encodes
unsigned int code;//code bits, stored in the low `length` bits
int length;//number of bits in the code; 0 marks an unused slot while the table is built
};
以下是Init.c中的内容,这个文件主要是根据输入的数据建立赫夫曼树,并保存在hfmTree文件中,这个文件编码和译码要用到
#include "hfmTree.h"
int n=0;//number of symbols; set by input()
int filepoint=0;//next free file offset while makefilepoint() lays the tree out
struct Code* code;//code table, allocated in main() and filled by toCode()
struct Node* root;//root of the Huffman tree, set by input()
int input();
int showCode();
int toCode(struct Node* root,int ncode,int length);
int writetofile(const char *filename);
int static writefilepoint(struct Node* root,FILE* fout);
int static makefilepoint(struct Node* root);
int deltree(struct Node* tree);
/*
 * Entry point of the tree builder: reads the symbols and weights, builds
 * the Huffman tree, derives the code table, writes both to the file
 * "hfmTree" and prints the codes.
 *
 * Returns 0 on success and 1 when input, allocation, code generation or
 * writing the file fails.
 */
int main() {
    int status = 0;

    if (input() != 0 || root == NULL) {
        printf("input error!\n");
        return 1;
    }

    /* calloc leaves every length at 0, which toCode() treats as "slot free",
       and keeps the padding bytes written to the file deterministic. */
    code = (struct Code*)calloc((size_t)n, sizeof(struct Code));
    if (code == NULL) {
        printf("out of memory!\n");
        deltree(root);
        return 1;
    }

    if (toCode(root, 0, 0) != 0) {
        status = 1;
    }
    if (writetofile("hfmTree") != 0) {
        status = 1;
    }
    showCode();

    free(code);
    code = NULL;
    deltree(root);
    root = NULL;
    return status;
}
/*
 * Prints every entry of the global code table as "<char>:<bits>", most
 * significant code bit first, one entry per line.
 *
 * Always returns 0.
 */
int showCode() {
    int i;

    for (i = 0; i < n; i++) {
        int j;

        printf("%c:", code[i].ch);
        for (j = code[i].length - 1; j >= 0; j--) {
            /* Unsigned mask: 1<<31 on a signed int is undefined behavior. */
            if (code[i].code & (1u << j))
                printf("1");
            else
                printf("0");
        }
        printf("\n");
    }
    return 0;
}
/*
 * Walks the tree and fills the global code table with one entry per leaf.
 *
 * root   - subtree to process
 * ncode  - code bits accumulated on the path from the tree root
 *          (0 appended for a left branch, 1 for a right branch)
 * length - number of bits in ncode
 *
 * A node counts as a leaf unless it has both children. Each leaf is stored
 * in the first table slot whose length is still 0.
 *
 * Returns 0 on success; -1 if root is NULL, a code is longer than the 32
 * bits that struct Code can hold, or the table has no free slot.
 */
int toCode(struct Node* root,int ncode,int length) {
    int i;

    if (root == NULL) {
        return -1;
    }

    if (root->lchild != NULL && root->rchild != NULL) {
        /* Shift as unsigned: left-shifting a signed int into the sign bit
           is undefined behavior. */
        unsigned int prefix = (unsigned int)ncode << 1;
        int left = toCode(root->lchild, (int)prefix, length + 1);
        int right = toCode(root->rchild, (int)(prefix | 1u), length + 1);

        return (left != 0 || right != 0) ? -1 : 0;
    }

    if (length > 32) {
        /* The code would not fit in the 32-bit code field. */
        printf("code too long!\n");
        return -1;
    }

    for (i = 0; i < n; i++) {
        if (code[i].length == 0)
            break;
    }
    if (i == n) {
        /* No free slot: writing code[n] would run past the table. */
        printf("code table full!\n");
        return -1;
    }

    code[i].ch = root->ch;
    code[i].code = (unsigned int)ncode;
    code[i].length = length;
    return 0;
}
/*
 * Writes the tree and the code table to `filename`.
 *
 * File layout as produced here:
 *   offset 0 : n (number of symbols)
 *   offset 4 : file offset of the code table
 *   offset 8 : tree nodes, each at the offset makefilepoint() assigned to it
 *   then     : n struct Code entries
 *
 * Returns 0 on success, -1 if the file cannot be opened or a seek/write
 * fails.
 */
int writetofile(const char *filename) {
    FILE* fout = fopen(filename, "wb");
    int ok = 1;

    if (fout == NULL) {
        printf("file open error!\n");
        return -1;
    }

    if (fwrite(&n, sizeof(int), 1, fout) != 1) {
        ok = 0;
    }

    /* Nodes start right after the two header ints. */
    filepoint = 8;
    makefilepoint(root);
    writefilepoint(root, fout);

    /* After the layout pass filepoint is the first offset past the tree,
       which is where the code table goes. */
    if (fseek(fout, 4, SEEK_SET) != 0 ||
        fwrite(&filepoint, sizeof(int), 1, fout) != 1) {
        ok = 0;
    }
    if (fseek(fout, filepoint, SEEK_SET) != 0) {
        ok = 0;
    }
    if (n > 0 &&
        fwrite(code, sizeof(struct Code), (size_t)n, fout) != (size_t)n) {
        ok = 0;
    }

    /* writefilepoint() does not report failures, so check the stream. */
    if (ferror(fout)) {
        ok = 0;
    }
    if (fclose(fout) != 0) {
        ok = 0;
    }

    if (!ok) {
        printf("file write error!\n");
        return -1;
    }
    return 0;
}
/*
 * Writes the subtree rooted at `root` to `fout` in pre-order, placing each
 * node at the file offset stored in its filepoint field.
 *
 * Always returns 0.
 */
int static writefilepoint(struct Node* root,FILE* fout){
    if (root == NULL) {
        return 0;
    }

    fseek(fout, root->filepoint, SEEK_SET);
    fwrite(root, sizeof(struct Node), 1, fout);

    writefilepoint(root->lchild, fout);
    writefilepoint(root->rchild, fout);
    return 0;
}
/*
 * Assigns file offsets to the subtree rooted at `root` in pre-order,
 * advancing the global filepoint by one node size per node. lf/rf receive
 * the offset of the left/right child, or 0 when that child is absent.
 *
 * Always returns 0.
 */
int static makefilepoint(struct Node* root) {
    if (root == NULL) {
        return 0;
    }

    root->filepoint = filepoint;
    filepoint += sizeof(struct Node);

    /* The left subtree is laid out first, so the right child's offset is
       only known after the left recursion has advanced filepoint. */
    root->lf = (root->lchild != NULL) ? filepoint : 0;
    makefilepoint(root->lchild);

    root->rf = (root->rchild != NULL) ? filepoint : 0;
    makefilepoint(root->rchild);

    return 0;
}
/*
 * Frees every node of the tree rooted at `tree` (children first).
 * A NULL tree is accepted. Always returns 0.
 */
int deltree(struct Node* tree) {
    if (tree == NULL) {
        return 0;
    }

    deltree(tree->lchild);
    deltree(tree->rchild);
    free(tree);
    return 0;
}
/* Discards the rest of the current line on stdin. Portable replacement for
   fflush(stdin), which is undefined behavior on input streams. */
static void skip_line(void) {
    int c;

    while ((c = getchar()) != '\n' && c != EOF) {
        /* nothing */
    }
}

/* Returns the index of the non-NULL slot with the smallest weight, ignoring
   slot `skip` (pass -1 to ignore none); ties go to the lowest index.
   Returns -1 when no slot qualifies. */
static int lightest(struct Node** table, int count, int skip) {
    int i;
    int best = -1;

    for (i = 0; i < count; i++) {
        if (table[i] == NULL || i == skip)
            continue;
        if (best < 0 || table[i]->weight < table[best]->weight)
            best = i;
    }
    return best;
}

/*
 * Reads the symbol count and then one "<char> <weight>" pair per line from
 * stdin, and builds the Huffman tree by repeatedly merging the two lightest
 * trees. The lighter one becomes the left child.
 *
 * On success the globals n and root are set and 0 is returned.
 * On malformed input, n <= 0 or allocation failure everything allocated so
 * far is freed, root is left NULL and -1 is returned.
 */
int input() {
    int i;
    int live;
    int last = 0; /* slot that holds the most recently merged tree */
    struct Node* temp;
    struct Node** table;

    root = NULL;

    printf("Input:n=");
    if (scanf("%d", &n) != 1 || n <= 0) {
        return -1;
    }
    skip_line();

    table = (struct Node**)malloc(sizeof(struct Node*) * (size_t)n);
    if (table == NULL) {
        return -1;
    }
    for (i = 0; i < n; i++) {
        table[i] = NULL;
    }

    printf("example(they are in different line):w 34\ne 56\n");
    for (i = 0; i < n; i++) {
        /* calloc: nodes are later written to disk whole, so keep the
           unused fields and padding deterministic. */
        temp = (struct Node*)calloc(1, sizeof(struct Node));
        if (temp == NULL) {
            goto fail;
        }
        temp->lchild = NULL;
        temp->rchild = NULL;
        table[i] = temp;

        if (scanf("%c %d", &(temp->ch), &(temp->weight)) != 2) {
            goto fail;
        }
        skip_line();
    }

    /* Turn the array of leaves into a single tree. */
    for (live = n; live > 1; live--) {
        int sm1 = lightest(table, n, -1);
        int sm2 = lightest(table, n, sm1);

        temp = (struct Node*)calloc(1, sizeof(struct Node));
        if (temp == NULL) {
            goto fail;
        }
        temp->weight = table[sm1]->weight + table[sm2]->weight;
        temp->lchild = table[sm1];
        temp->rchild = table[sm2];
        table[sm2] = NULL;
        table[sm1] = temp;
        last = sm1;
    }

    root = table[last];
    free(table);
    return 0;

fail:
    for (i = 0; i < n; i++) {
        deltree(table[i]);
    }
    free(table);
    return -1;
}
以下是Encod.c的内容,这个文件主要是根据建立的赫夫曼树进行编码,要读的文件是A.txt,写出的文件是B.dat。当数据量较大时,通过试验10个阿拉伯数字的编码,B.dat的大小约为A.txt一半。
#include "hfmTree.h"
int n=0;//number of entries in the code table; set by readcode()
struct Code* code;//code table loaded by readcode()
int Encoding(char c);
int Encodtofile(const char* fnin,const char* fnout);
int showCode();
int readcode(const char* filename);
int deltree(struct Node* tree);//NOTE(review): declared here, but no definition appears in the Encod.c section
/*
 * Entry point of the encoder: loads the code table from "hfmTree", prints
 * it, and encodes A.txt into B.dat.
 *
 * Returns 0 on success and 1 if the code table cannot be loaded or the
 * encoding step reports an error.
 */
int main() {
    int rc;

    if (readcode("hfmTree") != 0) {
        /* Without a code table there is nothing to encode with. */
        free(code);
        code = NULL;
        return 1;
    }

    showCode();
    rc = Encodtofile("A.txt", "B.dat");

    free(code);
    code = NULL;
    return (rc != 0) ? 1 : 0;
}
/*
 * Looks up `c` in the global code table.
 * Returns its index (0 .. n-1), or n when the character is not in the table.
 */
int Encoding(char c) {
    int idx = 0;

    while (idx < n && code[idx].ch != c) {
        idx++;
    }
    return idx;
}
/*
 * Encodes the text file `fnin` into `fnout` using the global code table.
 *
 * Output layout:
 *   offset 0 : int, number of valid bits in the last data word (0..32)
 *   offset 4 : 32-bit data words; codes are packed starting at the most
 *              significant bit, and the last word is left-aligned
 * Words are written in native byte order, so on a little-endian machine
 * (e.g. Intel) the bytes of each word appear reversed in a hex viewer.
 *
 * Characters that are not in the code table are reported with
 * "Unknown Code!" and skipped.
 *
 * Returns 0 on success, -1 if a file cannot be opened or a write fails.
 */
int Encodtofile(const char* fnin,const char* fnout) {
    FILE* fin = fopen(fnin, "r");
    FILE* fout = fopen(fnout, "wb");
    unsigned int buf = 0;
    int length = 0; /* number of bits currently held in buf */
    int ch;
    int ok = 1;

    if (fin == NULL || fout == NULL) {
        printf("file open error!\n");
        /* Close whichever stream did open. */
        if (fin != NULL)
            fclose(fin);
        if (fout != NULL)
            fclose(fout);
        return -1;
    }

    /* Leave room for the header int; it is filled in at the end. */
    if (fseek(fout, 4, SEEK_SET) != 0) {
        ok = 0;
    }

    /* fgetc returns int, so EOF is distinguishable from every character;
       testing feof() before the read would process EOF as a character. */
    while ((ch = fgetc(fin)) != EOF) {
        int i = Encoding((char)ch);
        int clen;
        unsigned int bits;

        if (i == n) {
            printf("Unknown Code!\n");
            continue;
        }
        clen = code[i].length;
        bits = code[i].code;
        if (clen < 0 || clen > 32) {
            /* A length outside 0..32 cannot be packed into 32-bit words. */
            printf("Unknown Code!\n");
            continue;
        }

        if (length + clen > 32) {
            /* The code straddles two words. Here 1 <= length <= 32 and
               1 <= spill <= 32, so every shift count below is in range. */
            int spill = clen - 32 + length; /* bits that go to the next word */
            unsigned int head = (spill < 32) ? (bits >> spill) : 0u;
            unsigned int tailmask = (spill < 32) ? ((1u << spill) - 1u)
                                                 : 0xffffffffu;

            buf = (buf << (32 - length)) | head;
            if (fwrite(&buf, sizeof(int), 1, fout) != 1) {
                ok = 0;
            }
            buf = bits & tailmask;
            length = spill;
            continue;
        }

        /* clen == 32 is only possible with an empty buffer; shifting by 32
           would be undefined, so start from 0 in that case. */
        buf = ((clen < 32) ? (buf << clen) : 0u) | bits;
        length += clen;
    }

    /* Left-align the final partial word (shift counts of 0 or 32 skipped). */
    if (length > 0 && length < 32) {
        buf = buf << (32 - length);
    }
    if (fwrite(&buf, sizeof(int), 1, fout) != 1) {
        ok = 0;
    }
    if (fseek(fout, 0, SEEK_SET) != 0 ||
        fwrite(&length, sizeof(int), 1, fout) != 1) {
        ok = 0;
    }

    fclose(fin);
    if (fclose(fout) != 0) {
        ok = 0;
    }
    if (!ok) {
        printf("file write error!\n");
        return -1;
    }
    return 0;
}
/*
 * Prints every entry of the global code table as "<char>:<bits>", most
 * significant code bit first, one entry per line.
 *
 * Always returns 0.
 */
int showCode() {
    int i;

    for (i = 0; i < n; i++) {
        int j;

        printf("%c:", code[i].ch);
        for (j = code[i].length - 1; j >= 0; j--) {
            /* Unsigned mask: 1<<31 on a signed int is undefined behavior. */
            if (code[i].code & (1u << j))
                printf("1");
            else
                printf("0");
        }
        printf("\n");
    }
    return 0;
}
int readcode(const char* filename){
FILE* fin=fopen(filename,"rb");
if(fin==NULL) {
printf("file open error!\n");
return -1;
}
int offset;
fread(&n,sizeof(int),1,fin);
fread(&offset,sizeof(int),1,fin);
code=(struct Code*)malloc(sizeof(struct Code)*n);
fseek(fin,offset,SEEK_SET);
fread(code,sizeof(struct Code)*n,1,fin);
fclose(fin);
return 0;
}
以下是Decod.c的内容,用于译码。将上一步生成的文件B.dat译码成C.txt。比较C.txt和A.txt的内容可以发现,前面部分基本一致,但结尾处没能处理好,出现了差异。
#include "hfmTree.h"
int n=0;//number of symbols; read from the tree file by readtree()
struct Node* root;//root of the Huffman tree loaded by readtree()
int Dcoding(int temp,struct Node* root,int dep);
int Dcodingfromfile(const char* fnin,const char* fnout);
int Dcodingfromfile2(const char* fnin,const char* fnout);
int readtree(const char* filename);
int readnode(FILE* fin,struct Node* root,int seek);
//deltree is called from main() before its definition, so it needs a
//prototype; implicit function declarations are not valid since C99.
int deltree(struct Node* tree);
/*
 * Entry point of the decoder: loads the Huffman tree from "hfmTree" and
 * decodes B.dat into C.txt.
 *
 * Returns 0 on success and 1 if the tree cannot be loaded or decoding
 * reports an error.
 */
int main() {
    int rc;

    if (readtree("hfmTree") != 0) {
        /* The tree is unusable. Release only the root node itself: its
           child pointers may never have been initialised. */
        free(root);
        root = NULL;
        return 1;
    }

    rc = Dcodingfromfile2("B.dat", "C.txt");
    deltree(root);
    root = NULL;
    return (rc != 0) ? 1 : 0;
}
int Dcodingfromfile2(const char* fnin,const char* fnout){
FILE* fin,*fout;
fin=fopen(fnin,"rb");
fout=fopen(fnout,"wb");
if(fin==NULL||fout==NULL) {
printf("file open error!\n");
return -1;
}
fseek(fin,4,SEEK_SET);
unsigned int buf,buf2,buf3;
int ret;
fread(&buf,sizeof(int),1,fin);
fread(&buf2,sizeof(int),1,fin);
int pos2=32;//指明二缓冲有效位个数
while(1){
ret=Dcoding(buf,root,0);
char ch=(char)(ret&0xff);
fwrite(&ch,sizeof(char),1,fout);
ret=(ret&0xffff0000)>>16;
buf=buf<<ret;
if(pos2>ret){
buf=buf|(buf2>>(32-ret));
buf2=buf2<<ret;
pos2=pos2-ret;
}else{
fread(&buf3,sizeof(int),1,fin);
buf2=buf2|((buf3&(0xffffffff<<pos2))>>pos2);
buf=buf|(buf2>>(32-ret));
buf2=buf2<<ret;
//
buf2=buf2|((buf3&(0xffffffff>>(32-pos2)))<<(ret-pos2));
//buf2=buf2|(buf3&(0xffffffff>>(32-pos2)));
//pos2=32-pos2;
pos2=32-ret+pos2;
if(feof(fin)){
break;
}
}
}
int pos3,oldpos2=pos2-32+ret;
fseek(fin,0,SEEK_SET);
fread(&pos3,sizeof(int),1,fin);
if(ret-oldpos2-pos3>0){
pos3=ret-oldpos2-pos3;
}else{
pos3=32+pos3-ret+oldpos2;
}
while(pos3>0){
ret=Dcoding(buf,root,0);
char ch=(char)(ret&0xff);
fwrite(&ch,sizeof(char),1,fout);
ret=(ret&0xffff0000)>>16;
buf=buf<<ret;
if(pos3>32){
buf=buf|(buf2>>(32-ret));
buf2=buf2<<ret;
}
pos3=pos3-ret;
}
fclose(fin);
fclose(fout);
return 0;
}
//Earlier decoder variant built on two 32-bit buffers. It is not called from
//main() in this file (main calls Dcodingfromfile2). It decodes fnin into
//fnout with the global tree `root`, but stops when the input runs out: the
//bits still held in the buffers at that point are not decoded.
//Returns 0, or -1 if either file cannot be opened.
int Dcodingfromfile(const char* fnin,const char* fnout){
FILE* fin,*fout;
fin=fopen(fnin,"rb");
fout=fopen(fnout,"wb");
if(fin==NULL||fout==NULL) {
printf("file open error!\n");
return -1;
}
int pos1=32,pos2=32;//number of valid bits
unsigned int buf=0;//two-level buffering: buf is decoded from, buf2 refills it
unsigned int buf2=0;
//int pos=0;
fseek(fin,4,SEEK_SET);//skip the int at the start of the file
fread(&buf,sizeof(int),1,fin);
fread(&buf2,sizeof(int),1,fin);
while(1){
int ret=Dcoding(buf,root,0);
//writes the first byte of ret as stored in memory
//NOTE(review): that is the character only on a little-endian machine - confirm
fwrite(&ret,sizeof(char),1,fout);
ret=(ret&0xffff0000)>>16;//number of bits to shift in (the depth returned by Dcoding)
buf=buf<<ret;
if(pos2>=ret){
//buf2 still holds enough bits: move its top ret bits into the bottom of buf
buf=buf&(0xffffffff<<ret);
buf=buf|((buf2&((0xffffffff)<<(32-ret)))>>(32-ret));
buf2=buf2<<ret;
buf2=buf2&(0xffffffff<<ret);
pos2-=ret;
}else{
//buf2 runs short: take what it has, then read the next word;
//pos1 is the number of bits still missing
pos1=ret-pos2;
buf=buf&(0xffffffff<<ret);
buf=buf+((buf2&((0xffffffff)<<(32-pos2)))>>(32-ret));
//pos1=pos1-ret+pos2;
//pos2=0;
fread(&buf2,sizeof(int),1,fin);
if(feof(fin)){
//input exhausted: load the int stored at offset 0 into pos2 and stop
fseek(fin,0,SEEK_SET);
fread(&pos2,sizeof(int),1,fin);
break;
}
buf=buf+((buf2&((0xffffffff)<<(32-pos1)))>>(32-ret));
buf2=buf2<<pos1;
buf2=buf2&(0xffffffff<<pos1);
pos2=32-pos1;
pos1=32;
}
}
//the remaining bytes (no more than 8) still need to be handled
fclose(fin);
fclose(fout);
return 0;
}
/*
 * Decodes one symbol from the top bits of `temp`.
 *
 * temp - bit buffer; the next code bit is the most significant bit
 * root - node to start from
 * dep  - number of bits already consumed on the way to `root`
 *
 * Descends right on a 1 bit and left on a 0 bit until a node without
 * children is reached.
 *
 * Returns the character in the low 16 bits and the total depth (number of
 * bits consumed) in the high 16 bits.
 */
int Dcoding(int temp,struct Node* root,int dep){
    /* Work on an unsigned copy: left-shifting a negative int is undefined. */
    unsigned int bits = (unsigned int)temp;

    while (!(root->lchild == NULL && root->rchild == NULL)) {
        root = (bits & 0x80000000u) ? root->rchild : root->lchild;
        bits <<= 1;
        dep++;
    }

    /* Convert through unsigned char so that a negative char value cannot
       borrow from the depth stored in the upper half. */
    return (int)(unsigned char)root->ch + (dep << 16);
}
/* Frees a subtree built by readnode, including a partially built one. */
static void droptree(struct Node* node) {
    if (node == NULL) {
        return;
    }
    droptree(node->lchild);
    droptree(node->rchild);
    free(node);
}

/*
 * Loads the Huffman tree stored in `filename` into the global `root` and
 * the symbol count into the global `n`.
 *
 * The file starts with the int n; the root node is stored at offset 8.
 *
 * Returns 0 on success. On failure (file cannot be opened, short read,
 * corrupt child offsets, out of memory) returns -1 and leaves root NULL.
 */
int readtree(const char* filename){
    FILE* fin = fopen(filename, "rb");
    int rc = -1;

    root = NULL;
    if (fin == NULL) {
        printf("file open error!\n");
        return -1;
    }

    if (fread(&n, sizeof(int), 1, fin) != 1) {
        goto out;
    }
    root = (struct Node*)malloc(sizeof(struct Node));
    if (root == NULL) {
        goto out;
    }
    if (readnode(fin, root, 8) != 0) {
        /* readnode leaves a well-formed partial tree behind. */
        droptree(root);
        root = NULL;
        goto out;
    }
    rc = 0;

out:
    if (rc != 0) {
        printf("file read error!\n");
    }
    fclose(fin);
    return rc;
}

/*
 * Reads the node stored at file offset `seek` into `root` and recursively
 * loads its children from the offsets in its lf/rf fields (0 = no child).
 *
 * The child pointers stored in the file belong to the process that wrote
 * it, so they are always replaced: on return, both on success and on
 * failure, lchild/rchild of every node reached are either NULL or point to
 * nodes allocated here.
 *
 * Returns 0 on success, -1 on a seek/read failure, an invalid child offset
 * or allocation failure.
 */
int readnode(FILE* fin,struct Node* root,int seek){
    int lf;
    int rf;

    if (fseek(fin, seek, SEEK_SET) != 0 ||
        fread(root, sizeof(struct Node), 1, fin) != 1) {
        root->lchild = NULL;
        root->rchild = NULL;
        return -1;
    }

    lf = root->lf;
    rf = root->rf;
    root->lchild = NULL;
    root->rchild = NULL;

    /* The writer lays nodes out in pre-order, so a child always lives at a
       larger offset than its parent. Rejecting anything else also ensures
       that a corrupt file cannot make the recursion loop forever. */
    if ((lf != 0 && lf <= seek) || (rf != 0 && rf <= seek)) {
        return -1;
    }

    if (lf != 0) {
        root->lchild = (struct Node*)malloc(sizeof(struct Node));
        if (root->lchild == NULL) {
            return -1;
        }
        if (readnode(fin, root->lchild, lf) != 0) {
            return -1;
        }
    }

    if (rf != 0) {
        root->rchild = (struct Node*)malloc(sizeof(struct Node));
        if (root->rchild == NULL) {
            return -1;
        }
        if (readnode(fin, root->rchild, rf) != 0) {
            return -1;
        }
    }

    return 0;
}
/*
 * Frees every node of the tree rooted at `tree` (children first).
 * A NULL tree is accepted. Always returns 0.
 */
int deltree(struct Node* tree) {
    if (tree != NULL) {
        deltree(tree->lchild);
        deltree(tree->rchild);
        free(tree);
    }
    /* The function is declared to return int, so it must return a value. */
    return 0;
}
以上就是所有程序,执行的先后顺序是Init.c->Encod.c->Decod.c,其中要求预先写好A.txt,文件出现的字符应该在Init.c时输入。对于未知的编码,程序直接忽略,由于有一个文件结束符,文件末尾总会有一个未知的编码,好在这并不影响测试结果。
B.dat是二进制文件,以四个字节为一个单位写入。由于Intel是小端模式,用二进制查看器查看时需要注意字节顺序。