打开一篇英文文章,统计该文章的每个字符频度,构建哈夫曼树,通过哈夫曼树进行编码译码。
步骤:
(1)打开原文件,创建哈夫曼树,依照哈夫曼树以0,1进行编码。将编码文件输出到code_file.dat文件中,将哈夫曼树输出到tree_file.txt中。
(2)打开编码文件(以字符'0'、'1'保存的文件),利用位运算将每八个字符合并为一个字节,以实现编码文件的压缩,并将压缩结果输出到.haf文件中。
(3)打开压缩文件,利用位运算展开压缩文件,得到原编码文件,输出编码文件到new_codefile.dat。
(4)打开哈夫曼树文件,将哈夫曼树重新读入内存,打开编码文件,利用哈夫曼树与编码还原为原始文件。
代码:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>
char glo_name[32]; // name of the file being processed, up to (not including) the first '.'; used to build output file names
// Node of the Huffman tree.
typedef struct tree_node
{
char data; // character stored in the node
int frequency; // occurrence count of the character
struct tree_node * lchild; // left child; NULL for a leaf
struct tree_node * rchild; // right child; NULL for a leaf
}tree_node;
/*
 * Count how often each ASCII character (0..127) occurs in array[0..len-1],
 * create one leaf node per distinct character and store the leaves in
 * ptr_array ordered by ascending frequency (ties keep ascending character
 * order).
 *
 * Bytes outside 0..127 are ignored instead of indexing the counter table
 * out of range.
 *
 * ptr_array must have room for one pointer per distinct character, i.e. at
 * most 128 entries.
 *
 * Returns the number of leaves stored. Returns 0 when the input is empty or
 * a node could not be allocated; in the latter case every node created so
 * far is freed again. The caller owns the returned nodes.
 */
int get_frequency(tree_node** ptr_array,char *array,int len)
{
    int ch[128]={0};
    int i;
    int j=0;

    if(ptr_array==NULL || array==NULL)
    {
        return 0;
    }

    for(i=0;i<len;i++)
    {
        /* Go through unsigned char so bytes >= 0x80 never become a negative index. */
        unsigned char byte=(unsigned char)array[i];
        if(byte<128)
        {
            ch[byte]++;
        }
    }

    for(i=0;i<128;i++)
    {
        if(ch[i]==0)
        {
            continue;
        }
        /* Exactly one node per character. */
        tree_node *ptr_new=malloc(sizeof *ptr_new);
        if(ptr_new==NULL)
        {
            perror("malloc");
            while(j>0)
            {
                j--;
                free(ptr_array[j]);
                ptr_array[j]=NULL;
            }
            return 0;
        }
        ptr_new->data=(char)i;
        ptr_new->frequency=ch[i];
        ptr_new->lchild=NULL;
        ptr_new->rchild=NULL;
        ptr_array[j]=ptr_new;
        j++;
    }

    /* Stable insertion sort by frequency; moves the pointers, not the fields. */
    for(i=1;i<j;i++)
    {
        tree_node *cur=ptr_array[i];
        int k=i;
        while(k>0 && ptr_array[k-1]->frequency>cur->frequency)
        {
            ptr_array[k]=ptr_array[k-1];
            k--;
        }
        ptr_array[k]=cur;
    }
    return j; /* number of distinct characters */
}
/*
 * Build a Huffman tree from len leaf nodes stored in ptr_array in ascending
 * frequency order.
 *
 * The two lowest-frequency nodes at the front of the array are merged
 * repeatedly; every merged node is inserted back so the rest of the array
 * stays sorted. Entries of ptr_array are consumed (set to NULL) as the nodes
 * move into the tree.
 *
 * Returns the root of the tree. With a single leaf that leaf itself is
 * returned (its code then has length zero). Returns NULL when len <= 0 or
 * when an internal node cannot be allocated.
 */
tree_node* build_haffman_tree(tree_node** ptr_array,int len)
{
    tree_node *ptr_new=NULL;
    int index;

    if(ptr_array==NULL || len<=0)
    {
        return NULL;
    }
    if(len==1)
    {
        /* Nothing to merge: do not touch ptr_array[1], which does not exist. */
        ptr_new=ptr_array[0];
        ptr_array[0]=NULL;
        return ptr_new;
    }

    for(index=0;index+1<len;index++)
    {
        /* Create the root of the new subtree from the two smallest nodes. */
        ptr_new=malloc(sizeof *ptr_new);
        if(ptr_new==NULL)
        {
            perror("malloc");
            return NULL;
        }
        ptr_new->frequency=ptr_array[index]->frequency+ptr_array[index+1]->frequency;
        ptr_new->data='0';
        ptr_new->lchild=ptr_array[index];
        ptr_new->rchild=ptr_array[index+1];
        ptr_array[index]=NULL;
        ptr_array[index+1]=NULL;

        if(index+2==len)
        {
            break; /* that was the last merge: ptr_new is the root */
        }

        /* Shift smaller nodes one slot left and drop the new node into the gap. */
        int j=index+2;
        while(j<len && ptr_new->frequency>ptr_array[j]->frequency)
        {
            ptr_array[j-1]=ptr_array[j];
            j++;
        }
        ptr_array[j-1]=ptr_new;
    }
    return ptr_new;
}
/*
 * Walk the Huffman tree and store, for every leaf, its code as a newly
 * allocated string of '0'/'1' characters in ptr[<character value>].
 *
 * ptr  - table indexed by character value; entries for characters 0..127
 *        are written, so it needs at least 128 slots. The caller owns the
 *        allocated strings.
 * root - current subtree (NULL is tolerated and ignored).
 * len  - depth of root, i.e. number of code characters collected so far;
 *        pass 0 for the top-level call.
 *
 * Returns ptr[0], as the original implementation did.
 *
 * The path buffer is static and shared by all recursion levels, so the
 * function is not reentrant.
 */
char * get_haffman_code(char **ptr,tree_node * root,int len)
{
    /* A tree over 128 symbols is at most 127 levels deep. */
    enum { MAX_CODE_LEN = 128 };
    static char code_array[MAX_CODE_LEN]={0}; /* code of the path from the root */

    if(root==NULL || len<0)
    {
        return *ptr;
    }
    if(root->lchild==NULL && root->rchild==NULL)
    {
        /* Leaf: copy the collected path into the table. */
        int j=root->data;
        if(j>=0 && j<128 && len<MAX_CODE_LEN)
        {
            char *code=malloc((size_t)len+1);
            if(code==NULL)
            {
                perror("malloc");
                return *ptr;
            }
            memcpy(code,code_array,(size_t)len);
            code[len]='\0';
            ptr[j]=code;
        }
    }
    else if(len<MAX_CODE_LEN-1)
    {
        code_array[len]='0';
        get_haffman_code(ptr,root->lchild,len+1);
        code_array[len]='1';
        get_haffman_code(ptr,root->rchild,len+1);
    }
    else
    {
        /* Deeper than any valid tree: refuse to write past the buffer. */
        fprintf(stderr,"get_haffman_code: tree too deep\n");
    }
    return *ptr;
}
/*
 * Write the Huffman encoding of buffer[0..len-1] to "code_file.dat" as a
 * sequence of '0'/'1' characters.
 *
 * buffer - text to encode.
 * ptr    - code table indexed by character value (entries 1..127 are used);
 *          characters without a code, NUL bytes and non-ASCII bytes are
 *          skipped.
 * len    - number of bytes in buffer.
 *
 * The file is truncated first so that encoding twice does not append the
 * new code to the previous one.
 */
void creat_code_file(char * buffer,char **ptr,int len)
{
    FILE *fp=fopen("code_file.dat","wb");
    if(fp==NULL)
    {
        printf("fopen error\n");
        return;
    }
    for(int i=0;i<len;i++)
    {
        int tmp=buffer[i];
        /* The upper bound matters where plain char is unsigned. */
        if(tmp>0 && tmp<128 && ptr[tmp]!=NULL)
        {
            size_t code_len=strlen(ptr[tmp]);
            if(code_len>0 && fwrite(ptr[tmp],1,code_len,fp)!=code_len)
            {
                printf("fwrite error\n");
                break;
            }
        }
    }
    if(fclose(fp)!=0)
    {
        printf("fclose error\n");
    }
}
/*
 * Decode a sequence of '0'/'1' characters with the Huffman tree and write
 * the recovered text to "new_<glo_name>.txt".
 *
 * root        - root of the Huffman tree.
 * code_buffer - code characters; '0' follows the left child, any other
 *               character follows the right child.
 * size        - number of characters in code_buffer.
 *
 * Decoding stops with a message on stderr when the code leads to a missing
 * child (corrupt code or tree) instead of dereferencing NULL.
 */
void translate(tree_node * root,char * code_buffer,int size)
{
    if(root==NULL || code_buffer==NULL)
    {
        fprintf(stderr,"translate: no Huffman tree or code data\n");
        return;
    }

    /* Large enough for "new_" + glo_name + ".txt". */
    char new_name[sizeof glo_name+16]={0};
    snprintf(new_name,sizeof new_name,"new_%s.txt",glo_name);
    int fd=open(new_name,O_CREAT|O_RDWR|O_TRUNC,0666);
    if(fd==-1)
    {
        perror("open");
        return;
    }

    tree_node *ptr_tmp=root;
    for(int i=0;i<size;i++)
    {
        printf("%.2f%%\r",((double)i/size*100)); /* progress display */
        fflush(stdout);

        ptr_tmp=(code_buffer[i]=='0')?ptr_tmp->lchild:ptr_tmp->rchild;
        if(ptr_tmp==NULL)
        {
            fprintf(stderr,"\ntranslate: code does not match the tree at position %d\n",i);
            break;
        }
        if(ptr_tmp->lchild==NULL && ptr_tmp->rchild==NULL)
        {
            /* Reached a leaf: emit its character and restart at the root. */
            if(write(fd,&ptr_tmp->data,1)!=1)
            {
                perror("write");
                break;
            }
            ptr_tmp=root;
        }
    }
    close(fd);
}
/*
 * Compress a NUL-terminated string of '0'/'1' characters into
 * "<glo_name>.haf".
 *
 * File layout: the character count as a native int, followed by the bits
 * packed eight per byte, most significant bit first. The last byte is
 * padded with zero bits on the right.
 *
 * Only the character '1' sets a bit; every other character is stored as 0.
 * The input string is not modified.
 */
void pack(char *ch)
{
    /* Large enough for glo_name + ".haf". */
    char new_name[sizeof glo_name+8]={0};
    snprintf(new_name,sizeof new_name,"%s.haf",glo_name);

    FILE *fp=fopen(new_name,"wb");
    if(fp==NULL)
    {
        printf("open file error\n");
        return;
    }

    int count=(int)strlen(ch);          /* number of code characters */
    int num=count/8;                    /* number of completely filled bytes */
    int left=count%8;                   /* bits in the final partial byte */
    int bytes=(left!=0)?num+1:num;      /* bytes needed for all bits */
    printf("\n编码文件共占%d字节\n压缩后,占%d字节\n",count,(int)sizeof count+bytes);

    /* Unsigned bytes: shifting into the sign bit of a signed char is undefined. */
    unsigned char *p=calloc(bytes>0?(size_t)bytes:1,1);
    if(p==NULL)
    {
        perror("calloc");
        fclose(fp);
        return;
    }

    for(int i=0;i<count;i++)
    {
        if(ch[i]=='1')
        {
            p[i/8]|=(unsigned char)(0x80u>>(i%8));
        }
    }

    if(fwrite(&count,sizeof count,1,fp)!=1 ||
       (bytes>0 && fwrite(p,1,(size_t)bytes,fp)!=(size_t)bytes))
    {
        printf("write file error\n");
    }
    free(p);
    if(fclose(fp)!=0)
    {
        printf("write file error\n");
    }
}
/*
 * Expand "<glo_name>.haf" back into a sequence of '0'/'1' characters and
 * write it to "new_codefile.dat".
 *
 * The input starts with the bit count as a native int, followed by the bits
 * packed eight per byte, most significant bit first. A truncated file or a
 * negative count is reported and nothing is written.
 */
void unpack()
{
    /* Large enough for glo_name + ".haf". */
    char new_name[sizeof glo_name+8]={0};
    snprintf(new_name,sizeof new_name,"%s.haf",glo_name);

    FILE *fp=fopen(new_name,"rb");
    if(fp==NULL)
    {
        printf("open file error\n");
        return;
    }

    int count=0;
    if(fread(&count,sizeof count,1,fp)!=1 || count<0)
    {
        printf("read file error\n");
        fclose(fp);
        return;
    }

    size_t bytes=((size_t)count+7)/8; /* packed bytes holding count bits */
    unsigned char *p=malloc(bytes>0?bytes:1);
    char *buffer=malloc((size_t)count+1);
    if(p==NULL || buffer==NULL)
    {
        perror("malloc");
        free(p);
        free(buffer);
        fclose(fp);
        return;
    }
    if(fread(p,1,bytes,fp)!=bytes)
    {
        printf("read file error\n");
        free(p);
        free(buffer);
        fclose(fp);
        return;
    }
    fclose(fp);

    /* Test each bit from the most significant one down. */
    for(int i=0;i<count;i++)
    {
        buffer[i]=(p[i/8]&(0x80u>>(i%8)))?'1':'0';
    }
    buffer[count]='\0';
    free(p);

    int fd=open("new_codefile.dat",O_CREAT|O_TRUNC|O_RDWR,0666);
    if(fd==-1)
    {
        perror("open");
        free(buffer);
        return;
    }
    if(write(fd,buffer,(size_t)count)!=(ssize_t)count)
    {
        perror("write");
    }
    free(buffer);
    close(fd);
}
/*
 * Serialize the tree to fd in pre-order: each node contributes its data
 * byte, each missing child contributes a '*' marker.
 */
void cread_tree_file(tree_node * root,int fd)
{
    if(root==NULL)
    {
        const char marker='*'; /* stands for "no node here" */
        write(fd,&marker,sizeof marker);
        return;
    }

    write(fd,&root->data,sizeof root->data);
    cread_tree_file(root->lchild,fd);
    cread_tree_file(root->rchild,fd);
}
/*
 * Save the Huffman tree to "tree_file.txt", replacing any previous content.
 * O_TRUNC keeps bytes of an older, longer tree file from surviving behind
 * the new data.
 */
void write_tree(tree_node * root)
{
    int fd=open("tree_file.txt",O_RDWR|O_CREAT|O_TRUNC,0666);
    if(fd==-1)
    {
        perror("write_tree");
        return;
    }
    cread_tree_file(root,fd);
    close(fd);
}
//重建树
tree_node* rebuild_tree(char *buffer)
{
if(*buffer==0)
{
return NULL;
}
char value;
static char *buf;
buf=buffer;
value=*buffer;
buf++;
tree_node* root;
if(value=='*')//'以*为结束字符' 原文本中不能有*号 防止冲突
{
root=NULL;
}
else
{
root=(tree_node*)malloc(1*sizeof(tree_node));
root->data=value;
root->lchild=rebuild_tree(buf);
root->rchild=rebuild_tree(buf);
}
return root;
}
/*
 * Load the Huffman tree stored in "tree_file.txt".
 *
 * Returns the root of the rebuilt tree, or NULL when the file cannot be
 * opened or read.
 */
tree_node * read_tree()
{
    int fd=open("tree_file.txt",O_RDONLY);
    if(fd==-1)
    {
        perror("fd");
        return NULL;
    }

    char buffer[512]={0};
    /* Leave the last byte untouched so the text is always NUL-terminated. */
    ssize_t len=read(fd,buffer,sizeof buffer-1);
    if(len==-1)
    {
        perror("read");
        close(fd);
        return NULL;
    }
    close(fd);
    return rebuild_tree(buffer);
}
/* Free every node of the tree rooted at node. */
static void creat_code_free_tree(tree_node *node)
{
    if(node==NULL)
    {
        return;
    }
    creat_code_free_tree(node->lchild);
    creat_code_free_tree(node->rchild);
    free(node);
}

/*
 * Ask for a file name, build the Huffman tree of that file, save the tree
 * to tree_file.txt and the '0'/'1' encoding of the file to code_file.dat.
 *
 * glo_name is set to the entered name up to the first '.'.
 * The function returns early, after reporting the problem, when the file
 * cannot be read, is empty, or contains fewer than two distinct characters.
 */
void creat_code()
{
    char name[32]={0};
    printf("请输入要编码的文件:\n");
    if(scanf("%31s",name)!=1) /* bounded: name holds 31 characters + NUL */
    {
        return;
    }
    memset(glo_name,0,sizeof glo_name);
    sscanf(name,"%31[^.]",glo_name);

    int fd=open(name,O_RDONLY);
    if(fd==-1)
    {
        perror("open");
        return;
    }
    struct stat st;
    if(fstat(fd,&st)==-1)
    {
        perror("fstat");
        close(fd);
        return;
    }
    if(st.st_size<=0)
    {
        printf("文件为空,无法编码\n");
        close(fd);
        return;
    }
    char *buffer=malloc((size_t)st.st_size);
    if(buffer==NULL)
    {
        perror("malloc");
        close(fd);
        return;
    }
    ssize_t len=read(fd,buffer,(size_t)st.st_size);
    if(len==-1)
    {
        perror("read");
    }
    close(fd);
    if(len<=0)
    {
        free(buffer);
        return;
    }
    int size=(int)len;

    /* get_frequency creates at most one leaf per ASCII character. */
    tree_node *ptr_array[128]={NULL};
    /* Create the leaves, sorted by frequency. */
    int count=get_frequency(ptr_array,buffer,size);
    if(count<2)
    {
        /* A Huffman tree needs at least two leaves to merge. */
        printf("文件中不同的字符少于两种,无法构建哈夫曼树\n");
        for(int i=0;i<count;i++)
        {
            free(ptr_array[i]);
        }
        free(buffer);
        return;
    }

    /* Build the tree. */
    tree_node *root=build_haffman_tree(ptr_array,count);
    if(root==NULL)
    {
        free(buffer);
        return;
    }
    write_tree(root);
    printf("树写入文件:tree_file.txt\n");

    char *ptr[200]={NULL}; /* code string of every character, indexed by character value */
    get_haffman_code(ptr,root,0);
    creat_code_file(buffer,ptr,size);
    printf("编码成功,生成文件code_file.dat\n");
    printf("===================================\n");

    for(int i=0;i<200;i++)
    {
        free(ptr[i]);
    }
    creat_code_free_tree(root);
    free(buffer);
}
/*
 * Read a whole file into a newly allocated, NUL-terminated buffer.
 * Stores the number of bytes read in *len_out. Returns NULL after reporting
 * the error when the file cannot be opened or read; the caller frees the
 * buffer.
 */
static char *menu_read_file(const char *path,ssize_t *len_out)
{
    int fd=open(path,O_RDONLY);
    if(fd==-1)
    {
        perror("open");
        return NULL;
    }
    struct stat st;
    if(fstat(fd,&st)==-1)
    {
        perror("fstat");
        close(fd);
        return NULL;
    }
    char *data=calloc((size_t)st.st_size+1,1);
    if(data==NULL)
    {
        perror("calloc");
        close(fd);
        return NULL;
    }
    ssize_t len=read(fd,data,(size_t)st.st_size);
    if(len==-1)
    {
        perror("read");
        free(data);
        close(fd);
        return NULL;
    }
    close(fd);
    *len_out=len;
    return data;
}

/* Free every node of the tree rooted at node. */
static void menu_free_tree(tree_node *node)
{
    if(node==NULL)
    {
        return;
    }
    menu_free_tree(node->lchild);
    menu_free_tree(node->rchild);
    free(node);
}

/*
 * Interactive main loop: show the menu, read the chosen step and run it.
 *   1 encode a file, 2 pack code_file.dat, 3 unpack a .haf file,
 *   4 decode new_codefile.dat, 5 quit.
 * The loop also ends when standard input reaches end-of-file.
 */
void menu(void)
{
    while(1)
    {
        printf("欢迎使用本程序\n");
        int a=0;
        printf("1:文件编码\n2:编码文件压缩\n3:压缩文件解压\n4:文件译码\n5:退出程序\n");
        printf("请输入要执行的步骤:\n");

        int got=scanf("%d",&a);
        if(got==EOF)
        {
            break;
        }
        if(got!=1)
        {
            /* Not a number: discard the rest of the line, otherwise the same
               characters would be rejected again forever. */
            int c;
            while((c=getchar())!='\n' && c!=EOF)
            {
            }
            if(c==EOF)
            {
                break;
            }
            continue;
        }

        if(a==1)
        {
            creat_code();
        }
        else if(a==2)
        {
            ssize_t len1=0;
            char *buffer1=menu_read_file("code_file.dat",&len1);
            if(buffer1==NULL)
            {
                continue;
            }
            pack(buffer1);
            free(buffer1);
            printf("压缩成功,生成文件%s.haf\n",glo_name);
            printf("===================================\n");
        }
        else if(a==3)
        {
            char name[32]={0};
            printf("请输入要解压的文件(.haf文件)\n");
            if(scanf("%31s",name)!=1) /* bounded: name holds 31 characters + NUL */
            {
                break;
            }
            memset(glo_name,0,sizeof glo_name);
            sscanf(name,"%31[^.]",glo_name);
            unpack();
            printf("解压成功,生成文件new_codefile.dat\n");
            printf("===================================\n");
        }
        else if(a==4)
        {
            ssize_t len1=0;
            char *buffer1=menu_read_file("new_codefile.dat",&len1);
            if(buffer1==NULL)
            {
                continue;
            }
            tree_node *root=read_tree();
            if(root==NULL)
            {
                fprintf(stderr,"menu: could not load the Huffman tree\n");
                free(buffer1);
                continue;
            }
            translate(root,buffer1,(int)len1);
            menu_free_tree(root);
            free(buffer1);
            printf("译码成功\n");
            printf("===================================\n");
        }
        else if(a==5)
        {
            break;
        }
    }
}
/*
 * Print the tree in pre-order (debugging aid): each node prints its data
 * character, each missing child prints 'q'.
 */
void pre_tree(tree_node * root)
{
    if(root==NULL)
    {
        printf("q");
        return;
    }
    printf("%c",root->data);
    pre_tree(root->lchild);
    pre_tree(root->rchild);
}
/* Program entry point: run the interactive menu, then exit successfully. */
int main(void)
{
    menu();
    return 0;
}