赫夫曼编码\译码

最新推荐文章于 2024-07-26 10:34:47 发布

星星92014

最新推荐文章于 2024-07-26 10:34:47 发布

阅读量901

点赞数

分类专栏：数据结构 c语言文章标签： c语言计算机编码位运算 encoding

本文链接：https://blog.csdn.net/star92014/article/details/46277473

版权

c语言同时被 2 个专栏收录

7 篇文章 1 订阅

订阅专栏

数据结构

4 篇文章 1 订阅

订阅专栏

赫夫曼编码

通过赫夫曼编码可以节省存储空间，在计算机科学中有广泛的应用。本程序生成的文件也得到了有效的压缩，其中用到了大量的位操作，这些操作用c语言写多少有点不方便。

以下是hfmTree.h的内容，这是个公共的头文件，其余源文件都需要包含它。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// A node of the Huffman tree.
// The whole struct is written to and read back from the tree file as raw
// bytes, so the field order and types are part of the file format.
struct Node {
	// Symbol carried by a leaf node.
	char ch;
	// Weight of the node; for a merged node, the sum of its two children.
	int weight;
	// File offsets used when the tree is stored on disk: filepoint is the
	// offset of this node, lf/rf are the offsets of the left/right child
	// (0 means "no child"). Author's note: keeping them here wastes memory,
	// but avoids maintaining a second struct just for the file layout.
	int filepoint,lf,rf;//file offsets: self, left child, right child
	// In-memory child links; NULL when the child is absent.
	struct Node* lchild;
	struct Node* rchild;
};

// One entry of the code table used while encoding.
// The table is written to and read back from the tree file as raw bytes,
// so the field order and types are part of the file format.
struct Code {
	// The symbol this entry encodes.
	char ch;
	// The code bits, stored right-aligned (the last bit of the code is bit 0).
	unsigned int code;
	// Number of valid bits in `code`.
	int length;
};

以下是Init.c中的内容，这个文件主要是根据输入的数据建立赫夫曼树，并保存在hfmTree文件中，编码和译码时都要用到这个hfmTree文件。

#include "hfmTree.h"

/* Number of distinct symbols; set by input(). */
int n=0;
/*
 * Running byte offset used while laying the tree out in the output file.
 * writetofile() starts it at 8, makefilepoint() advances it by one node per
 * visited node; afterwards it is the offset at which the code table starts.
 */
int filepoint=0;
/* Code table with n entries, filled by toCode(). */
struct Code* code;
/* Root of the Huffman tree built by input(). */
struct Node* root;

/*
 * Prototypes. Storage-class specifiers come first ("static int", not
 * "int static") and parameterless functions are declared with (void) so the
 * compiler can check the calls.
 */
int input(void);
int showCode(void);
int toCode(struct Node* root,int ncode,int length);
int writetofile(const char *filename);
static int writefilepoint(struct Node* root,FILE* fout);
static int makefilepoint(struct Node* root);
int deltree(struct Node* tree);

/*
 * Entry point of the tree-building tool.
 *
 * Reads the symbols and weights, builds the Huffman tree, derives the code
 * table, stores tree and table in the file "hfmTree" and prints the codes.
 *
 * Returns 0 on success and 1 when the input is unusable, memory runs out or
 * the file cannot be written.
 */
int main() {
	int status=0;

	if(input()!=0||root==NULL||n<=0) {
		printf("invalid input!\n");
		return 1;
	}
	/*
	 * calloc zeroes every entry. toCode() treats length==0 as "slot still
	 * free", and the whole table is later written to disk, so no field may
	 * be left uninitialised.
	 */
	code=(struct Code*)calloc((size_t)n,sizeof(struct Code));
	if(code==NULL) {
		printf("out of memory!\n");
		deltree(root);
		return 1;
	}
	if(toCode(root,0,0)!=0) {
		printf("code table error!\n");
		status=1;
	}
	/* The file is written only after toCode() because it contains the table. */
	if(status==0&&writetofile("hfmTree")!=0) {
		status=1;
	}
	if(status==0) {
		showCode();
	}
	free(code);
	code=NULL;
	deltree(root);
	root=NULL;
	return status;
}


/*
 * Prints every entry of the code table as "<symbol>:<bits>", one per line,
 * most significant code bit first.
 *
 * Returns 0. (The original fell off the end of a non-void function.)
 */
int showCode() {
	int i,j;
	for(i=0; i<n; i++) {
		printf("%c:",code[i].ch);
		for(j=code[i].length-1; j>=0; j--) {
			/* 1u keeps the shift defined when j reaches 31. */
			if((code[i].code)&(1u<<j))
				printf("1");
			else
				printf("0");
		}
		printf("\n");
	}
	return 0;
}

/*
 * Walks the tree and fills the global code table.
 *
 * root   - subtree to process
 * ncode  - code bits accumulated on the path from the real root (0 = left,
 *          1 = right), right-aligned
 * length - number of valid bits in ncode
 *
 * A node with two children is descended into; any other node is treated as a
 * leaf and stored in the first table slot whose length is still 0.
 *
 * Returns 0 on success, -1 if root is NULL or the table has no free slot.
 *
 * Note: a tree consisting of a single leaf yields a code of length 0, which
 * leaves the slot looking "free".
 */
int toCode(struct Node* root,int ncode,int length) {
	int i;

	if(root==NULL)
		return -1;

	if(root->lchild!=NULL&&root->rchild!=NULL) {
		/* Shift as unsigned so very long codes cannot overflow a signed int. */
		int next=(int)((unsigned int)ncode<<1);
		int lrc=toCode(root->lchild,next,length+1);
		int rrc=toCode(root->rchild,(int)((unsigned int)next|1u),length+1);
		return (lrc!=0||rrc!=0)?-1:0;
	}

	/* Find the first unused slot. */
	for(i=0; i<n; i++) {
		if(code[i].length==0)
			break;
	}
	if(i==n)
		return -1; /* table full: writing code[n] would be out of bounds */

	code[i].ch=root->ch;
	code[i].code=(unsigned int)ncode;
	code[i].length=length;
	return 0;
}

/*
 * Stores the tree and the code table in `filename`.
 *
 * File layout (all integers in the machine's native int format):
 *   offset 0 : n, the number of symbols
 *   offset 4 : offset of the code table
 *   offset 8 : tree nodes, each a raw struct Node at its `filepoint`
 *   then     : n raw struct Code entries
 *
 * Returns 0 on success, -1 if the file cannot be opened or a write fails.
 */
int writetofile(const char *filename) {
	int rc=-1;
	FILE* fout=fopen(filename,"wb");
	if(fout==NULL) {
		printf("file open error!\n");
		return -1;
	}

	if(fwrite(&n,sizeof(int),1,fout)!=1)
		goto out;

	/* The nodes start right after the two header ints. */
	filepoint=8;
	makefilepoint(root);
	if(writefilepoint(root,fout)!=0)
		goto out;

	/* After the layout pass, filepoint is where the code table goes. */
	if(fseek(fout,4,SEEK_SET)!=0)
		goto out;
	if(fwrite(&filepoint,sizeof(int),1,fout)!=1)
		goto out;
	if(fseek(fout,filepoint,SEEK_SET)!=0)
		goto out;
	if(n>0&&fwrite(code,sizeof(struct Code),(size_t)n,fout)!=(size_t)n)
		goto out;
	rc=0;

out:
	/* fclose flushes buffered data, so its result matters as well. */
	if(fclose(fout)!=0)
		rc=-1;
	if(rc!=0)
		printf("file write error!\n");
	return rc;
}
/*
 * Writes the subtree under `root` to `fout`, each node as a raw struct Node
 * at the offset stored in its `filepoint` field (node first, then the left
 * and the right subtree).
 *
 * Returns 0 on success (including an empty subtree), -1 as soon as a seek or
 * a write fails.
 */
static int writefilepoint(struct Node* root,FILE* fout){
	if(root==NULL)
		return 0;

	if(fseek(fout,root->filepoint,SEEK_SET)!=0)
		return -1;
	if(fwrite(root,sizeof(struct Node),1,fout)!=1)
		return -1;

	if(writefilepoint(root->lchild,fout)!=0)
		return -1;
	return writefilepoint(root->rchild,fout);
}
/*
 * Assigns file offsets to every node of the subtree under `root`.
 *
 * Each visited node takes the current value of the global `filepoint` as its
 * own offset, and the global is then advanced by one node size. lf/rf receive
 * the offset the respective child is about to get, or 0 when that child is
 * missing. Always returns 0.
 */
int static makefilepoint(struct Node* root) {
	if(root==NULL)
		return 0;

	root->filepoint=filepoint;
	filepoint+=sizeof(struct Node);

	/* The left child, if any, is laid out immediately after this node. */
	root->lf=(root->lchild!=NULL)?filepoint:0;
	makefilepoint(root->lchild); /* no effect for a NULL child */

	/* The right child follows the complete left subtree. */
	root->rf=(root->rchild!=NULL)?filepoint:0;
	makefilepoint(root->rchild);

	return 0;
}

/*
 * Frees every node of the subtree under `tree`, children before the parent.
 * A NULL argument is accepted and ignored. Always returns 0.
 */
int deltree(struct Node* tree) {
	if(tree==NULL)
		return 0;

	deltree(tree->lchild);
	deltree(tree->rchild);
	free(tree);
	return 0;
}
/*
 * Discards the rest of the current input line (up to and including the
 * newline). This is the portable replacement for fflush(stdin), whose
 * behaviour on input streams is undefined.
 */
static void discard_line(void) {
	int c;
	while((c=getchar())!='\n'&&c!=EOF) {
		/* skip */
	}
}

/*
 * Reads the symbol count and one "<char> <weight>" pair per line from stdin
 * and builds the Huffman tree.
 *
 * On success the globals `n` (symbol count) and `root` (tree root) are set
 * and 0 is returned. On invalid input or memory exhaustion everything that
 * was allocated is released, `root` is NULL, `n` is 0 and -1 is returned.
 *
 * Merging rule: the lowest-weight subtree becomes the left child and the
 * second lowest the right child; on equal weights the one with the smaller
 * table index wins.
 */
int input() {
	int i;
	int si=0; /* slot that holds the root once merging is finished */
	int remaining;
	struct Node* temp;
	struct Node** table;

	root=NULL;
	printf("Input:n=");
	if(scanf("%d",&n)!=1||n<=0) {
		printf("input error!\n");
		n=0;
		return -1;
	}

	/* calloc: every slot starts as NULL, which the cleanup path relies on. */
	table=(struct Node**)calloc((size_t)n,sizeof(struct Node*));
	if(table==NULL) {
		printf("out of memory!\n");
		n=0;
		return -1;
	}

	printf("example(they are in different line):w 34\ne 56\n");
	for(i=0; i<n; i++) {
		/* Zeroed node: child links are NULL and the file offsets are 0. */
		temp=(struct Node*)calloc(1,sizeof(struct Node));
		if(temp==NULL)
			goto fail;
		table[i]=temp;
		/* Drop what is left of the previous line so %c sees the symbol. */
		discard_line();
		if(scanf("%c %d",&(temp->ch),&(temp->weight))!=2)
			goto fail;
	}

	/* Merge the two lightest subtrees until a single one remains. */
	for(remaining=n; remaining>1; remaining--) {
		int sm1=-1,sm2=-1;

		for(i=0; i<n; i++) { /* lightest subtree */
			if(table[i]==NULL)
				continue;
			if(sm1<0||table[i]->weight<table[sm1]->weight)
				sm1=i;
		}
		for(i=0; i<n; i++) { /* second lightest subtree */
			if(table[i]==NULL||i==sm1)
				continue;
			if(sm2<0||table[i]->weight<table[sm2]->weight)
				sm2=i;
		}

		temp=(struct Node*)calloc(1,sizeof(struct Node));
		if(temp==NULL)
			goto fail;
		temp->weight=table[sm1]->weight+table[sm2]->weight;
		temp->lchild=table[sm1];
		temp->rchild=table[sm2];
		table[sm2]=NULL;
		table[sm1]=temp;
		si=sm1;
	}

	root=table[si];
	free(table);
	return 0;

fail:
	printf("input error!\n");
	/* Every non-NULL slot owns a complete subtree. */
	for(i=0; i<n; i++)
		deltree(table[i]);
	free(table);
	n=0;
	return -1;
}

以下是Encod.c的内容，这个文件主要是根据建立的赫夫曼树进行编码，要读的文件是A.txt，写出的文件是B.dat。当数据量较大时，通过试验10个阿拉伯数字的编码，B.dat的大小约为A.txt一半。

#include "hfmTree.h"

/* Number of entries in the code table; loaded by readcode(). */
int n = 0;
/* Code table loaded from the tree file by readcode(). */
struct Code *code;

/* Code-table handling. */
int readcode(const char *filename);
int showCode();
int Encoding(char c);

/* Encoding of a text file. */
int Encodtofile(const char *fnin, const char *fnout);

/* Declared here, but neither defined nor called in this source file. */
int deltree(struct Node *tree);

/*
 * Entry point of the encoder: loads the code table from "hfmTree", prints it
 * and encodes "A.txt" into "B.dat".
 *
 * Returns 0 on success and 1 if the table cannot be loaded or encoding fails.
 */
int main() {
	int rc;

	/* Without a table there is nothing to encode with. */
	if(readcode("hfmTree")!=0)
		return 1;

	showCode();
	rc=Encodtofile("A.txt","B.dat");
	free(code);
	code=NULL;
	return rc==0?0:1;
}

/*
 * Looks `c` up in the code table.
 * Returns the index of the first matching entry (0 .. n-1), or n when the
 * character has no code.
 */
int Encoding(char c) {
	int idx=0;
	while(idx<n&&code[idx].ch!=c)
		idx++;
	return idx;
}
/* Left shift that yields 0 instead of undefined behaviour for s >= 32. */
static unsigned int shl32(unsigned int v,int s) {
	return (s>=32)?0u:(v<<s);
}

/* Right shift that yields 0 instead of undefined behaviour for s >= 32. */
static unsigned int shr32(unsigned int v,int s) {
	return (s>=32)?0u:(v>>s);
}

/*
 * Encodes the text file `fnin` into the binary file `fnout`.
 *
 * Output layout:
 *   offset 0 : int, number of valid bits in the last 32-bit word
 *   offset 4 : the code bits, packed most-significant-bit first into
 *              32-bit words; the last word is padded with zero bits
 * Each word is written in the machine's native byte order, so on a
 * little-endian machine (e.g. Intel) the bytes of a word appear reversed in
 * a hex dump.
 *
 * Characters without a code are reported with "Unknown Code!" and skipped.
 *
 * Returns 0 on success, -1 if a file cannot be opened, a write fails or the
 * code table contains a length outside 0..32.
 */
int Encodtofile(const char* fnin,const char* fnout) {
	FILE* fin=fopen(fnin,"r");
	FILE* fout=fopen(fnout,"wb");
	unsigned int buf=0; /* bits collected for the current word */
	int length=0;       /* number of valid bits in buf */
	int c;
	int rc=-1;

	if(fin==NULL||fout==NULL) {
		printf("file open error!\n");
		goto out;
	}

	/* Leave room for the header, which is only known at the end. */
	if(fseek(fout,4,SEEK_SET)!=0)
		goto out;

	/*
	 * Read until fgetc reports EOF. Testing feof() before reading would run
	 * the loop body once more with EOF converted to a character.
	 */
	while((c=fgetc(fin))!=EOF) {
		int i=Encoding((char)c);
		int clen;

		if(i==n) {
			printf("Unknown Code!\n");
			continue;
		}
		clen=code[i].length;
		if(clen<0||clen>32) {
			printf("bad code length!\n");
			goto out;
		}

		if(length+clen>32) {
			/* The code does not fit: `temp` bits spill into the next word. */
			int temp=clen-32+length;
			buf=shl32(buf,32-length)|shr32(code[i].code,temp);
			if(fwrite(&buf,sizeof(int),1,fout)!=1)
				goto out;
			/* Keep only the spilled low `temp` bits. */
			buf=code[i].code&(shl32(1u,temp)-1u);
			length=temp;
			continue;
		}
		buf=shl32(buf,clen)|code[i].code;
		length+=clen;
	}

	/* Left-align the last, possibly partial, word and write it. */
	buf=shl32(buf,32-length);
	if(fwrite(&buf,sizeof(int),1,fout)!=1)
		goto out;
	if(fseek(fout,0,SEEK_SET)!=0)
		goto out;
	if(fwrite(&length,sizeof(int),1,fout)!=1)
		goto out;
	rc=0;

out:
	if(fin!=NULL)
		fclose(fin);
	if(fout!=NULL&&fclose(fout)!=0)
		rc=-1;
	return rc;
}

/*
 * Prints every entry of the code table as "<symbol>:<bits>", one per line,
 * most significant code bit first. Always returns 0.
 */
int showCode() {
	int i,j;
	for(i=0; i<n; i++) {
		printf("%c:",code[i].ch);
		for(j=code[i].length-1; j>=0; j--) {
			/* 1u keeps the shift defined when j reaches 31. */
			if((code[i].code)&(1u<<j))
				printf("1");
			else
				printf("0");
		}
		printf("\n");
	}
	return 0;
}

int readcode(const char* filename){
	FILE* fin=fopen(filename,"rb");
	if(fin==NULL) {
		printf("file open error!\n");
		return -1;
	}
	int offset;
	fread(&n,sizeof(int),1,fin);
	fread(&offset,sizeof(int),1,fin);
	code=(struct Code*)malloc(sizeof(struct Code)*n);
	fseek(fin,offset,SEEK_SET);
	fread(code,sizeof(struct Code)*n,1,fin);
	fclose(fin);
	return 0;
}

以下是Decod.c的内容，用于译码。将上一步生成的文件B.dat译码成C.txt，通过比较C.txt和A.txt的内容，前面部分基本一致，但在结尾处没能处理好，出现了差异。

#include "hfmTree.h"

/* Number of symbols, read from the tree file by readtree(). */
int n=0;
/* Root of the Huffman tree rebuilt by readtree(). */
struct Node* root;

int Dcoding(int temp,struct Node* root,int dep);
int Dcodingfromfile(const char* fnin,const char* fnout);
int Dcodingfromfile2(const char* fnin,const char* fnout);
int readtree(const char* filename);
int readnode(FILE* fin,struct Node* root,int seek);
/* main() calls deltree(); without this prototype the call is an implicit
 * declaration, which is not valid since C99. */
int deltree(struct Node* tree);

/*
 * Entry point of the decoder: rebuilds the tree from "hfmTree" and decodes
 * "B.dat" into "C.txt".
 *
 * Returns 0 on success and 1 if the tree cannot be loaded or decoding fails.
 */
int main() {
	int rc;

	/* Decoding walks the tree, so stop if it could not be loaded. */
	if(readtree("hfmTree")!=0||root==NULL)
		return 1;

	rc=Dcodingfromfile2("B.dat","C.txt");
	deltree(root);
	root=NULL;
	return rc==0?0:1;
}
/*
 * Decodes the binary file `fnin` into the file `fnout` using the global tree
 * `root`.
 *
 * The first 4 bytes of the input are skipped at the start and read again at
 * the end as an int; the remainder is consumed in 4-byte words.
 * `buf` is the word handed to Dcoding(); `buf2` is a second word from which
 * `buf` is refilled; `buf3` is a freshly read word used to refill `buf2`.
 *
 * Returns 0 when finished, -1 if either file cannot be opened.
 *
 * NOTE(review): results of fread/fwrite are not checked, and when only one of
 * the two fopen calls fails the other stream is not closed.
 * NOTE(review): the accompanying article states that the end of the output
 * differs from the original text, i.e. the tail handling below is not
 * reliable.
 */
int Dcodingfromfile2(const char* fnin,const char* fnout){
	FILE* fin,*fout;
	fin=fopen(fnin,"rb");
	fout=fopen(fnout,"wb");
	if(fin==NULL||fout==NULL) {
		printf("file open error!\n");
		return -1;
	}
	// Skip the 4-byte header for now.
	fseek(fin,4,SEEK_SET);
	unsigned int buf,buf2,buf3;
	int ret;
	// Prime both buffers with the first two words.
	fread(&buf,sizeof(int),1,fin);
	fread(&buf2,sizeof(int),1,fin);
	int pos2=32;//number of valid bits in the second buffer (buf2)
	while(1){
		// Dcoding returns the symbol in the low byte and the number of
		// consumed bits in the upper 16 bits.
		ret=Dcoding(buf,root,0);
		char ch=(char)(ret&0xff);
		fwrite(&ch,sizeof(char),1,fout);
		// From here on ret is the number of bits just consumed.
		ret=(ret&0xffff0000)>>16;
		buf=buf<<ret;
		if(pos2>ret){
			// buf2 holds enough bits: move its top `ret` bits into the
			// low end of buf.
			buf=buf|(buf2>>(32-ret));
			buf2=buf2<<ret;
			pos2=pos2-ret;
		}else{
			// buf2 runs short: read another word into buf3.
			fread(&buf3,sizeof(int),1,fin);
			// Place the high (32-pos2) bits of buf3 behind the pos2 valid
			// bits of buf2.
			buf2=buf2|((buf3&(0xffffffff<<pos2))>>pos2);
			buf=buf|(buf2>>(32-ret));
			buf2=buf2<<ret;
			//
			
			// Append the low pos2 bits of buf3, shifted left by (ret-pos2).
			buf2=buf2|((buf3&(0xffffffff>>(32-pos2)))<<(ret-pos2));
			// Disabled alternatives left by the author:
			//buf2=buf2|(buf3&(0xffffffff>>(32-pos2)));
			//pos2=32-pos2;
			pos2=32-ret+pos2;
			// Leave the main loop once the read above hit end of file.
			if(feof(fin)){
				break;
			}
		}
	}
	// Tail handling. oldpos2 undoes the last update of pos2 made in the loop.
	int pos3,oldpos2=pos2-32+ret;
	// Read the header int at offset 0. NOTE(review): the encoder shown in the
	// article stores the bit count of the last word there -- confirm.
	fseek(fin,0,SEEK_SET);
	fread(&pos3,sizeof(int),1,fin);
	// pos3 is recomputed from the header value, ret and oldpos2 and then
	// drives the loop below; presumably it is the number of bits still to
	// decode -- TODO confirm.
	if(ret-oldpos2-pos3>0){
		pos3=ret-oldpos2-pos3;
	}else{
		pos3=32+pos3-ret+oldpos2;
	}
	// Decode symbols until pos3 is used up.
	while(pos3>0){
		ret=Dcoding(buf,root,0);
		char ch=(char)(ret&0xff);
		fwrite(&ch,sizeof(char),1,fout);
		ret=(ret&0xffff0000)>>16;
		buf=buf<<ret; 
		// Refill buf from buf2 only while more than 32 bits are outstanding.
		if(pos3>32){
			buf=buf|(buf2>>(32-ret));
			buf2=buf2<<ret;
		}
		pos3=pos3-ret;
	}
	fclose(fin);
	fclose(fout); 
	return 0;
}
/*
 * Earlier decoding routine; it is not called from the main() shown with this
 * source file.
 *
 * Decodes the binary file `fnin` into `fnout` using the global tree `root`,
 * keeping two 32-bit buffers (buf, buf2). It stops when a read hits end of
 * file; the bits remaining in the buffers at that point are not decoded (see
 * the author's note before the fclose calls).
 *
 * Returns 0 when finished, -1 if either file cannot be opened.
 *
 * NOTE(review): results of fread/fwrite are not checked, and when only one of
 * the two fopen calls fails the other stream is not closed.
 */
int Dcodingfromfile(const char* fnin,const char* fnout){
	FILE* fin,*fout;
	fin=fopen(fnin,"rb");
	fout=fopen(fnout,"wb");
	if(fin==NULL||fout==NULL) {
		printf("file open error!\n");
		return -1;
	}
	int pos1=32,pos2=32;//numbers of valid bits
	unsigned int buf=0;//two-level buffering: buf is decoded, buf2 refills it
	unsigned int buf2=0;
	
	//int pos=0;
	// Skip the 4-byte header and prime both buffers.
	fseek(fin,4,SEEK_SET);
	fread(&buf,sizeof(int),1,fin);
	fread(&buf2,sizeof(int),1,fin);
	while(1){
		int ret=Dcoding(buf,root,0);
		// Writes the first byte of `ret` as stored in memory; on a
		// little-endian machine that is the low byte, i.e. the symbol.
		fwrite(&ret,sizeof(char),1,fout);
		ret=(ret&0xffff0000)>>16;//number of bits to shift in
		buf=buf<<ret;
		if(pos2>=ret){
			// buf2 holds enough bits: move its top `ret` bits into buf.
			buf=buf&(0xffffffff<<ret);
			buf=buf|((buf2&((0xffffffff)<<(32-ret)))>>(32-ret));
			buf2=buf2<<ret;
			buf2=buf2&(0xffffffff<<ret);
			pos2-=ret;
		}else{
			// buf2 runs short: pos1 is the number of bits still missing
			// after the pos2 bits left in buf2 have been used.
			pos1=ret-pos2;
			buf=buf&(0xffffffff<<ret);
			buf=buf+((buf2&((0xffffffff)<<(32-pos2)))>>(32-ret));
			
			// Disabled alternatives left by the author:
			//pos1=pos1-ret+pos2;
			//pos2=0;
			fread(&buf2,sizeof(int),1,fin);
			if(feof(fin)){
				// End of data: load the header int at offset 0 into pos2
				// and stop.
				fseek(fin,0,SEEK_SET);
				fread(&pos2,sizeof(int),1,fin);
				break;
			}
			// Take the missing pos1 bits from the top of the new word.
			buf=buf+((buf2&((0xffffffff)<<(32-pos1)))>>(32-ret));
			buf2=buf2<<pos1;
			buf2=buf2&(0xffffffff<<pos1);
			pos2=32-pos1;
			pos1=32;
		}
	}
	//TODO (author): the remaining data, at most 8 bytes, still has to be handled
	fclose(fin);
	fclose(fout);
	return 0;
}

/*
 * Decodes one symbol from the bit pattern `temp`.
 *
 * temp - 32 code bits, the next bit to decode in the most significant
 *        position
 * root - tree node to start from
 * dep  - depth already consumed before this call (callers pass 0)
 *
 * Starting at `root`, a 1 bit selects the right child and a 0 bit the left
 * child until a node without children is reached.
 *
 * Returns the symbol in the low byte and the number of consumed bits in the
 * upper 16 bits. The symbol is taken as unsigned char: adding a negative
 * char would borrow from the depth field and corrupt it.
 *
 * If the walk runs into a missing child (malformed tree), it stops there and
 * reports symbol 0 with the depth reached so far.
 */
int Dcoding(int temp,struct Node* root,int dep){
	/* Work on unsigned bits: left-shifting a negative int is undefined. */
	unsigned int bits=(unsigned int)temp;
	unsigned int sym=0u;

	while(root!=NULL&&(root->lchild!=NULL||root->rchild!=NULL)) {
		root=(bits&0x80000000u)?root->rchild:root->lchild;
		bits<<=1;
		dep++;
	}
	if(root!=NULL)
		sym=(unsigned char)root->ch;

	return (int)(((unsigned int)dep<<16)|sym);
}

/*
 * Rebuilds the Huffman tree from the tree file `filename`.
 *
 * Reads the symbol count into the global `n`, then loads the nodes starting
 * at file offset 8 into a freshly allocated tree stored in the global `root`.
 *
 * Returns 0 on success. If the file cannot be opened or its header cannot be
 * read, or memory runs out, `root` is set to NULL and -1 is returned.
 */
int readtree(const char* filename){
	/* Open first, so a missing file does not leave an allocated,
	 * uninitialised root behind. */
	FILE* fin=fopen(filename,"rb");
	root=NULL;
	if(fin==NULL) {
		printf("file open error!\n");
		return -1;
	}
	if(fread(&n,sizeof(int),1,fin)!=1) {
		printf("file read error!\n");
		fclose(fin);
		return -1;
	}
	/* Zeroed node: child links are NULL until readnode() fills them in. */
	root=(struct Node*)calloc(1,sizeof(struct Node));
	if(root==NULL) {
		printf("out of memory!\n");
		fclose(fin);
		return -1;
	}
	/* The root node is stored right after the two header ints. */
	readnode(fin,root,8);
	fclose(fin);
	return 0;
}

/*
 * Loads the node stored at file offset `seek` into `*root` and, recursively,
 * its children.
 *
 * fin  - tree file opened for binary reading
 * root - node to fill (already allocated by the caller)
 * seek - file offset of the node's record
 *
 * The child pointers stored in the file are meaningless in this process, so
 * they are always replaced: by newly allocated nodes when the recorded offset
 * (lf / rf) is non-zero, by NULL otherwise.
 *
 * Returns 0 on success and -1 if a seek, read or allocation failed. Even on
 * failure every child pointer is either NULL or a valid node, so the tree
 * can be released with deltree().
 */
int readnode(FILE* fin,struct Node* root,int seek){
	int rc=0;

	if(fseek(fin,seek,SEEK_SET)!=0||
	   fread(root,sizeof(struct Node),1,fin)!=1) {
		/* Contents are indeterminate: turn the node into an empty leaf. */
		root->ch=0;
		root->weight=0;
		root->filepoint=0;
		root->lf=0;
		root->rf=0;
		root->lchild=NULL;
		root->rchild=NULL;
		return -1;
	}

	/* Drop the stale pointers that came from the file. */
	root->lchild=NULL;
	root->rchild=NULL;

	if(root->lf!=0){
		root->lchild=(struct Node*)malloc(sizeof(struct Node));
		if(root->lchild==NULL)
			return -1;
		if(readnode(fin,root->lchild,root->lf)!=0)
			rc=-1;
	}
	if(root->rf!=0){
		root->rchild=(struct Node*)malloc(sizeof(struct Node));
		if(root->rchild==NULL)
			return -1;
		if(readnode(fin,root->rchild,root->rf)!=0)
			rc=-1;
	}
	return rc;
}

/*
 * Frees every node of the subtree under `tree`, children before the parent.
 * A NULL argument is accepted and ignored.
 *
 * Returns 0. (The original fell off the end of a non-void function.)
 */
int deltree(struct Node* tree) {
	if(tree!=NULL) {
		deltree(tree->lchild);
		deltree(tree->rchild);
		free(tree);
	}
	return 0;
}

以上就是所有程序，执行的先后顺序是Init.c->Encod.c->Decod.c，其中要求预先写好A.txt，A.txt中出现的字符都应该在运行Init.c时输入。对于未知的编码，程序直接忽略；由于有一个文件结束符，文件末尾总会有一个未知的编码，好在这并不影响测试结果。

B.dat是二进制文件，以四个字节为一个单位。由于Intel是小端模式，用二进制查看工具查看时需要注意字节顺序。