Huffman树编解码

最新推荐文章于 2024-07-14 14:55:20 发布
张逸安
最新推荐文章于 2024-07-14 14:55:20 发布
阅读量340
点赞数 11
文章标签：开发语言 c语言
本文链接：https://blog.csdn.net/m0_74983954/article/details/135464859
版权
文章介绍了利用Huffman树原理对文件进行压缩和解压缩的过程，包括统计字符频率、生成Huffman树、编码和解码等步骤，以及相关函数的实现和测试。
摘要由CSDN通过智能技术生成
《数据结构》课程作业：利用Huffman树对文件进行压缩和解压缩。
仅作原理展示用，未经过优化，请酌情参考。
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEBUG

/*
 * Huffman tree node.
 * GenHT allocates all nodes of a tree as one contiguous array; Zip writes
 * that array to the output file and Unzip reads it back.
 */
typedef struct HTNode {
	unsigned char c; /* byte value; GenHT assigns it for leaf nodes only */
	unsigned w; /* frequency weight; GenHT overwrites the root's w with the symbol count */
	struct HTNode *parent, *lchild, *rchild; /* links inside the node array, NULL when absent */
} HTNode, *HT;

/* Huffman code of a single byte value */
typedef struct HCode {
	unsigned char c; /* the byte value this code stands for */
	unsigned long long code; /* code bits, left-aligned: GenHC places the first bit at bit 63 */
	unsigned len; /* number of valid bits in code */
} HCode;

/* Code table: one HCode entry per leaf of the Huffman tree */
typedef struct HC {
	HCode *hcode; /* heap array allocated by GenHC, released by DestroyHC */
	unsigned size; /* number of entries in hcode */
} HC;

/* Frequency statistics: one byte value and how often it occurs */
typedef struct Word {
	unsigned char c; /* byte value */
	unsigned w; /* occurrence count of c in the input */
} Word;

/* Collection of frequency statistics filled in by ParseFile */
typedef struct Words {
	unsigned size; /* number of entries in word (distinct byte values found) */
	unsigned volume; /* NOTE(review): not used by the functions in this file; presumably a capacity, confirm */
	Word *word; /* heap array of size entries, released by DestroyWords */
} Words;

/* Byte stream: a heap buffer together with its length */
typedef struct Data {
	unsigned char *string; /* raw bytes, not NUL-terminated; released by DestroyData */
	unsigned size; /* number of bytes in string */
} Data;

/* Compression pipeline and cleanup routines; all are defined later in this file. */
void ParseFile(const char *fin, Words *words, Data * data); /* load file fin into data and count byte frequencies into words */
void GenHT(HT *ht, Words words); /* build the Huffman tree from the frequency table */
void GenHC(HC *hc, HT ht); /* derive the per-byte code table from the tree */
void Encode(HC hc, Data orig, Data *news); /* pack the codes of orig's bytes into the bit stream news */
void Zip(const char *fin, const char *fout); /* compress file fin into file fout */
void Unzip(const char *fin, const char *fout); /* decompress file fin into file fout */
void DestroyHT(HT *ht); /* free the tree's node array */
void DestroyHC(HC *hc); /* free the code table */
void DestroyWords(Words *words); /* free the frequency table */
void DestroyData(Data *data); /* free the byte buffer */

/* Stack used for the non-recursive traversal of the Huffman tree (singly linked list) */
typedef struct StackNode {
	HTNode *e; /* stored tree node */
	struct StackNode *next; /* element below this one, NULL at the bottom */
} StackNode, *Stack;

/* Stack operations; all are defined at the end of this file. */
void InitStack(Stack *S); /* make *S an empty stack */
bool Push(Stack *S, HTNode *e); /* push e on top */
bool Pop(Stack *S, HTNode **e); /* pop the top element into *e; false when the stack is empty */
bool StackEmpty(Stack S); /* true when the stack holds no element */
void DestroyStack(Stack *S); /* free every remaining element */


#ifdef DEBUG

/* Print every byte of the data stream as a character, then a newline. */
void PrintDataC(Data data) {
	int pos = 0;
	while (pos < data.size) {
		printf("%c", data.string[pos]);
		pos++;
	}
	putchar('\n');
}

/*
 * Print the data stream in binary, then a newline.
 *
 * Each byte is written as its base-2 digits without leading zeros (a zero
 * byte prints as "0"), and the bytes are written back to back. The
 * conversion is done by hand instead of with itoa(), which is not part of
 * standard C and is missing from many C libraries.
 */
void PrintDataB(Data data) {
	for (unsigned i = 0; i < data.size; i++) {
		unsigned char byte = data.string[i];
		char digits[CHAR_BIT + 1];
		int pos = CHAR_BIT;

		/* Fill the buffer from the least significant digit backwards. */
		digits[pos] = '\0';
		do {
			digits[--pos] = (char)('0' + (byte & 1u));
			byte >>= 1;
		} while (byte != 0);
		printf("%s", digits + pos);
	}
	putchar('\n');
}

/*
 * Print the frequency table, one "byte value: count" pair per line.
 * The count is unsigned, so it is printed with %u; %d would misreport
 * counts above INT_MAX.
 */
void PrintWords(Words words) {
	for (unsigned i = 0; i < words.size; i++) {
		printf("%d: %u\n", words.word[i].c, words.word[i].w);
	}
}

/*
 * Print the leaves of the Huffman tree as "| byte: weight | ".
 * The tree is walked recursively, left subtree before right subtree, and
 * only nodes without children produce output.
 */
void PrintHT(HT ht) {
	if (!ht) {
		return;
	}
	PrintHT(ht->lchild);
	PrintHT(ht->rchild);
	if (!ht->lchild && !ht->rchild) {
		printf("| %u: %u | ", ht->c, ht->w);
	}
}

/*
 * Print the code table, one "char: length, bits" line per entry.
 *
 * A code is stored left-aligned in a 64-bit word (its first bit is bit 63),
 * so the digits are taken from the top `len` bits. Passing the word to
 * itoa() would truncate it to int and lose exactly those bits.
 */
void PrintHC(HC hc) {
	for (unsigned i = 0; i < hc.size; i++) {
		const HCode *entry = &hc.hcode[i];
		char bits[65];
		/* Never read past the 64 bits the code word can hold. */
		unsigned count = entry->len < 64 ? entry->len : 64;

		for (unsigned b = 0; b < count; b++) {
			bits[b] = ((entry->code >> (63 - b)) & 1ULL) ? '1' : '0';
		}
		bits[count] = '\0';
		printf("%c: %u, %s\n", entry->c, entry->len, bits);
	}
}

/*
 * Check whether two files have identical contents.
 *
 * file1, file2: paths of the files to compare.
 * Returns true only when both files can be opened, have the same length
 * and contain the same bytes. When the lengths match but a byte differs,
 * the offset and the two byte values are printed and false is returned.
 *
 * The files are opened in binary mode so that no newline or end-of-file
 * translation can hide or invent differences, and both streams are closed
 * on every path.
 */
bool Check(const char *file1, const char *file2) {
	FILE *fp1 = fopen(file1, "rb");
	FILE *fp2 = fopen(file2, "rb");
	bool same = false;

	if (fp1 && fp2 && fseek(fp1, 0, SEEK_END) == 0 && fseek(fp2, 0, SEEK_END) == 0) {
		long size1 = ftell(fp1);
		long size2 = ftell(fp2);

		rewind(fp1);
		rewind(fp2);
		same = size1 >= 0 && size1 == size2;
		for (long i = 0; same && i < size1; i++) {
			char c1, c2;
			if (fread(&c1, 1, 1, fp1) != 1 || fread(&c2, 1, 1, fp2) != 1) {
				same = false; /* short read: treat as a mismatch */
			} else if (c1 != c2) {
				printf("at fsize = %ld, c1 is %d, c2 is %d\n", i, c1, c2);
				same = false;
			}
		}
	}

	if (fp1) fclose(fp1);
	if (fp2) fclose(fp2);
	return same;
}

#endif


/*
 * Round-trip three sample files through Zip and Unzip and print, for each,
 * whether the restored file equals the original (1) or not (0).
 * Each Check call compares exactly the pair of files that the preceding
 * Zip/Unzip calls read and produced.
 */
int main(void) {
	/* Compress an image */
	Zip("pic.png", "pic.png.hzip");
	Unzip("pic.png.hzip","picn.png");
	printf("%d\n", Check("pic.png", "picn.png"));
	/* Compress a PDF */
	Zip("lab.pdf", "lab.pdf.hzip");
	Unzip("lab.pdf.hzip","labn.pdf");
	printf("%d\n", Check("lab.pdf", "labn.pdf"));
	/* Compress a plain file */
	Zip("test", "test.hzip");
	Unzip("test.hzip", "testn");
	printf("%d\n", Check("test", "testn"));
	
	return 0;
}


/*
 * Load a file into memory and count how often each byte value occurs.
 *
 * fin:   path of the file to read (opened in binary mode).
 * words: receives a heap array with one entry per byte value that occurs,
 *        in ascending byte order; release it with DestroyWords.
 * data:  receives the file contents and their length; release it with
 *        DestroyData.
 *
 * Any failure (open, size query, allocation, short read) terminates the
 * program with a failure status. An empty file is not an error here: it
 * yields data->size == 0 and words->size == 0.
 */
void ParseFile(const char *fin, Words *words, Data * data) {
	FILE *fp = fopen(fin, "rb");
	if (!fp) {
		perror(fin);
		exit(EXIT_FAILURE);
	}
	
	/* Load the file */
	long fsize = -1;
	if (fseek(fp, 0, SEEK_END) == 0) fsize = ftell(fp);
	if (fsize < 0 || (unsigned long)fsize > UINT_MAX) {
		/* Data.size is an unsigned, so larger files cannot be represented. */
		fprintf(stderr, "%s: cannot determine file size, or file too large\n", fin);
		exit(EXIT_FAILURE);
	}
	rewind(fp);
	data->size = (unsigned)fsize;
	/* Request at least one byte: malloc(0) may legitimately return NULL. */
	data->string = malloc(data->size ? data->size : 1);
	if (!data->string) exit(EXIT_FAILURE);
	if (fread(data->string, 1, data->size, fp) != data->size) {
		fprintf(stderr, "%s: read failed\n", fin);
		exit(EXIT_FAILURE);
	}
	fclose(fp);
	
	/* Count occurrences per byte value */
	unsigned cache[UCHAR_MAX + 1] = {0};
	unsigned distinct = 0;
	for (unsigned i = 0; i < data->size; i++) {
		if (cache[data->string[i]]++ == 0) distinct++;
	}
	
	words->size = distinct;
	/* volume is not used elsewhere in this file; give it a defined value. */
	words->volume = distinct;
	words->word = malloc(sizeof *words->word * (distinct ? distinct : 1));
	if (!words->word) exit(EXIT_FAILURE);
	
	unsigned index = 0;
	for (unsigned value = 0; value <= UCHAR_MAX; value++) {
		if (cache[value]) {
			words->word[index].c = (unsigned char)value;
			words->word[index].w = cache[value];
			index++;
		}
	}
}

/*
 * Build a Huffman tree from a frequency table.
 *
 * ht:    receives a heap array of 2 * words.size - 1 nodes; release it
 *        with DestroyHT. Element 0 is the root.
 * words: frequency table; must contain at least one entry.
 *
 * Array layout: indices [0, size - 1) hold the internal nodes and indices
 * [size - 1, 2 * size - 1) hold the leaves in table order. Internal nodes
 * are created from the highest index down, each joining the two lightest
 * nodes that do not have a parent yet (the first one found wins a tie).
 *
 * On return the root's weight field does NOT hold a frequency: it is
 * overwritten with the number of symbols, which GenHC and Zip read back.
 *
 * An empty table or an allocation failure terminates the program with a
 * failure status.
 */
void GenHT(HT *ht, Words words) {
	if (words.size == 0) {
		/* 2 * 0 - 1 would wrap around to a huge node count. */
		fprintf(stderr, "GenHT: cannot build a tree for an empty input\n");
		exit(EXIT_FAILURE);
	}
	
	unsigned leaves = words.size;
	unsigned total = 2 * leaves - 1; /* all nodes */
	/* calloc zeroes padding and unused fields; the array is written to disk. */
	HTNode *nodes = calloc(total, sizeof *nodes);
	if (!nodes) exit(EXIT_FAILURE);
	
	for (unsigned i = 0; i < total; i++) {
		nodes[i].parent = nodes[i].lchild = nodes[i].rchild = NULL;
	}
	for (unsigned i = 0; i < leaves; i++) {
		nodes[leaves - 1 + i].c = words.word[i].c;
		nodes[leaves - 1 + i].w = words.word[i].w;
	}
	
	/* Repeatedly join the two lightest parentless nodes */
	for (unsigned i = leaves - 1; i-- > 0; ) {
		/* `total` is not a valid index and marks "nothing selected yet", */
		/* so every weight, however large, can be selected. */
		unsigned first = total, second = total;
		for (unsigned j = i + 1; j < total; j++) {
			if (nodes[j].parent) continue; /* already paired */
			if (first == total || nodes[j].w < nodes[first].w) {
				second = first;
				first = j;
			}
			else if (second == total || nodes[j].w < nodes[second].w) {
				second = j;
			}
		}
		/* At least two parentless nodes remain at every step, so both */
		/* indices are valid here. */
		nodes[first].parent = nodes[second].parent = &nodes[i];
		nodes[i].lchild = &nodes[first];
		nodes[i].rchild = &nodes[second];
		nodes[i].w = nodes[first].w + nodes[second].w;
	}
	nodes[0].w = words.size; /* the root carries the symbol count */
	*ht = nodes;
}

/*
 * Derive the code table from a Huffman tree.
 *
 * hc: receives a heap array with ht->w entries (the root's weight field
 *     holds the symbol count); release it with DestroyHC.
 * ht: root of the tree.
 *
 * The tree is walked in post-order with an explicit stack, and one table
 * entry is filled per leaf in the order the leaves are visited. A code is
 * built by climbing from the leaf to the root: each step shifts the code
 * one bit to the right and sets bit 63 when the node is not its parent's
 * left child, so the finished code is left-aligned with the root-side bit
 * at bit 63.
 */
void GenHC(HC *hc, HT ht) {
	hc->size = ht->w;
	hc->hcode = (HCode *)malloc(sizeof(HCode) * hc->size);
	if (!hc->hcode) exit(0);
	for (int k = 0; k < hc->size; k++) {
		hc->hcode[k].len = 0;
		hc->hcode[k].code = 0;
	}
	
	unsigned slot = 0; /* next free table entry */
	HTNode *cursor = ht;
	HTNode *lastVisited = NULL;
	Stack pending;
	InitStack(&pending);
	
	for (;;) {
		/* Go down the left spine, remembering the path. */
		while (cursor) {
			Push(&pending, cursor);
			cursor = cursor->lchild;
		}
		if (StackEmpty(pending)) break;
		
		/* The right subtree comes first unless we have just left it. */
		HTNode *top = pending->e;
		if (top->rchild && top->rchild != lastVisited) {
			cursor = top->rchild;
			continue;
		}
		
		/* Both subtrees are done: visit the node itself. */
		Pop(&pending, &cursor);
		if (!cursor->lchild && !cursor->rchild) {
			HCode *entry = &hc->hcode[slot];
			entry->c = cursor->c;
			for (HTNode *node = cursor; node->parent; node = node->parent) {
				entry->code = entry->code >> 1;
				entry->len++;
				if (node->parent->lchild != node) {
					entry->code |= (1ULL << 63);
				}
			}
			slot++;
		}
		lastVisited = cursor;
		cursor = NULL;
	}
}

/*
 * Encode a byte stream with a Huffman code table.
 *
 * hc:   code table; codes are left-aligned in 64-bit words.
 * orig: the bytes to encode.
 * news: receives a zero-initialized heap buffer holding the packed bits,
 *       most significant bit first; the caller releases it with
 *       DestroyData. news->size is (total code bits / 8) + 1, so the
 *       buffer always ends with a byte that is partly or entirely padding.
 *
 * The output size is computed from the code lengths before allocating.
 * Sizing the buffer after the input is one byte too small whenever the
 * codes add up to 8 bits per input byte.
 *
 * A byte without a table entry, an output larger than an unsigned can
 * describe, or an allocation failure terminates the program with a
 * failure status.
 */
void Encode(HC hc, Data orig, Data *news) {
	/* Direct lookup by byte value instead of searching the table for */
	/* every input byte. Filling backwards keeps the first entry when a */
	/* byte value appears more than once. */
	const HCode *lookup[UCHAR_MAX + 1] = {NULL};
	for (unsigned j = hc.size; j-- > 0; ) {
		lookup[hc.hcode[j].c] = &hc.hcode[j];
	}
	
	/* First pass: validate the input and measure the output. */
	unsigned long long totalbits = 0;
	for (unsigned i = 0; i < orig.size; i++) {
		const HCode *entry = lookup[orig.string[i]];
		if (!entry) {
			fprintf(stderr, "Encode: byte %u has no Huffman code\n", (unsigned)orig.string[i]);
			exit(EXIT_FAILURE);
		}
		totalbits += entry->len;
	}
	unsigned long long bytes = totalbits / 8 + 1;
	if (bytes > UINT_MAX) {
		fprintf(stderr, "Encode: encoded data too large\n");
		exit(EXIT_FAILURE);
	}
	
	news->string = calloc((size_t)bytes, 1);
	if (!news->string) exit(EXIT_FAILURE);
	news->size = (unsigned)bytes;
	
	/* Second pass: pack the codes. */
	unsigned index = 0; /* output byte being filled */
	unsigned usedbits = 0; /* bits already occupied in that byte */
	for (unsigned i = 0; i < orig.size; i++) {
		const HCode *entry = lookup[orig.string[i]];
		unsigned long long code = entry->code;
		unsigned len = entry->len;
		
		while (usedbits + len >= 8) {
			/* The top (8 - usedbits) bits of code complete this byte. */
			news->string[index] |= (unsigned char)(code >> (usedbits + 56));
			index++;
			code = code << (8 - usedbits);
			len -= 8 - usedbits;
			usedbits = 0;
		}
		/* The remaining bits (fewer than a byte) start the next byte. */
		news->string[index] |= (unsigned char)(code >> (usedbits + 56));
		usedbits += len;
	}
}

/*
 * Compress a file.
 *
 * fin:  path of the file to compress.
 * fout: path of the compressed file to create.
 *
 * Output layout, all in the machine's native representation:
 *   unsigned  original size in bytes
 *   unsigned  number of distinct byte values (leaves of the tree)
 *   unsigned  size of the encoded data in bytes
 *   HTNode[]  the 2 * leaves - 1 tree nodes, written as raw structs
 *             (the stored pointer values are fixed up again by Unzip)
 *   bytes     the encoded data
 *
 * Every buffer built along the way, including the encoded data, is
 * released before returning. A failure to open or completely write the
 * output terminates the program with a failure status.
 */
void Zip(const char *fin, const char *fout) {
	Words words;
	Data orig, news;
	HT ht;
	HC hc;
	
	ParseFile(fin, &words, &orig);
	GenHT(&ht, words);
	GenHC(&hc, ht);
	Encode(hc, orig, &news);
	
	/* Save the data */
	FILE *fp = fopen(fout, "wb");
	if (!fp) {
		perror(fout);
		exit(EXIT_FAILURE);
	}
	size_t nodecount = 2 * (size_t)ht->w - 1;
	bool ok = fwrite(&orig.size, sizeof(unsigned), 1, fp) == 1 /* original size */
		&& fwrite(&ht->w, sizeof(unsigned), 1, fp) == 1 /* symbol count */
		&& fwrite(&news.size, sizeof(unsigned), 1, fp) == 1 /* encoded size */
		&& fwrite(ht, sizeof(HTNode), nodecount, fp) == nodecount /* Huffman tree */
		&& fwrite(news.string, sizeof(char), news.size, fp) == news.size; /* encoded data */
	/* fclose flushes buffered output, so its result matters as well. */
	if (fclose(fp) != 0) ok = false;
	if (!ok) {
		fprintf(stderr, "%s: write failed\n", fout);
		exit(EXIT_FAILURE);
	}
	
	DestroyWords(&words);
	DestroyData(&orig);
	DestroyData(&news);
	DestroyHT(&ht);
	DestroyHC(&hc);
}

/* 解压缩 */
void Unzip(const char *fin, const char *fout) {
	unsigned huffmansize;
	HT ht;
	Data orig, news;
	
	/* 读取数据 */
	FILE *fpin = fopen(fin, "rb"), *fpout = fopen(fout, "wb");
	if (!fpin || !fpout) exit(0);
	fread(&orig.size, sizeof(unsigned), 1, fpin);
	orig.string = (unsigned char *)malloc(sizeof(char) * orig.size);
	if (!orig.string) exit(0);
	fread(&huffmansize, sizeof(unsigned), 1, fpin);
	ht = (HT)malloc(sizeof(HTNode) * (2 * huffmansize - 1));
	if (!ht) exit(0);
	fread(&news.size, sizeof(unsigned), 1, fpin);
	news.string = (unsigned char *)malloc(sizeof(char) * news.size);
	if (!news.string) exit(0);
	fread(ht, sizeof(HTNode), 2 * huffmansize - 1, fpin);
	fread(news.string, sizeof(char), news.size, fpin);
	fclose(fpin);
	
	/* 修正Huffman树 */
	HTNode *base = (ht + 1)->parent;
	long long diff = (void *)ht - (void *)base;
	for (int i = 0; i < 2 * huffmansize - 1; i++) {
		if ((ht + i)->parent) (ht + i)->parent = (void *)(ht + i)->parent + diff;
		if ((ht + i)->lchild) (ht + i)->lchild = (void *)(ht + i)->lchild + diff;
		if ((ht + i)->rchild) (ht + i)->rchild = (void *)(ht + i)->rchild + diff;
	}
	
	/* 解码 */
	unsigned index = 0;
	HTNode *temp = ht;
	unsigned char mask = 1U << 7;
	for (int i = 0; i < news.size; i++) {
		for (int j = 0; j < 8; j++) {
			if ((news.string[i] << j) & mask) temp = temp->rchild;
			else temp = temp->lchild;
			if (temp->lchild == NULL && temp->rchild == NULL) {
				orig.string[index++] = temp->c;
				temp = ht;
			}
			if (index == orig.size) break;
		}
	}
	
	fwrite(orig.string, sizeof(char), orig.size, fpout);
	fclose(fpout);
	DestroyHT(&ht);
	DestroyData(&orig);
	DestroyData(&news);
}

/* Release the node array of a Huffman tree and reset the handle to NULL. */
void DestroyHT(HT *ht) {
	HTNode *nodes = *ht;
	*ht = NULL;
	free(nodes);
}

/*
 * Release a code table.
 * The entry pointer is reset to NULL and the size to 0, as DestroyHT does
 * for its handle, so a stale table cannot be used or freed a second time.
 */
void DestroyHC(HC *hc) {
	free(hc->hcode);
	hc->hcode = NULL;
	hc->size = 0;
}

/*
 * Release a frequency table.
 * The entry pointer is reset to NULL and the counters to 0, as DestroyHT
 * does for its handle, so a stale table cannot be used or freed a second
 * time.
 */
void DestroyWords(Words *words) {
	free(words->word);
	words->word = NULL;
	words->size = 0;
	words->volume = 0;
}

/*
 * Release a byte stream.
 * The buffer pointer is reset to NULL and the size to 0, as DestroyHT does
 * for its handle, so a stale stream cannot be used or freed a second time.
 */
void DestroyData(Data *data) {
	free(data->string);
	data->string = NULL;
	data->size = 0;
}


/* Initialize *S as an empty stack; the empty stack is the NULL pointer. */
void InitStack(Stack *S) {
	Stack empty = NULL;
	*S = empty;
}

/* Free every element still on the stack and leave *S empty (NULL). */
void DestroyStack(Stack *S) {
	StackNode *node = *S;
	while (node) {
		StackNode *below = node->next;
		free(node);
		node = below;
	}
	*S = NULL;
}

/*
 * Push tree node e on top of the stack.
 * Always returns true; when no memory is available the program exits.
 */
bool Push(Stack *S, HTNode *e) {
	StackNode *node = (StackNode *)malloc(sizeof(StackNode));
	if (!node) exit(0);
	node->e = e;
	node->next = *S; /* NULL when the stack was empty */
	*S = node;
	return true;
}

/*
 * Remove the top element of the stack.
 * The stored tree node is written to *e unless e is NULL.
 * Returns false, leaving everything untouched, when the stack is empty.
 */
bool Pop(Stack *S, HTNode **e) {
	if (StackEmpty(*S)) {
		return false;
	}
	
	StackNode *top = *S;
	if (e != NULL) {
		*e = top->e;
	}
	*S = top->next;
	free(top);
	return true;
}

/* Report whether the stack holds no element; the empty stack is the NULL pointer. */
bool StackEmpty(Stack S) {
	return S == NULL;
}