C语言哈夫曼树压缩/解压器
小编是大一的菜鸡,这个题目是数据结构的一个实验题,为了完成这个作业,查找了各种资料,借鉴了很多人的代码,前后折腾了三天左右。代码可能跟网上的不一样,大佬路过请不要踩我。
温馨提醒
建议先认真学习Huffman树、文件操作、位运算(按位或、按位与、移位)等相关知识。代码小编已经加了很多注释,如果有不清楚的地方欢迎交流~
原理
根据各字符出现的次数(权值)建立Huffman树,可以得到带权路径长度最小的二叉树。从根结点走到每个叶子结点的路径按“左0右1”给出该字符的二进制编码,出现越频繁的字符编码越短。原文件中每个字符固定占一个字节(8位),压缩时改为把这些不等长的编码按位紧密地写入文件,因此通常可以明显减小存储空间。
头文件
#include<stdio.h>
#include<stdlib.h>
#include<string.h>//memcpy()、strcpy_s()等C字符串/内存函数
#include<string>//需用到strcpy()
#define MAX_SIZE 100//文件名长度
#define ERROR -1
#define OK 1
哈夫曼树存储结构
/*
 * One node of the Huffman tree. Nodes live in a single array and refer to
 * each other by array index; index 0 is used as "no node".
 */
typedef struct {
	unsigned int weight;   /* weight (occurrence count) of the symbol or subtree */
	unsigned int parent;   /* index of the parent node, 0 if none */
	unsigned int lchild;   /* index of the left child, 0 if none */
	unsigned int rchild;   /* index of the right child, 0 if none */
} HTNode, *HuffmanTree;    /* HuffmanTree: dynamically allocated node array */

/* Dynamically allocated table of NUL-terminated '0'/'1' code strings. */
typedef char** HuffmanCode;
哈夫曼树相关函数
这些函数基本上都是在严蔚敏的《数据结构》上抄过来的
/*
 * Build a Huffman tree and the matching code table for n weighted symbols.
 *
 * HT : out - receives the tree array built by Create_HuffmanTree, or NULL
 *            when no tree was produced.
 * HC : out - receives the code table built by Create_HT_Codelist, or NULL
 *            when no tree was produced.
 * w  : array of n unsigned symbol weights.
 * n  : number of symbols.
 *
 * Neither output is released here; the caller is expected to free them.
 */
void HuffmanCoding(HuffmanTree& HT, HuffmanCode& HC, unsigned int* w, int n) {
	// Give both outputs a defined value up front so the caller never sees an
	// indeterminate pointer if one of the build steps bails out early.
	HT = NULL;
	HC = NULL;
	if (n <= 0 || !w)
		return;

	Create_HuffmanTree(HT, w, n);
	if (!HT)
		return;// no tree was built: there is nothing to derive codes from

	Create_HT_Codelist(HT, HC, n);
}
/*
 * Build a Huffman tree for n weighted symbols.
 *
 * HT : out - receives a calloc'ed array of 2*n nodes. Slot 0 is unused and
 *            zeroed, slots 1..n are the leaves (leaf i carries w[i-1]) and
 *            slots n+1..2n-1 are internal nodes; the root is slot 2n-1.
 *            Set to NULL when n <= 0 or w is NULL.
 * w  : array of n unsigned weights.
 * n  : number of symbols. n == 1 yields a tree whose only node (slot 1) is
 *      both leaf and root.
 *
 * Exits the program with ERROR if the allocation fails.
 */
void Create_HuffmanTree(HuffmanTree& HT, unsigned int* w, int n) {
	if (n <= 0 || !w) {
		HT = NULL;// always leave the output in a defined state
		return;
	}

	int m = 2 * n - 1;// total number of nodes
	// calloc zeroes every field, including the unused slot 0, so all
	// parent/child links start out as "none" and no byte is left uninitialised.
	HT = (HuffmanTree)calloc((size_t)m + 1, sizeof(HTNode));
	if (!HT)
		exit(ERROR);

	for (int i = 1; i <= n; i++)
		HT[i].weight = w[i - 1];

	for (int i = n + 1; i <= m; ++i) {
		// Among HT[1..i-1], pick the two parentless nodes with the smallest
		// weights and join them under the new node i.
		int s1 = 0, s2 = 0;
		Select(HT, i - 1, s1, s2);
		HT[s1].parent = i;
		HT[s2].parent = i;
		HT[i].lchild = s1;// which of the two goes left is not significant
		HT[i].rchild = s2;
		HT[i].weight = HT[s1].weight + HT[s2].weight;
	}
}
/*
 * Derive the '0'/'1' code string of every leaf of a Huffman tree.
 *
 * HT : tree array whose leaves are slots 1..n (slot 0 unused).
 * HC : out - receives an array of n+1 string pointers. HC[0] is NULL and
 *            HC[i] (1 <= i <= n) is a separately allocated, NUL-terminated
 *            code for leaf i. Set to NULL when n <= 0.
 * n  : number of leaves.
 *
 * Codes are produced by walking from each leaf up to the root: a step from a
 * left child contributes '0', from a right child '1'.
 * Exits the program with ERROR on allocation failure or a malformed tree.
 * Nothing allocated for HC is freed here.
 */
void Create_HT_Codelist(HuffmanTree HT, HuffmanCode& HC, int n) {
	if (n <= 0) {
		HC = NULL;
		return;
	}

	HC = (HuffmanCode)calloc((size_t)n + 1, sizeof(char*));// HC[0] stays NULL
	if (!HC)
		exit(ERROR);

	if (n == 1) {
		// With a single symbol the leaf is the root, so the upward walk would
		// give an empty code and an encoder would emit no bits at all.
		// Use the one-bit code "0" so every occurrence is still represented.
		HC[1] = (char*)malloc(2);
		if (!HC[1])
			exit(ERROR);
		HC[1][0] = '0';
		HC[1][1] = '\0';
		return;
	}

	if (!HT)
		exit(ERROR);

	// Scratch buffer: a code has at most n-1 bits plus the terminator.
	char* cd = (char*)malloc((size_t)n);
	if (!cd)
		exit(ERROR);
	cd[n - 1] = '\0';

	for (int i = 1; i <= n; i++) {
		int start = n - 1;// the code is built backwards, ending at cd[n-1]
		unsigned int c = (unsigned int)i;
		for (unsigned int f = HT[i].parent; f != 0; c = f, f = HT[f].parent) {
			if (start == 0)
				exit(ERROR);// deeper than any n-leaf tree can be
			cd[--start] = (HT[f].lchild == c) ? '0' : '1';
		}

		size_t len = (size_t)(n - start);// code length including '\0'
		HC[i] = (char*)malloc(len);
		if (!HC[i])
			exit(ERROR);
		memcpy(HC[i], &cd[start], len);
	}
	free(cd);
}//Create_HT_Codelist
Select函数是自己写的
/*
 * Find the two parentless nodes with the smallest weights among T[1..n].
 *
 * T  : tree array (slot 0 unused).
 * n  : highest slot to examine; must be at least 2.
 * s1 : out - index of the lightest parentless node.
 * s2 : out - index of the next lightest one (s2 != s1, weight(s1) <= weight(s2)).
 *
 * On equal weights the lower index wins. Exits the program with ERROR when
 * n <= 1 or fewer than two parentless nodes exist, so the outputs are never
 * left unassigned.
 */
void Select(HuffmanTree T, int n, int& s1, int& s2) {
	if (n <= 1)
		exit(ERROR);

	// 0 means "nothing found yet" (slot 0 is never a real node). Tracking
	// indices instead of a maximum-value sentinel means no weight value is
	// ever unselectable.
	int first = 0;
	int second = 0;
	for (int i = 1; i <= n; i++) {
		if (T[i].parent != 0)
			continue;// already merged into a subtree
		if (first == 0 || T[i].weight < T[first].weight) {
			second = first;// the previous best becomes the runner-up
			first = i;
		}
		else if (second == 0 || T[i].weight < T[second].weight) {
			second = i;
		}
	}

	if (second == 0)
		exit(ERROR);// fewer than two candidates
	s1 = first;
	s2 = second;
}
压缩函数
void Compress(char in_file_name[], char out_file_name[]) {
FILE* f1,*f2;//f1为待压缩文件指针,f2为压缩后文件指针
fopen_s(&f1, in_file_name, "rb");
if (!f1)
exit(ERROR);
// fseek(f1, 0L, SEEK_SET);
fseek(f1, 0L, SEEK_END);//将文件指针移到末尾
unsigned long long FileSize = ftell(f1);//记录文件总字节数
fseek(f1, 0L, SEEK_SET);//将指针移到开头
unsigned int *FullWeight=(unsigned int *)malloc(256*sizeof(unsigned int));//记录256个字符是否存在及权值
for (int i = 0; i < 256; i++)
FullWeight[i] = 0;//初始化
for (unsigned long long i = 0; i < FileSize; i++)//将文件的全部字符一个个读取出来
FullWeight[(unsigned char)fgetc(f1)]++;//char转换为unsigned
fclose(f1);
int count=0;//字符种数
for (int i = 0; i < 256; i++)
if (FullWeight[i])
count++;
unsigned char* CharSet = (unsigned char*)malloc(count * sizeof(unsigned char));//记录文件含有的字符
unsigned int* WeightSet = (unsigned int*)malloc(count * sizeof(unsigned int));//记录字符的权值
if (!CharSet || !WeightSet)
exit(ERROR);
int j=0;//最后的结果j=count-1
for (int i = 0; i < 256; i++)
if (FullWeight[i]) {
CharSet[j] = i;//字符表种字母顺序为26个英文字母的顺序
WeightSet[j] = FullWeight[i];
j++;
}
free(FullWeight);
HuffmanTree HT;
HuffmanCode HC;
HuffmanCoding(HT, HC, WeightSet, count);
fopen_s(&f2, out_file_name, "wb");//以二进制的形式写入
if(!f2)
exit(ERROR);
fwrite(&count, sizeof(int), 1, f2);//写入字符个数
//fwrite(&FileSize, sizeof(unsigned long long), 1, f2);//写入原文件字节总数
//2n个哈夫曼树元素写入
fwrite(HT, sizeof(HTNode), 2*count, f2);
fwrite(CharSet, sizeof(unsigned char), count, f2);//写入字符表
fwrite(WeightSet, sizeof(unsigned int), count, f2);//写入权值表
//以下开始编码
int offset = 8;//龙哥这里为7,记录一个字节内剩余位数
int a = 0;//字符在CharSet里的位置
unsigned char ReadByte;//读取的一个字节
unsigned char TempByte=0;//暂时存储要写入的编码
unsigned long long BitSize = 0;//记录位数,方便进行解压
fopen_s(&f1, in_file_name, "rb");//以二进制的形式写入
if (!f1)
exit(ERROR);
for (unsigned long long i = 0; i < FileSize; i++) {
fread(&ReadByte, sizeof(unsigned char), 1, f1);
a = 0;
for (;; a++)
if (CharSet[a] == ReadByte)
break;
for (int b = 0; HC[a+1][b]; b++) {
//若读到'\0',则跳出
//第一个没用,为HC[a+1][b]
TempByte = (TempByte << 1) | (HC[a+1][b] - '0');//利用位或TempByte左移一位并写入一位
BitSize++;
offset--;
if (offset == 0) {
//字节8位已填满
offset = 8;//重置为8位
fwrite(&TempByte, sizeof(unsigned char), 1, f2);
TempByte = 0;
}
}
}
if (offset != 8) {
//若最后一个字节用不完,也要强行用到8位
TempByte <<= offset;
fwrite(&TempByte, sizeof(unsigned char), 1, f2);
}
fwrite(&BitSize, sizeof(unsigned long long), 1, f2);//将位数写在文件的最后
fclose(f1);
fclose(f2);
free(HT);
free(HC);
free(CharSet);
free(WeightSet);
printf("已成功压缩!");
}
压缩后的文件内容按顺序为:文件的字符种数count、哈夫曼树HT(共2*count个结点,含未使用的0号单元)、字符表CharSet、权值表WeightSet、压缩后的编码内容、编码内容的总位数BitSize(不含最后一个字节的补位)
解压函数
void Decompress(char in_file_name[], char out_file_name[]) {
int count;
FILE * f1, * f2;//f1为in,f2为out
// HuffmanCode HC;
HuffmanTree HT;
unsigned long long BitSize;
unsigned char* CharSet;
unsigned int* WeightSet;
fopen_s(&f1, in_file_name, "rb");//用“rb”
if (!f1)
exit(ERROR);
fread(&count, sizeof(int), 1, f1);//读取字符种数count
HT = (HuffmanTree)malloc(2 * count * sizeof(HTNode));
CharSet = (unsigned char*)malloc(count * sizeof(unsigned char));
WeightSet = (unsigned int*)malloc(count * sizeof(unsigned int));
if ((!HT) || (!CharSet) || (!WeightSet))
exit(ERROR);
fread(HT, sizeof(HTNode), 2 * count, f1);//读取哈夫曼树
fread(CharSet, sizeof(unsigned char), count, f1);//读取字符表
fread(WeightSet, sizeof(unsigned int), count, f1);//读取权值表
// Create_HT_Codelist(HT, HC, count);//建立编码表
fseek(f1, -1L * sizeof(unsigned long long), SEEK_END);//跳到末尾
fread(&BitSize, sizeof(unsigned long long), 1, f1);//读取位数
//跳过文件头
fseek(f1, sizeof(int) + 2 * count * sizeof(HTNode) + count * sizeof(unsigned char) + count * sizeof(unsigned int), SEEK_SET);
//fseek(Enc, (long)(sizeof(unsigned char) + 2 * count * sizeof(HTNode) + count * sizeof(unsigned char) + count * sizeof(unsigned int)), SEEK_SET);
unsigned char TempByte = 0;
unsigned char ReadByte;
int offset = 8;
int Index = 2 * count - 1;//根节点
fopen_s(&f2, out_file_name, "wb");//打开目标文件
if (!f2)
exit(ERROR);
fread(&ReadByte, sizeof(unsigned char), 1, f1);
for (unsigned long long a = 0; a < BitSize; a++) {
TempByte = 1 & (ReadByte >> 7);//位与判断ReadByte的位为1还是0
if (TempByte)//从根结点开始寻找
Index = HT[Index].rchild;
else
Index = HT[Index].lchild;
if ((!HT[Index].lchild) && (!HT[Index].rchild)) {
//遇到了根节点
fwrite(&CharSet[Index - 1], sizeof(unsigned char), 1, f2);//由于HT有0号单元,故需-1
Index = 2 * count - 1;
}
offset--;
ReadByte = ReadByte << 1;//舍弃第一位
if (offset == 0) {
//字节的8位用完
fread(&ReadByte, sizeof(unsigned char), 1, f1);
offset = 8;
}
}
free(CharSet);
free(WeightSet);
free(HT);
fclose(f1);
fclose(f2);
printf("解压成功!");
}
主函数
主函数相当简单
/*
 * Print prompt, read one whitespace-delimited path of at most MAX_SIZE-1
 * characters into buf, and discard the rest of the input line.
 * Returns 1 if a path was read, 0 otherwise (including end of input).
 */
static int ReadPathArg(const char* prompt, char* buf) {
	printf("%s", prompt);
	int got = scanf_s("%s", buf, (unsigned)MAX_SIZE);
	int c;
	while ((c = getchar()) != '\n' && c != EOF) {
		// drop the newline and anything else left on the line
	}
	return got == 1;
}

/*
 * Interactive menu: 1 compresses a file, 2 decompresses one, 3 quits.
 * The loop also ends when standard input reaches end of file.
 */
int main() {
	puts("--------------------------欢迎使用Jay牌压缩程序---------------------------");
	puts("输入数字“1”进行压缩");
	puts("输入数字“2”进行解压");
	puts("输入数字“3”则退出");
	puts("--------------------------------------------------------------------------");
	char in_file_name[MAX_SIZE];// file to compress / decompress
	char out_file_name[MAX_SIZE];// resulting file
	for (;;) {
		printf("输入选项:");
		int mark = 0;
		int got = scanf_s("%d", &mark);
		if (got == EOF)
			return 0;// no more input: stop instead of looping forever
		int c;
		while ((c = getchar()) != '\n' && c != EOF) {
			// discard the rest of the line, including non-numeric input
		}
		if (got != 1)
			mark = 0;// not a number: fall through to the "invalid" branch

		switch (mark) {
		case 1:
			if (ReadPathArg("请输入待压缩的文件路径:", in_file_name)
				&& ReadPathArg("请输入压缩后的文件路径:", out_file_name))
				Compress(in_file_name, out_file_name);
			break;
		case 2:
			if (ReadPathArg("请输入待解压的文件路径:", in_file_name)
				&& ReadPathArg("请输入解压后的文件路径:", out_file_name))
				Decompress(in_file_name, out_file_name);
			break;
		case 3:
			return 0;
		default:
			printf("请输入有效数字!");
		}
		putchar('\n');
	}
}