【数据结构】文件“压缩”——对.txt文件进行哈夫曼编码

Matr1x_Yu

于 2024-05-01 20:32:24 发布

阅读量419

点赞数 9

文章标签：数据结构 c语言 c++ 算法 霍夫曼树

本文链接：https://blog.csdn.net/yuzexuan666/article/details/138377061

版权

零、前言

此代码仅针对原文字符进行重新编码，但是最终的输出仍在.txt中。也就是说，虽然最终的文件中只含有01，但是每一个01实际为char类型，而不是真正意义上的二进制。

读者可以略微修改代码即可实现真正的二进制保存。

另一方面，等效文件压缩率应为 (“压缩”后的文件大小/8) / 原文件大小。此处除以 8 是因为输出文件中的每个 0/1 都是用一个 char 保存的，而一个 char 占 1 字节（8 bit）；若真正按位保存，每个 0/1 只需 1 bit。

一、主要模块

0.禁用警告

不同 IDE、不同版本的编译器给出的警告和报错会有所不同。下面这个宏只对 MSVC（Visual Studio）有效：它用于关闭 fopen、strcpy 等 C 标准库函数触发的“不安全函数”警告（C4996）；在其他编译器上该宏不起任何作用。

#define _CRT_SECURE_NO_WARNINGS // 禁用警告

1.结构体定义

// Node of the Huffman tree. Nodes are stored in one array and refer to each
// other by array index; -1 is used as the "no such node" index.
typedef struct hfmnode
{
    int  data;           // byte value of the symbol (0..255), set for leaf nodes
    int  weight;         // weight of the node, i.e. number of occurrences
    int  lc;             // index of the left child
    int  rc;             // index of the right child
    int  parent;         // index of the parent node
    char code[50];       // Huffman code as a NUL-terminated '0'/'1' string (at most 49 characters)
} HTNode;

2.主函数

// Program entry point: builds a Huffman code for the bytes of source.txt and
// writes the code table followed by the encoded text (as '0'/'1' characters)
// to dest.txt.
// Returns 0 on success, 1 when a file cannot be opened or memory cannot be
// allocated.
int main() {

    // Open the source file in binary mode so every byte is read unmodified.
    FILE* source_ptr = fopen("source.txt", "rb");
    if (source_ptr == NULL) {
        printf("无法打开源文件。\n");
        return 1;
    }

    // fgetc returns an int: a byte value in 0..255 or EOF. The result must be
    // kept in an int; storing it in a char makes the byte 0xFF look like EOF
    // (signed char) or makes EOF undetectable (unsigned char).
    int ch;
    int freq[256] = { 0 }; // occurrence count of every possible byte value

    // Count how often each byte occurs in the source file.
    while ((ch = fgetc(source_ptr)) != EOF) {
        freq[ch]++;
    }
    fclose(source_ptr);

    // Number of distinct byte values, i.e. number of leaves of the tree.
    int SIZE = CountChar(freq);

    // Create (or truncate) the destination file, then close it right away:
    // SaveCodeToFile and EncodeFile reopen it by name, so no second handle
    // should stay open while they write.
    FILE* dest_ptr = fopen("dest.txt", "w");
    if (dest_ptr == NULL) {
        printf("无法打开目标文件。\n");
        return 1;
    }
    fclose(dest_ptr);

    // An empty source file has no symbols: 2 * SIZE - 1 would be negative,
    // so stop here and leave the (empty) destination file in place.
    if (SIZE == 0) {
        printf("源文件为空，已生成空的 dest.txt 文件。\n");
        return 0;
    }

    // A Huffman tree with SIZE leaves has exactly 2 * SIZE - 1 nodes.
    HTNode* ht = (HTNode*)malloc(sizeof(HTNode) * (2 * SIZE - 1));
    if (ht == NULL) {
        printf("内存分配失败。\n");
        return 1;
    }

    // Reset all nodes, load the leaves, then build the tree and the codes.
    InitTree(ht, SIZE);
    Import(freq, ht, SIZE);
    UnionNode(ht, SIZE);

    // Write the code table first, then append the encoded text.
    SaveCodeToFile("dest.txt", ht, SIZE);
    EncodeFile("source.txt", "dest.txt", ht, SIZE);

    free(ht);

    printf("成功将字符频次信息保存到 dest.txt 文件中，并进行编码替换。\n");

    return 0;
}

4.生成哈夫曼树（核心）

思路：每次找到两个 “权重最小” && “parent=-1” 的根节点，将两棵树结合为一棵树，结合后的树的根节点的权值是两子树根节点权值之和，并修改涉及的三个节点的索引。重复此过程，直到森林中只剩下一棵树。

// Builds the Huffman tree and the code of every leaf.
//
// ht   : array of 2 * SIZE - 1 nodes; entries 0..SIZE-1 are the leaves and
//        must already carry their data and a non-zero weight. Entries
//        SIZE..2*SIZE-2 are filled in here as internal nodes, the last one
//        being the root.
// SIZE : number of leaves. Nothing is done when SIZE <= 0.
//
// The links (lc / rc / parent) and codes of all nodes are reset first, so the
// function can be called on an array that was used before.
void UnionNode(HTNode* ht, int SIZE)
{
    if (SIZE <= 0) {
        return;
    }

    const int total = 2 * SIZE - 1;

    for (int i = 0; i < total; i++)
    {
        ht[i].lc = -1;
        ht[i].rc = -1;
        ht[i].parent = -1;
        memset(ht[i].code, 0, sizeof(ht[i].code));
    }

    // Each pass merges the two lightest parentless nodes into the node "next".
    for (int next = SIZE; next < total; next++)
    {
        int min1 = -1; // index of the lightest root seen so far
        int min2 = -1; // index of the second lightest root seen so far

        for (int i = 0; i < next; i++)
        {
            if (ht[i].parent != -1 || ht[i].weight == 0) {
                continue; // already merged, or not a usable node
            }
            if (min1 == -1 || ht[i].weight < ht[min1].weight)
            {
                min2 = min1;
                min1 = i;
            }
            else if (min2 == -1 || ht[i].weight < ht[min2].weight)
            {
                min2 = i;
            }
        }

        // Fewer than two roots can only happen when a leaf has weight 0,
        // which violates the precondition; stop instead of indexing ht[-1].
        if (min1 == -1 || min2 == -1) {
            return;
        }

        ht[next].weight = ht[min1].weight + ht[min2].weight;
        ht[next].lc = min1;
        ht[next].rc = min2;
        ht[min1].parent = next;
        ht[min2].parent = next;
    }

    if (SIZE == 1) {
        // A single symbol has no merge step; give it the one-bit code "0".
        strcpy(ht[0].code, "0");
    }
    else {
        // One traversal from the final root assigns every leaf its code.
        // Traversing after each merge would only produce codes that the
        // last traversal overwrites anyway.
        char code[256];
        GenerateHuffmanCode(ht, total - 1, code, 0);
    }
}

5.利用递归进行哈夫曼编码

思路：哈夫曼编码类似于树的先序遍历，对根节点进行判断--左孩子--右孩子。

// Assigns Huffman codes to all leaves below ht[root] by a pre-order walk.
//
// ht    : node array of the tree.
// root  : index of the subtree root to process.
// code  : scratch buffer holding the path from the top-level root to the
//         current node; it must have room for at least depth + 1 characters.
// depth : number of characters of "code" that are already valid.
//
// Going to the left child appends '0', going to the right child appends '1'.
// A code that does not fit into the leaf's code field is truncated and a
// warning is printed, instead of writing past the end of the field.
void GenerateHuffmanCode(HTNode* ht, int root, char* code, int depth)
{
    if (ht[root].lc == -1 && ht[root].rc == -1) // leaf node
    {
        code[depth] = '\0'; // terminate the path collected so far

        // Bounded copy: snprintf never writes more than the field can hold
        // and always NUL-terminates the result.
        int written = snprintf(ht[root].code, sizeof ht[root].code, "%s", code);
        if (written < 0 || (size_t)written >= sizeof ht[root].code) {
            fprintf(stderr, "哈夫曼编码过长，已被截断。\n");
        }
        return;
    }

    code[depth] = '0'; // left subtree
    GenerateHuffmanCode(ht, ht[root].lc, code, depth + 1);

    code[depth] = '1'; // right subtree
    GenerateHuffmanCode(ht, ht[root].rc, code, depth + 1);
}

二、效果

1.对整个代码进行压缩

可以看出，空格(0010_0000)的编码最短，即其出现频率最高，符合在代码中的出现频率。

原文件

“压缩”后文件

$\eta =\frac{32.5/8}{5.72}=0.71$

2.对单字符文件压缩

$\eta =\frac{3.24/8}{3.25}=0.125$

可以看出，由于字符数量足够多，可以忽略单字符编码所占空间，与预期相符。

此情况下的压缩率也是哈夫曼编码的理论最大压缩率。

3.对一段文本压缩

在这段文本中，可以看出出现频次最高的两个字符为 “空格” (0010_0000) 与 “e” (0110_0101)。

$\eta =\frac{70.1/8}{16.1}=0.544$

三、完整代码

#define _CRT_SECURE_NO_WARNINGS // 禁用警告

#include "stdio.h"
#include "stdlib.h"
#include "string.h"

// Node of the Huffman tree. Nodes are stored in one array and refer to each
// other by array index; -1 is used as the "no such node" index.
typedef struct hfmnode
{
    int  data;           // byte value of the symbol (0..255), set for leaf nodes
    int  weight;         // weight of the node, i.e. number of occurrences
    int  lc;             // index of the left child
    int  rc;             // index of the right child
    int  parent;         // index of the parent node
    char code[50];       // Huffman code as a NUL-terminated '0'/'1' string (at most 49 characters)
} HTNode;

// Function declarations
// Resets every one of the 2 * SIZE - 1 nodes to an empty, unlinked state.
void InitTree(HTNode* ht, int SIZE);
// Prints all 2 * SIZE - 1 nodes to stdout (debugging aid).
void PrintTree(HTNode* ht, int SIZE);
// Builds the Huffman tree over the SIZE leaves and fills in the leaf codes.
void UnionNode(HTNode* ht, int SIZE);
// Returns how many of the 256 entries of freq are non-zero.
int CountChar(int* freq);
// Copies every byte value with a non-zero count into the leading nodes of ht.
void Import(int* freq, HTNode* ht, int SIZE);
// Recursively assigns '0'/'1' code strings to the leaves below ht[root].
void GenerateHuffmanCode(HTNode* ht, int root, char* code, int depth);
// Appends the Huffman-encoded content of sourceFile to destFile.
void EncodeFile(const char* sourceFile, const char* destFile, HTNode* ht, int SIZE);
// Writes one "<8-bit byte value> <code>" line per leaf to destFile.
void SaveCodeToFile(const char* destFile, HTNode* ht, int SIZE);

// Program entry point: builds a Huffman code for the bytes of source.txt and
// writes the code table followed by the encoded text (as '0'/'1' characters)
// to dest.txt.
// Returns 0 on success, 1 when a file cannot be opened or memory cannot be
// allocated.
int main() {

    // Open the source file in binary mode so every byte is read unmodified.
    FILE* source_ptr = fopen("source.txt", "rb");
    if (source_ptr == NULL) {
        printf("无法打开源文件。\n");
        return 1;
    }

    // fgetc returns an int: a byte value in 0..255 or EOF. The result must be
    // kept in an int; storing it in a char makes the byte 0xFF look like EOF
    // (signed char) or makes EOF undetectable (unsigned char).
    int ch;
    int freq[256] = { 0 }; // occurrence count of every possible byte value

    // Count how often each byte occurs in the source file.
    while ((ch = fgetc(source_ptr)) != EOF) {
        freq[ch]++;
    }
    fclose(source_ptr);

    // Number of distinct byte values, i.e. number of leaves of the tree.
    int SIZE = CountChar(freq);

    // Create (or truncate) the destination file, then close it right away:
    // SaveCodeToFile and EncodeFile reopen it by name, so no second handle
    // should stay open while they write.
    FILE* dest_ptr = fopen("dest.txt", "w");
    if (dest_ptr == NULL) {
        printf("无法打开目标文件。\n");
        return 1;
    }
    fclose(dest_ptr);

    // An empty source file has no symbols: 2 * SIZE - 1 would be negative,
    // so stop here and leave the (empty) destination file in place.
    if (SIZE == 0) {
        printf("源文件为空，已生成空的 dest.txt 文件。\n");
        return 0;
    }

    // A Huffman tree with SIZE leaves has exactly 2 * SIZE - 1 nodes.
    HTNode* ht = (HTNode*)malloc(sizeof(HTNode) * (2 * SIZE - 1));
    if (ht == NULL) {
        printf("内存分配失败。\n");
        return 1;
    }

    // Reset all nodes, load the leaves, then build the tree and the codes.
    InitTree(ht, SIZE);
    Import(freq, ht, SIZE);
    UnionNode(ht, SIZE);

    // Write the code table first, then append the encoded text.
    SaveCodeToFile("dest.txt", ht, SIZE);
    EncodeFile("source.txt", "dest.txt", ht, SIZE);

    free(ht);

    printf("成功将字符频次信息保存到 dest.txt 文件中，并进行编码替换。\n");

    return 0;
}

// Writes the code table to destFile, one line per leaf:
//     <byte value as 8 binary digits> <Huffman code>\n
//
// destFile : path of the file; it is opened in "r+" mode, so it must already
//            exist, and writing starts at the beginning of the file.
// ht       : node array whose first SIZE entries are the leaves.
// SIZE     : number of leaves.
//
// Prints a message and returns without writing when the file cannot be opened.
void SaveCodeToFile(const char* destFile, HTNode* ht, int SIZE)
{
    FILE* out = fopen(destFile, "r+");
    if (out == NULL) {
        printf("无法打开目标文件。\n");
        return;
    }

    for (int i = 0; i < SIZE; i++) {
        const HTNode* leaf = &ht[i];

        // Render the low 8 bits of the byte value, most significant bit first.
        char bits[9];
        for (int pos = 0; pos < 8; pos++) {
            bits[pos] = (char)(((leaf->data >> (7 - pos)) & 1) + '0');
        }
        bits[8] = '\0';

        fprintf(out, "%s ", bits);
        fprintf(out, "%s", leaf->code); // the code is already a '0'/'1' string
        fprintf(out, "\n");
    }

    fclose(out);
}

// Puts all 2 * SIZE - 1 nodes of ht into their initial state: data and weight
// are 0, the three link indices are -1 and the code is an all-zero buffer.
void InitTree(HTNode* ht, int SIZE)
{
    // The nodes are independent of each other, so the order does not matter;
    // walk from the last node down to the first.
    for (int idx = 2 * SIZE - 2; idx >= 0; idx--)
    {
        HTNode* node = &ht[idx];

        memset(node->code, 0, sizeof(node->code));
        node->parent = -1;
        node->rc = -1;
        node->lc = -1;
        node->weight = 0;
        node->data = 0;
    }
}

// Builds the Huffman tree and the code of every leaf.
//
// ht   : array of 2 * SIZE - 1 nodes; entries 0..SIZE-1 are the leaves and
//        must already carry their data and a non-zero weight. Entries
//        SIZE..2*SIZE-2 are filled in here as internal nodes, the last one
//        being the root.
// SIZE : number of leaves. Nothing is done when SIZE <= 0.
//
// The links (lc / rc / parent) and codes of all nodes are reset first, so the
// function can be called on an array that was used before.
void UnionNode(HTNode* ht, int SIZE)
{
    if (SIZE <= 0) {
        return;
    }

    const int total = 2 * SIZE - 1;

    for (int i = 0; i < total; i++)
    {
        ht[i].lc = -1;
        ht[i].rc = -1;
        ht[i].parent = -1;
        memset(ht[i].code, 0, sizeof(ht[i].code));
    }

    // Each pass merges the two lightest parentless nodes into the node "next".
    for (int next = SIZE; next < total; next++)
    {
        int min1 = -1; // index of the lightest root seen so far
        int min2 = -1; // index of the second lightest root seen so far

        for (int i = 0; i < next; i++)
        {
            if (ht[i].parent != -1 || ht[i].weight == 0) {
                continue; // already merged, or not a usable node
            }
            if (min1 == -1 || ht[i].weight < ht[min1].weight)
            {
                min2 = min1;
                min1 = i;
            }
            else if (min2 == -1 || ht[i].weight < ht[min2].weight)
            {
                min2 = i;
            }
        }

        // Fewer than two roots can only happen when a leaf has weight 0,
        // which violates the precondition; stop instead of indexing ht[-1].
        if (min1 == -1 || min2 == -1) {
            return;
        }

        ht[next].weight = ht[min1].weight + ht[min2].weight;
        ht[next].lc = min1;
        ht[next].rc = min2;
        ht[min1].parent = next;
        ht[min2].parent = next;
    }

    if (SIZE == 1) {
        // A single symbol has no merge step; give it the one-bit code "0".
        strcpy(ht[0].code, "0");
    }
    else {
        // One traversal from the final root assigns every leaf its code.
        // Traversing after each merge would only produce codes that the
        // last traversal overwrites anyway.
        char code[256];
        GenerateHuffmanCode(ht, total - 1, code, 0);
    }
}

// Assigns Huffman codes to all leaves below ht[root] by a pre-order walk.
//
// ht    : node array of the tree.
// root  : index of the subtree root to process.
// code  : scratch buffer holding the path from the top-level root to the
//         current node; it must have room for at least depth + 1 characters.
// depth : number of characters of "code" that are already valid.
//
// Going to the left child appends '0', going to the right child appends '1'.
// A code that does not fit into the leaf's code field is truncated and a
// warning is printed, instead of writing past the end of the field.
void GenerateHuffmanCode(HTNode* ht, int root, char* code, int depth)
{
    if (ht[root].lc == -1 && ht[root].rc == -1) // leaf node
    {
        code[depth] = '\0'; // terminate the path collected so far

        // Bounded copy: snprintf never writes more than the field can hold
        // and always NUL-terminates the result.
        int written = snprintf(ht[root].code, sizeof ht[root].code, "%s", code);
        if (written < 0 || (size_t)written >= sizeof ht[root].code) {
            fprintf(stderr, "哈夫曼编码过长，已被截断。\n");
        }
        return;
    }

    code[depth] = '0'; // left subtree
    GenerateHuffmanCode(ht, ht[root].lc, code, depth + 1);

    code[depth] = '1'; // right subtree
    GenerateHuffmanCode(ht, ht[root].rc, code, depth + 1);
}

// Encodes sourceFile byte by byte and appends the resulting '0'/'1' text to
// destFile.
//
// sourceFile : path of the file to encode; opened in binary read mode.
// destFile   : path of the output file; opened in append mode.
// ht         : node array whose first SIZE entries are the leaves, each with
//              its byte value in data and its code string in code.
// SIZE       : number of leaves.
//
// Bytes that have no leaf in ht are skipped. Prints a message and returns
// without encoding when either file cannot be opened.
void EncodeFile(const char* sourceFile, const char* destFile, HTNode* ht, int SIZE)
{
    FILE* source_ptr = fopen(sourceFile, "rb");
    FILE* dest_ptr = fopen(destFile, "a");

    if (source_ptr == NULL || dest_ptr == NULL) {
        printf("无法打开文件。\n");
        // Close whichever stream did open; fclose(NULL) is not allowed.
        if (source_ptr != NULL) {
            fclose(source_ptr);
        }
        if (dest_ptr != NULL) {
            fclose(dest_ptr);
        }
        return;
    }

    // Byte value -> code string. Filled from the last leaf to the first so
    // that, should a byte value occur twice, the lowest index wins, exactly
    // as a front-to-back search over the leaves would decide.
    const char* table[256] = { NULL };
    for (int i = SIZE - 1; i >= 0; i--) {
        if (ht[i].data >= 0 && ht[i].data < 256) {
            table[ht[i].data] = ht[i].code;
        }
    }

    // fgetc returns an int (0..255 or EOF); an int variable keeps the byte
    // 0xFF distinguishable from EOF.
    int ch;
    while ((ch = fgetc(source_ptr)) != EOF) {
        if (table[ch] != NULL) {
            fprintf(dest_ptr, "%s", table[ch]);
        }
    }

    fclose(source_ptr);
    fclose(dest_ptr);
}

// Returns the number of non-zero entries among the 256 counters in freq,
// i.e. how many distinct byte values were seen.
int CountChar(int* freq)
{
    int kinds = 0;
    const int* end = freq + 256;

    for (const int* p = freq; p != end; ++p) {
        if (*p != 0) {
            ++kinds;
        }
    }
    return kinds;
}

// Loads the leaves of the tree from the frequency table.
//
// freq : 256 occurrence counters, indexed by byte value.
// ht   : node array; entries 0, 1, 2, ... receive the byte values with a
//        non-zero count, in ascending byte order.
// SIZE : number of leaves ht has room for. At most SIZE leaves are written,
//        so a SIZE smaller than the number of non-zero counters cannot make
//        the function write past the leaf area.
void Import(int* freq, HTNode* ht, int SIZE)
{
    int j = 0; // next free leaf slot
    for (int i = 0; i < 256 && j < SIZE; i++)
    {
        if (freq[i])
        {
            ht[j].data = i;
            ht[j].weight = freq[i];
            j++;
        }
    }
}

// Debugging aid: prints one line per node for all 2 * SIZE - 1 nodes of ht,
// showing data, weight, child and parent indices and the code string.
void PrintTree(HTNode* ht, int SIZE)
{
    const int total = 2 * SIZE - 1;
    int idx = 0;

    while (idx < total)
    {
        const HTNode* node = &ht[idx];
        printf("data: %d  freq: %d  lc: %d  rc: %d  parent: %d code: %s\n",
            node->data, node->weight, node->lc, node->rc, node->parent, node->code);
        idx++;
    }
}