哈夫曼树实现文件压缩

最新推荐文章于 2024-05-11 10:30:00 发布

今天也要写bug、

最新推荐文章于 2024-05-11 10:30:00 发布

阅读量3.3k

点赞数 11

分类专栏：数据结构（C++实现）文章标签：算法 c++ 数据结构

本文链接：https://blog.csdn.net/qq_52670477/article/details/126800695

版权

数据结构（C++实现）专栏收录该内容

19 篇文章 5 订阅

订阅专栏

文章目录

哈夫曼树
用哈夫曼树压缩文件
- 压缩
- 解压缩
全部代码

哈夫曼树

哈夫曼树的数据结构很简单，每一次都将权值最小的两个的节点合成一个更大的节点，然后放到优先级队列里面，如此操作之后，所有的节点都在叶子节点。

#pragma once
#include <iostream>
#include <queue>
#include <vector>
using namespace std;

template <class W>
struct HuffmanTreeNode
{
    HuffmanTreeNode(const W &weight)
        : _pLeft(NULL), _pRight(NULL), _pParent(NULL), _weight(weight)
    {
    }
    HuffmanTreeNode<W> *_pLeft;
    HuffmanTreeNode<W> *_pRight;
    HuffmanTreeNode<W> *_pParent;
    W _weight;
};

template <class W>
class HuffmanTree
{
    typedef HuffmanTreeNode<W> *PNode;

private:
    PNode _pRoot;

public:
    HuffmanTree()
        : _pRoot(NULL)
    {
    }
    HuffmanTree(W *array, size_t size)
    {
        _CreateHuffmantree(array, size);
    }
    void _Destroy(PNode &pRoot)
    {
        //后序
        if (pRoot)
        {
            _Destroy(pRoot->_pLeft);
            _Destroy(pRoot->_pRight);
            delete pRoot;
            pRoot = NULL;
        }
    }
    ~HuffmanTree()
    {
        _Destroy(_pRoot);
    }
    PNode GetRoot()
    {
        return _pRoot;
    }

private:
    //构建哈夫曼树
    void _CreateHuffmantree(W *array, size_t size)
    {

        struct PtrNodeCompare
        {
            bool operator()(PNode n1, PNode n2) //重载“（）”
            {
                return n1->_weight < n2->_weight;
            }
        };
        priority_queue<PNode, vector<PNode>, PtrNodeCompare> hp; //

        for (size_t i = 0; i < size; ++i)
        {

            hp.push(new HuffmanTreeNode<W>(array[i])); // 数据入堆
        }
        //空堆
        if (hp.empty())
            _pRoot = NULL;

        while (hp.size() > 1) // 取两个出来只回收1个
        {
            PNode pLeft = hp.top();
            hp.pop();
            PNode pRight = hp.top();
            hp.pop();
            PNode pParent = new HuffmanTreeNode<W>(pLeft->_weight + pRight->_weight); //左加右的权值，作为新节点
            pParent->_pLeft = pLeft;
            pLeft->_pParent = pParent;

            pParent->_pRight = pRight;
            pRight->_pParent = pParent;
            hp.push(pParent);
        }
        _pRoot = hp.top(); // 根节点在堆顶
    }
};

用哈夫曼树压缩文件

根据哈夫曼树的特性，我们首先需要统计文件中所有字符（char）的出现次数，生成对应的“码表”（字符+次数），然后码表构建哈夫曼树。码表是我们用来压缩和解压缩的重要数据结构。

以下是对应的字符结构体:

struct CharInfo
{
    unsigned char _ch;   //字符 比如字符a
    unsigned int _count; //字符次数 比如字符a出现45
    string _code;        //对应的哈夫曼编码 字符a构建的哈夫曼编码为011100

    bool operator!=(const CharInfo &info)
    {
        return _count != info._count;
    }
    CharInfo operator+(const CharInfo &info)
    {
        CharInfo ret;
        ret._count = _count + info._count;
        return ret;
    }
    bool operator>(const CharInfo &info)
    {
        return _count > info._count;
    }
};

在文件压缩和解压缩的类型中，有以下成员变量和函数：

//获取文件的大小
unsigned long getFileSize(const char *path)
{
    unsigned long filesize = -1;
    FILE *fp;
    fp = fopen(path, "r");
    if (fp == NULL)
    {
        return filesize;
    }
    fseek(fp, 0L, SEEK_END);
    filesize = ftell(fp);
    fclose(fp);
    return filesize;
}

struct CharInfo
{
    unsigned char _ch;   //字符 a
    unsigned int _count; //字符次数 45
    string _code;        //对应的哈夫曼编码 011100

    bool operator!=(const CharInfo &info)
    {
        return _count != info._count;
    }
    CharInfo operator+(const CharInfo &info)
    {
        CharInfo ret;
        ret._count = _count + info._count;
        return ret;
    }
    bool operator>(const CharInfo &info)
    {
        return _count > info._count;
    }
};

class FileCompressAndUnCompress
{
    typedef HuffmanTreeNode<CharInfo> Node;
    struct TmpInfo
    {
        unsigned char _ch;   //字符
        unsigned int _count; //次数
    };

protected:
    CharInfo _infos[256];

public:
    //构造函数
    FileCompressAndUnCompress()
    {
        for (size_t i = 0; i < 256; ++i)
        {
            _infos[i]._ch = i;
            _infos[i]._count = 0;
        }
    }
    //获取哈夫曼编码
    void GenerateHuffmanCode(Node *root, string code)
    {
        if (root == NULL)
            return;
        //前序遍历生成编码
        if (root->_pLeft == NULL && root->_pRight == NULL)
        {
            _infos[(unsigned char)root->_weight._ch]._code = code; // 码值
            return;
        }
        GenerateHuffmanCode(root->_pLeft, code + '0');
        GenerateHuffmanCode(root->_pRight, code + '1');
    }
    void Compress(const char *file); 
    void Uncompress(const char *file);
};

压缩

    void Compress(const char *file) // file:要压缩的文件
    {
        //获取文件的大小
        unsigned long fileSize = getFileSize(file);
        // 1.统计字符出现的次数
        FILE *fout = fopen(file, "rb");
        assert(fout);
        char ch = fgetc(fout);
        while (feof(fout) == 0) //文件结束，则返回值为1，否则为0
        {
            _infos[(unsigned char)ch]._count++; // 计算对应字符出现的频率
            ch = fgetc(fout);
        }
        // 2.生成Huffmantree 及code
        // 2.1 生成Huffmantree， 构建哈夫曼树
        HuffmanTree<CharInfo> tree(_infos, 256);

        string compressfile = file;
        compressfile += ".huffman";
        FILE *fin = fopen(compressfile.c_str(), "wb"); //打开压缩文件
        assert(fin);

        string code;
        //  2.2 根据哈夫曼树每个字符对应的code
        GenerateHuffmanCode(tree.GetRoot(), code);
        // 2.3 将码表写入压缩文件中
        int writeNum = 0;
        int objSize = sizeof(TmpInfo);
        for (size_t i = 0; i < 256; ++i)
        {
            if (_infos[i]._count > 0)
            {
                TmpInfo info;
                info._ch = _infos[i]._ch;
                info._count = _infos[i]._count;
                printf("codec ch:%u, cout:%u\n", (unsigned char)info._ch, info._count);
                fwrite(&info, objSize, 1, fin);
                writeNum++;
            }
        }
        //把info._count = -1写进去作为数据字典的结束标志位
        TmpInfo info;
        info._count = -1;
        printf("code objSize:%d\n", objSize);
        fwrite(&info, objSize, 1, fin);

        // 3.开始压缩文件
        int writeCount = 0;
        int readCount = 0;
        //文件指针回到开头
        fseek(fout, 0, SEEK_SET); //文件指针、偏移量、参照位置
        ch = fgetc(fout);
        readCount++;
        unsigned char value = 0;
        size_t pos = 0;
        while (feof(fout) == 0) // 一个个字符读取
        {
            // 读取数据，查找对应编码
            string &code = _infos[(unsigned char)ch]._code; // 查找对应的编码
            printf("code[%d]:%u:%s\n", readCount, (unsigned char)ch, code.c_str());
            // 根据对应的编码，修改对应的value值
            for (size_t i = 0; i < code.size(); ++i)
            {
                if (code[i] == '1')
                {
                    value |= (1 << pos);
                }
                else if (code[i] != '0')
                {
                    assert(false);
                    printf("assert(false); ??????????");
                }
                ++pos;
                if (pos == 8)
                {
                    writeCount++;
                    // 够8个bit就存储一次，不用担心code是否全部读完
                    fputc(value, fin);
                    value = 0;
                    pos = 0;
                }
            }
            readCount++;
            ch = fgetc(fout);
        }
        //最后可能不够8bit，也存储进去
        if (pos > 0)
        {
            writeCount++;
            fputc(value, fin); //写入压缩文件（fin）
        }
        printf("码表的大小:%d字节\n", objSize * (writeNum + 1));
        printf("压缩后的文件大小:%d字节\n", writeCount);
        unsigned long totalSize = writeCount + objSize * (writeNum + 1);
        printf("压缩后总文件大小:%lu字节, 原始文件大小:%lu字节, 压缩率:%0.2f\n",
               totalSize, fileSize, (float)(totalSize * 1.0 / fileSize));
        fclose(fout);
        fclose(fin);
    }

以下面的测试文件为例：
在这里插入图片描述

在这里插入图片描述

如果压缩文件中相同字符的重复率很高，那压缩效率会大幅提高：
以全部为1的文件为例：
在这里插入图片描述

在这里插入图片描述

解压缩

    void Uncompress(const char *file) //要解压缩的文件
    {
        string uncompressfile = file;
        //找到最后一个'.'
        size_t pos = uncompressfile.rfind('.');
        assert(pos != string::npos);
        //删除掉".huffman"后缀
        uncompressfile.erase(pos);
        // uncompressfile += ".unhuffman";
        FILE *fin = fopen(uncompressfile.c_str(), "wb"); //打开解压缩文件
        assert(fin);
        FILE *fout = fopen(file, "rb"); //打开压缩文件
        assert(fout);

        // 读入码表
        TmpInfo info;
        int cycleNum = 1;
        int objSize = sizeof(TmpInfo);
        fread(&info, objSize, 1, fout);

        while (info._count != -1) //-1为结束标志
        {
            _infos[(unsigned char)info._ch]._ch = info._ch;
            _infos[(unsigned char)info._ch]._count = info._count;

            fread(&info, objSize, 1, fout);
            cycleNum++;
        }

        //根据码表重建huaffman树
        HuffmanTree<CharInfo> tree(_infos, 256); //参数：数组，256个，无效值（出现0次）
        Node *root = tree.GetRoot();
        Node *cur = root;
        unsigned int n = root->_weight._count; //所有叶子节点的和（源文件字符的个数）
        char ch = fgetc(fout);                 //从fout(压缩文件)读字符
        int readCount = 0;

        while (ch != EOF || n > 0)
        {
            for (size_t i = 0; i < 8; ++i)
            {
                if ((ch & (1 << i)) == 0)
                    cur = cur->_pLeft; // 往左边找
                else
                    cur = cur->_pRight; // 往右边找
                if (cur->_pLeft == NULL && cur->_pRight == NULL)
                {
                    // cout << cur->_weight._ch;
                    //找到对应的字符，写入解压缩文件
                    fputc(cur->_weight._ch, fin);
                    cur = root;
                    if (--n == 0)
                        break;
                }
            }
            ch = fgetc(fout);
        }
        printf("解压缩完成\n");
        fclose(fin);
        fclose(fout);
    }

对上面的压缩文件进行解压缩：
在这里插入图片描述

在这里插入图片描述

全部代码

HuffmanTree.hpp：

#pragma once
#include <iostream>
#include <queue>
#include <vector>
using namespace std;

template <class W>
struct HuffmanTreeNode
{
    HuffmanTreeNode(const W &weight)
        : _pLeft(NULL), _pRight(NULL), _pParent(NULL), _weight(weight)
    {
    }
    HuffmanTreeNode<W> *_pLeft;
    HuffmanTreeNode<W> *_pRight;
    HuffmanTreeNode<W> *_pParent;
    //权值
    W _weight;
};

template <class W>
class HuffmanTree
{
    typedef HuffmanTreeNode<W> *PNode;

private:
    //根节点
    PNode _pRoot;

public:
    HuffmanTree()
        : _pRoot(NULL)
    {
    }
    HuffmanTree(W *array, size_t size)
    {
        _CreateHuffmantree(array, size);
    }
    void _Destroy(PNode &pRoot)
    {
        //后序
        if (pRoot)
        {
            _Destroy(pRoot->_pLeft);
            _Destroy(pRoot->_pRight);
            delete pRoot;
            pRoot = NULL;
        }
    }
    ~HuffmanTree()
    {
        _Destroy(_pRoot);
    }
    PNode GetRoot()
    {
        return _pRoot;
    }

private:
    //构建哈夫曼树
    void _CreateHuffmantree(W *array, size_t size)
    {

        //小根堆
        struct PtrNodeCompare
        {
            bool operator()(PNode n1, PNode n2) //重载“（）”
            {
                return n1->_weight > n2->_weight;
            }
        };
        priority_queue<PNode, vector<PNode>, PtrNodeCompare> hp;

        for (size_t i = 0; i < size; ++i)
        {
            hp.push(new HuffmanTreeNode<W>(array[i])); // 数据入堆
        }
        //空堆
        if (hp.empty())
        {
            _pRoot = NULL;
        }

        while (hp.size() > 1) // 取两个出来只回收1个
        {
            PNode pLeft = hp.top();
            hp.pop();
            PNode pRight = hp.top();
            hp.pop();
            PNode pParent = new HuffmanTreeNode<W>(pLeft->_weight + pRight->_weight); //左加右的权值，作为新节点
            pParent->_pLeft = pLeft;
            pLeft->_pParent = pParent;

            pParent->_pRight = pRight;
            pRight->_pParent = pParent;
            hp.push(pParent);
        }
        _pRoot = hp.top(); // 根节点在堆顶
    }
};

FileCompress.hpp（压缩和解压缩的头文件）

/*利用库中的优先级队列实现哈夫曼树，最后基于哈夫曼树最终实现文件压缩。
描述：
    1.统计文件中字符出现的次数，利用优先级队列构建Haffman树，生成Huffman编码。
    构造过程可以使用priority_queue辅助，每次pq.top()都可以取出权值（频数）最小的节点。每取出两个最小权值的节点，就new出一个新的节点，左右孩子分别指向它们。然后把这个新节点push进优先队列。
    2.压缩：利用Haffman编码对文件进行压缩，即在压缩文件中按顺序存入每个字符的Haffman编码。
    3.将文件中出现的字符以及它们出现的次数写入配置文件中，以便后续压缩使用。
    4.解压缩：利用配置文件重构Haffman树，对文件进行减压缩。
*/
#pragma once
#include "HuffmanTree.hpp"
#include <iostream>
#include <vector>
#include <assert.h>

using namespace std;

//获取文件的大小
unsigned long getFileSize(const char *path)
{
    unsigned long filesize = -1;
    FILE *fp;
    fp = fopen(path, "r");
    if (fp == NULL)
    {
        return filesize;
    }
    fseek(fp, 0L, SEEK_END);
    filesize = ftell(fp);
    fclose(fp);
    return filesize;
}

struct CharInfo
{
    unsigned char _ch;   //字符 a
    unsigned int _count; //字符次数 45
    string _code;        //对应的哈夫曼编码 011100

    bool operator!=(const CharInfo &info)
    {
        return _count != info._count;
    }
    CharInfo operator+(const CharInfo &info)
    {
        CharInfo ret;
        ret._count = _count + info._count;
        return ret;
    }
    bool operator>(const CharInfo &info)
    {
        return _count > info._count;
    }
};

class FileCompressAndUnCompress
{
    typedef HuffmanTreeNode<CharInfo> Node;
    struct TmpInfo
    {
        unsigned char _ch;   //字符
        unsigned int _count; //次数
    };

protected:
    CharInfo _infos[256];

public:
    //构造函数
    FileCompressAndUnCompress()
    {
        for (size_t i = 0; i < 256; ++i)
        {
            _infos[i]._ch = i;
            _infos[i]._count = 0;
        }
    }
    //获取哈夫曼编码
    void GenerateHuffmanCode(Node *root, string code)
    {
        if (root == NULL)
            return;
        //前序遍历生成编码
        if (root->_pLeft == NULL && root->_pRight == NULL)
        {
            _infos[(unsigned char)root->_weight._ch]._code = code; // 码值
            return;
        }
        GenerateHuffmanCode(root->_pLeft, code + '0');
        GenerateHuffmanCode(root->_pRight, code + '1');
    }
    void Compress(const char *file) // file:要压缩的文件
    {
        //获取文件的大小
        unsigned long fileSize = getFileSize(file);
        // 1.统计字符出现的次数
        FILE *fout = fopen(file, "rb");
        assert(fout);
        char ch = fgetc(fout);
        while (feof(fout) == 0) //文件结束，则返回值为1，否则为0
        {
            _infos[(unsigned char)ch]._count++; // 计算对应字符出现的频率
            ch = fgetc(fout);
        }
        // 2.生成Huffmantree 及code
        // 2.1 生成Huffmantree， 构建哈夫曼树
        HuffmanTree<CharInfo> tree(_infos, 256);

        string compressfile = file;
        compressfile += ".huffman";
        FILE *fin = fopen(compressfile.c_str(), "wb"); //打开压缩文件
        assert(fin);

        string code;
        //  2.2 根据哈夫曼树每个字符对应的code
        GenerateHuffmanCode(tree.GetRoot(), code);
        // 2.3 将码表写入压缩文件中
        int writeNum = 0;
        int objSize = sizeof(TmpInfo);
        for (size_t i = 0; i < 256; ++i)
        {
            if (_infos[i]._count > 0)
            {
                TmpInfo info;
                info._ch = _infos[i]._ch;
                info._count = _infos[i]._count;
                printf("codec ch:%u, cout:%u\n", (unsigned char)info._ch, info._count);
                fwrite(&info, objSize, 1, fin);
                writeNum++;
            }
        }
        //把info._count = -1写进去作为数据字典的结束标志位
        TmpInfo info;
        info._count = -1;
        printf("code objSize:%d\n", objSize);
        fwrite(&info, objSize, 1, fin);

        // 3.开始压缩文件
        int writeCount = 0;
        int readCount = 0;
        //文件指针回到开头
        fseek(fout, 0, SEEK_SET); //文件指针、偏移量、参照位置
        ch = fgetc(fout);
        readCount++;
        unsigned char value = 0;
        size_t pos = 0;
        while (feof(fout) == 0) // 一个个字符读取
        {
            // 读取数据，查找对应编码
            string &code = _infos[(unsigned char)ch]._code; // 查找对应的编码
            printf("code[%d]:%u:%s\n", readCount, (unsigned char)ch, code.c_str());
            // 根据对应的编码，修改对应的value值
            for (size_t i = 0; i < code.size(); ++i)
            {
                if (code[i] == '1')
                {
                    value |= (1 << pos);
                }
                else if (code[i] != '0')
                {
                    assert(false);
                    printf("assert(false); ??????????");
                }
                ++pos;
                if (pos == 8)
                {
                    writeCount++;
                    // 够8个bit就存储一次，不用担心code是否全部读完
                    fputc(value, fin);
                    value = 0;
                    pos = 0;
                }
            }
            readCount++;
            ch = fgetc(fout);
        }
        //最后可能不够8bit，也存储进去
        if (pos > 0)
        {
            writeCount++;
            fputc(value, fin); //写入压缩文件（fin）
        }
        printf("码表的大小:%d字节\n", objSize * (writeNum + 1));
        printf("压缩后的文件大小:%d字节\n", writeCount);
        unsigned long totalSize = writeCount + objSize * (writeNum + 1);
        printf("压缩后总文件大小:%lu字节, 原始文件大小:%lu字节, 压缩率:%0.2f\n",
               totalSize, fileSize, (float)(totalSize * 1.0 / fileSize));
        fclose(fout);
        fclose(fin);
    }
    void Uncompress(const char *file) //要解压缩的文件
    {
        string uncompressfile = file;
        //找到最后一个'.'
        size_t pos = uncompressfile.rfind('.');
        assert(pos != string::npos);
        //删除掉".huffman"后缀
        uncompressfile.erase(pos);
        // uncompressfile += ".unhuffman";
        FILE *fin = fopen(uncompressfile.c_str(), "wb"); //打开解压缩文件
        assert(fin);
        FILE *fout = fopen(file, "rb"); //打开压缩文件
        assert(fout);

        // 读入码表
        TmpInfo info;
        int cycleNum = 1;
        int objSize = sizeof(TmpInfo);
        fread(&info, objSize, 1, fout);

        while (info._count != -1) //-1为结束标志
        {
            _infos[(unsigned char)info._ch]._ch = info._ch;
            _infos[(unsigned char)info._ch]._count = info._count;

            fread(&info, objSize, 1, fout);
            cycleNum++;
        }

        //根据码表重建huaffman树
        HuffmanTree<CharInfo> tree(_infos, 256); //参数：数组，256个，无效值（出现0次）
        Node *root = tree.GetRoot();
        Node *cur = root;
        unsigned int n = root->_weight._count; //所有叶子节点的和（源文件字符的个数）
        char ch = fgetc(fout);                 //从fout(压缩文件)读字符
        int readCount = 0;

        while (ch != EOF || n > 0)
        {
            for (size_t i = 0; i < 8; ++i)
            {
                if ((ch & (1 << i)) == 0)
                    cur = cur->_pLeft; // 往左边找
                else
                    cur = cur->_pRight; // 往右边找
                if (cur->_pLeft == NULL && cur->_pRight == NULL)
                {
                    // cout << cur->_weight._ch;
                    //找到对应的字符，写入解压缩文件
                    fputc(cur->_weight._ch, fin);
                    cur = root;
                    if (--n == 0)
                        break;
                }
            }
            ch = fgetc(fout);
        }
        printf("解压缩完成\n");
        fclose(fin);
        fclose(fout);
    }
};

compress.cpp（压缩文件）

#include "FileCompress.hpp"
#include <iostream>
#include <vector>

#include <assert.h>

int main(int argc, char **argv)
{
    if (argc != 2)
    {
        std::cout << "usage: " << argv[0] << ": <input_file> " << std::endl;
        return 1;
    }

    FileCompressAndUnCompress fc;
    std::string fileName(argv[1]);
     fc.Compress(fileName.c_str()); // 压缩文件
    return 0;
}

uncompress.cpp（解压缩文件）

#include "FileCompress.hpp"
#include <iostream>
#include <vector>

#include <assert.h>

int main(int argc, char **argv)
{
    if (argc != 2)
    {
        std::cout << "usage: " << argv[0] << ": <input_file> " << std::endl;
        return 1;
    }

    FileCompressAndUnCompress fc;
    std::string fileName(argv[1]);

    fc.Uncompress(fileName.c_str()); // 解压缩文件

    return 0;
}