极简的低效压缩工具SimpleStupid

最新推荐文章于 2024-08-28 21:50:30 发布

GGN_2015

最新推荐文章于 2024-08-28 21:50:30 发布

阅读量433

点赞数 2

分类专栏：数据结构文章标签：哈夫曼树压缩工具

本文链接：https://blog.csdn.net/GGN_2015/article/details/116239742

版权

哈夫曼编码压缩算法 C++ 文件处理二进制数据

关键词由CSDN通过智能技术生成

数据结构专栏收录该内容

49 篇文章 1 订阅

订阅专栏

CinpleStupid

2021-05-12 实现了压缩工具的第二版，解决了压缩大文件时 STL 无法分配足够内存空间的问题。但是具体算法仍然十分低效，压缩一个 75MB 的文件，大概需要五分钟时间。

#include <cstdio>
#include <cstdlib>
#include <algorithm>
#include <map>
#include <queue>
#include <string>
#include <vector>
using namespace std;

/// Bit stream written to a file.
///
/// Bits are queued one at a time and written most-significant-bit first as
/// soon as eight of them are available. The object owns the FILE* it opens
/// and closes it exactly once, either through close() or in the destructor.
class BitFlowOutput {
    private:
        std::queue<int> bitQueue; /// bits waiting to be written, oldest first
        FILE* fpout;              /// output file; NULL while no file is open

        /// Not copyable: two copies would close the same FILE* twice.
        /// Declared but intentionally never defined.
        BitFlowOutput(const BitFlowOutput&);
        BitFlowOutput& operator=(const BitFlowOutput&);

        /// Discard every bit that has not been written yet.
        void clearQueue() {
            while(!bitQueue.empty()) {
                bitQueue.pop();
            }
        }
    public:
        BitFlowOutput() {
            fpout = NULL;
        }
        /// Closes the file if it is still open. Pending bits are NOT flushed;
        /// call finish() first when they matter.
        ~BitFlowOutput() {
            close();
        }
        /// Open fileName for writing (truncating it). If another file was
        /// open, it is closed and its pending bits are discarded.
        /// Terminates the program with exit(-1) when the file cannot be opened.
        void setFile(const char* fileName) {
            if(fpout != NULL) {
                close();
                clearQueue();
            }
            fpout = fopen(fileName, "wb+");
            if(fpout == NULL) {
                printf("CinpleStupid: file %s can not write.\n", fileName);
                exit(-1);
            }
        }
        /// Write one byte if a file is open and at least eight bits are queued.
        /// @return true when a byte was written, false otherwise.
        bool byteOutput() {
            if(fpout == NULL || bitQueue.size() < 8) {
                return false;
            }
            unsigned char tmp = 0; /// unsigned: shifting must not touch a sign bit
            for(int i = 1; i <= 8; i ++) {
                tmp = (unsigned char)((tmp << 1) | bitQueue.front());
                bitQueue.pop();
            }
            fputc(tmp, fpout);
            return true;
        }
        /// Write as many complete bytes as the queue currently holds.
        void stringOutput() {
            while(byteOutput());
        }
        /// Append the bits described by s; s must consist of '0'/'1' only.
        void push(const std::string& s) {
            for(std::string::size_type i = 0; i < s.length(); i ++) {
                bitQueue.push(s[i] - '0');
            }
            stringOutput();
        }
        /// Append the low eight bits of ctmp, most significant bit first.
        void pushChar(int ctmp) {
            for(int i = 7; i >= 0; i --) {
                bitQueue.push((ctmp >> i) & 1);
            }
            stringOutput();
        }
        /// Flush the stream: a trailing partial byte is padded with zero bits.
        void finish() {
            stringOutput();
            if(!bitQueue.empty()) {
                while(bitQueue.size() < 8) {
                    bitQueue.push(0);
                }
                byteOutput();
            }
        }
        /// Move the file position to offset pos from the start of the file.
        void fpfseek(int pos) {
            if(fpout != NULL) {
                fseek(fpout, pos, SEEK_SET);
            }
        }
        /// Current offset from the start of the file, or -1 if no file is open.
        int fpgetpos() {
            if(fpout == NULL) {
                return -1;
            }
            return ftell(fpout);
        }
        /// Write val in the machine's native int representation at the
        /// current file position (bypasses the bit queue).
        void outputInt(int val) {
            if(fpout != NULL) {
                fwrite(&val, sizeof(int), 1, fpout);
            }
        }
        /// Close the file. Safe to call repeatedly or when nothing is open.
        void close() {
            if(fpout != NULL) {
                fclose(fpout);
                fpout = NULL;
            }
        }
} bitflow;

/// MAX_SIZE is the number of distinct byte values; MAX_NODE bounds the number
/// of Huffman tree nodes (node ids start at 1, id 0 means "no child").
const int MAX_SIZE = 256, MAX_NODE = 2*MAX_SIZE + 1;

/// Node weights live at file scope so that the heap comparator, which has no
/// access to a Huffman instance, can read them.
static int StaticHuffmanCountForEveryNode[MAX_NODE];

/// Huffman coder over byte values.
///
/// Usage: call addchar() for every input byte, then build() once, then query
/// codes with GetDic()/operator[] or dump all of them with outputDic().
/// (Original author's note: do not place instances on the stack.)
class Huffman {
    private:
        int* count;                 /// weight of every node (points at the file-scope array)
        int leftSon [MAX_NODE];     /// left child id, 0 for a leaf
        int rightSon[MAX_NODE];     /// right child id, 0 for a leaf
        char message[MAX_NODE];     /// byte value stored at a leaf
        int nodeCnt;                /// number of nodes allocated so far
        std::map<char, int> charCnt;         /// occurrences of every byte seen
        std::map<char, std::string> charDic; /// byte -> code as a '0'/'1' string

        /// Walk the tree below root and record the code of every leaf.
        /// str is the path from the tree root to this node.
        void dfs(int root, std::string str) {
            if(leftSon[root] == 0) { /// root is a leaf
                if(str.empty()) {
                    /// The tree consists of this single leaf (only one distinct
                    /// byte in the input). An empty code would encode nothing,
                    /// so give the symbol a one-bit code instead.
                    str = "0";
                }
                charDic[message[root]] = str;
                printf("root = %d, code for %d (cnt = %d) is %s\n", root, message[root], count[root], str.c_str());
            }else {
                dfs(leftSon[root] , str + "0");
                dfs(rightSon[root], str + "1");
            }
        }

        /// Orders node ids by descending weight so the priority queue
        /// behaves as a min-heap.
        struct cmp {
            bool operator()(int nodeA, int nodeB) {
                return StaticHuffmanCountForEveryNode[nodeA] > StaticHuffmanCountForEveryNode[nodeB];
            }
        };

    public:
        Huffman() {
            count = StaticHuffmanCountForEveryNode;
            nodeCnt = 0;
            charCnt.clear();
            for(int i = 0; i < MAX_NODE; i ++) {
                count   [i] = 0;
                leftSon [i] = 0;
                rightSon[i] = 0;
            }
        }
        /// Account for one more occurrence of ctmp in the input.
        void addchar(char ctmp) {
            charCnt[ctmp] ++; /// operator[] value-initialises a missing entry to 0
        }

        /// Build the Huffman tree from the collected counts and fill the
        /// code dictionary. With no input bytes the dictionary stays empty.
        void build() {
            if(charCnt.empty()) {
                return; /// nothing was counted: there is no tree to build
            }
            typedef std::map<char, int>::iterator ITR;
            std::priority_queue<int, std::vector<int>, cmp> pq;
            for(ITR itr = charCnt.begin(); itr != charCnt.end(); ++ itr) {
                int id = ++ nodeCnt;
                count[id] = itr -> second;
                leftSon[id] = 0;
                rightSon[id] = 0;
                message[id] = itr -> first;
                pq.push(id);
            }
            /// Repeatedly merge the two lightest nodes under a new parent.
            while(pq.size() > 1) {
                int id = ++ nodeCnt;
                int lch = pq.top(); pq.pop();
                int rch = pq.top(); pq.pop();
                count[id] = count[lch] + count[rch];
                leftSon[id] = lch;
                rightSon[id] = rch;
                pq.push(id);
            }
            int root = pq.top(); pq.pop();
            dfs(root, "");
        }
        /// @return the '0'/'1' code of ctmp, or "" if ctmp never occurred.
        std::string GetDic(char ctmp) {
            std::map<char, std::string>::iterator itr = charDic.find(ctmp);
            if(itr != charDic.end()) {
                return itr -> second;
            }
            return "";
        }
        /// Same as GetDic(), available through bracket syntax.
        std::string operator[](char index) {
            return GetDic(index);
        }
        /// Write the dictionary in the old format: for each of the 256 byte
        /// values one length byte (code length in bits), followed by the code
        /// bits padded with zeros to a whole number of bytes.
        void outputDic(BitFlowOutput& bfpout) {
            for(int i = 0; i < 256; i ++) {
                int ctmp = i >= 128 ? i - 256: i;
                const std::string code = GetDic((char)ctmp);
                int len = code.length();
                bfpout.pushChar(len);
                bfpout.push(code);
                if(len % 8 != 0) {
                    bfpout.push(std::string(8 - len%8, '0'));
                }
            }
        }
} huffman;

/// CinpleStupid entry point: compresses the file named by argv[1] into
/// "<argv[1]>.hfm".
///
/// Output layout: the dictionary written by huffman.outputDic(), then the
/// number of code bits as a native int, then the code bits themselves.
/// Returns 0 on success and -1 when the input is missing or unreadable.
int main(int argc, char* argv[]) {
    if(argc == 1) {
        printf("CinpleStupid: no input file.\n");
        return -1;
    }
    FILE* fpin = fopen(argv[1], "rb");
    if(fpin == NULL) {
        printf("CinpleStupid: file %s can not open.\n", argv[1]);
        return -1;
    }

    /// Pass 1: count byte frequencies. fgetc's result is kept in an int so
    /// that EOF stays distinguishable from the byte 0xFF, and the loop also
    /// ends on a read error instead of spinning on feof().
    int byteRead;
    while((byteRead = fgetc(fpin)) != EOF) {
        huffman.addchar((char)byteRead);
    }
    if(ferror(fpin)) {
        printf("CinpleStupid: error while reading file %s.\n", argv[1]);
        fclose(fpin);
        return -1;
    }
    huffman.build();
    bitflow.setFile(((std::string)argv[1] + ".hfm").c_str());
    huffman.outputDic(bitflow);

    /// Reserve room for the bit count; it is only known after pass 2 and is
    /// patched in below. The placeholder is as wide as what outputInt() writes.
    int articalBegin = bitflow.fpgetpos();
    for(std::size_t i = 0; i < sizeof(int); i ++) {
        bitflow.pushChar(0);
    }

    /// Pass 2: emit the code of every input byte.
    /// NOTE: the format stores the bit count in an int, which limits the
    /// amount of compressed data that can be described.
    int fileBitCnt = 0;
    rewind(fpin);
    while((byteRead = fgetc(fpin)) != EOF) {
        const std::string code = huffman.GetDic((char)byteRead);
        bitflow.push(code);
        fileBitCnt += code.length();
    }
    bitflow.finish();
    bitflow.fpfseek(articalBegin);
    bitflow.outputInt(fileBitCnt);
    bitflow.close();
    fclose(fpin);
    return 0;
}

SimpleStupid

我写的一个极其智障的压缩工具，源代码如下：

/// 经过了几次失败的分词尝试 GGN 终于决定返璞归真了
/// Keep it Simple, Stupid.

#include <cstdio>
#include <cstring>
#include <queue>
#include <string>
#include <cstdlib>
#include <vector>
#include <algorithm>
using namespace std;

#define DEBUG

const int maxn = 256; /// number of distinct byte values

int cnt[maxn]; /// how many times each byte value occurs in the input

/// Count the occurrences of every byte value of fileName into cnt[].
///
/// One additional 0xFF is always counted after the real data: Output() in
/// this program encodes one extra 0xFF symbol at the end of the stream and the
/// ComplexSmart decompressor drops a 0xFF decoded as the very last symbol, so
/// that symbol must be guaranteed to have a code.
///
/// Terminates the program with exit(-1) if the file cannot be opened or read.
void Input(const char* fileName) {
    memset(cnt, 0x00, sizeof(cnt));
    FILE* fpin = fopen(fileName, "rb");
    if(fpin == NULL) { /// the file cannot be opened
        printf("SimpleStupid: can not open file %s\n\n", fileName);
        exit(-1);
    }
    /// Keep fgetc's result in an int: stored in an unsigned char it can never
    /// compare equal to EOF, and a read error would then loop forever.
    int byteRead;
    while((byteRead = fgetc(fpin)) != EOF) {
        cnt[byteRead] ++;
    }
    if(ferror(fpin)) {
        printf("SimpleStupid: error while reading file %s\n\n", fileName);
        fclose(fpin);
        exit(-1);
    }
    cnt[0xFF] ++; /// the end-of-stream marker described above
    fclose(fpin);
}

/// Heap entry used while the Huffman tree is being built.
struct node {
    int id;  /// id of the tree node this entry stands for
    int cnt; /// weight the heap is ordered by
};

/// Comparison functor for the priority queue: A sorts after B when A is the
/// heavier one, which puts the lightest entry on top (min-heap).
struct cmp {
    bool operator()(node A, node B) {
        return B.cnt < A.cnt;
    }
};

/// Work queue for building the Huffman tree.
std::priority_queue<node, std::vector<node>, cmp> pq;

int lch[2*maxn + 1], rch[2*maxn + 1], ncnt; /// Huffman tree as index-linked arrays; child id 0 means "none"
unsigned char msg[2 * maxn + 1];            /// byte value stored at each leaf
std::string dic[maxn];                      /// code of every byte value as a '0'/'1' string

/// Walk the Huffman tree below rt and store the code of every leaf in dic[].
/// @param rt  id of the subtree root
/// @param tmp path from the tree root to rt ('0' = left, '1' = right)
void dfs(int rt, std::string tmp="") {
    if(lch[rt] == 0) {
        /// No children: rt is a leaf. If the path is still empty the whole
        /// tree is this one leaf; an empty code would write no bits at all for
        /// the data, so such a symbol gets the one-bit code "0".
        dic[msg[rt]] = tmp.empty() ? std::string("0") : tmp;
    }else {
        dfs(lch[rt], tmp + "0");
        dfs(rch[rt], tmp + "1");
    }
}

/// Write a '0'/'1' string to fpout as a length field followed by packed bits.
///
/// @param msg   the bit string; it is padded with '0' to a multiple of eight
///              bits before being packed, most significant bit first
/// @param fpout destination file
/// @param mode  0: the length (in bits, before padding) is written as a single
///              byte; any other value: it is written as a native int
void OutputBinary(std::string msg, FILE* fpout, int mode = 1) {
    const int bitCount = msg.length();
    if(mode != 0) {
        fwrite(&bitCount, sizeof(int), 1, fpout);
    }else {
        fputc((unsigned char)bitCount, fpout);
    }

    /// Pad to a whole number of bytes.
    const std::string::size_type padding = (8 - msg.length() % 8) % 8;
    msg.append(padding, '0');

    /// Fold the characters into bytes, emitting one after every eighth bit.
    unsigned char packed = 0;
    for(std::string::size_type pos = 0; pos < msg.length(); ++ pos) {
        packed = packed * 2 + (msg[pos] - '0');
        if(pos % 8 == 7) {
            fputc(packed, fpout);
            packed = 0;
        }
    }
}

/// Write the compressed form of fileIn to fileOut.
///
/// Layout: 256 dictionary entries (one length byte plus padded code bits
/// each), then the encoded data as a native-int bit count plus padded bits.
/// The data always ends with one extra 0xFF symbol; the ComplexSmart
/// decompressor drops a 0xFF decoded as the very last symbol.
///
/// Terminates the program with exit(-1) if a file cannot be opened or read.
/// NOTE: the whole encoded bit string is held in memory (one char per bit).
void Output(const char* fileIn, const char* fileOut) {
    /// Check the input first so a bad input name does not create/truncate
    /// the output file.
    FILE* fpin  = fopen(fileIn, "rb");
    if(fpin == NULL) {
        printf("SimpleStupid: can not open input file %s\n\n", fileIn);
        exit(-1);
    }
    FILE* fpout = fopen(fileOut, "wb");
    if(fpout == NULL) {
        printf("SimpleStupid: can not open output file %s\n\n", fileOut);
        fclose(fpin);
        exit(-1);
    }
    for(int i = 0; i <= 255; i ++) { /// dictionary
        OutputBinary(dic[i], fpout, 0); /// length stored in one byte
    }
    std::string tmp = "";
    /// Keep fgetc's result in an int: stored in an unsigned char it can never
    /// compare equal to EOF, and a read error would then loop forever.
    int byteRead;
    while((byteRead = fgetc(fpin)) != EOF) {
        tmp += dic[byteRead];
    }
    if(ferror(fpin)) {
        printf("SimpleStupid: error while reading input file %s\n\n", fileIn);
        fclose(fpin);
        fclose(fpout);
        exit(-1);
    }
    tmp += dic[0xFF]; /// the end-of-stream marker described above
    OutputBinary(tmp, fpout, 1); /// bit count stored as a native int
    fclose(fpin);
    fclose(fpout);
}

void Build() {
    for(int ctmp = 0; ctmp <= 255; ctmp ++) {
        if(cnt[ctmp] != 0) {
            int id = ++ ncnt;
            lch[id] = rch[id] = 0; /// 叶子节点
            pq.push((node){id, cnt[ctmp]});
            msg[id] = ctmp; /// 记录叶子节点代表的字符
        }
    }
    if(pq.empty()) {
        /// 此时说明读入了一个空文件
        /// 而空文件我拒绝压缩，真棒
        exit(0);
    }
    while(pq.size() > 1) {
        node L = pq.top(); pq.pop();
        node R = pq.top(); pq.pop();
        int id = ++ ncnt; /// 通过合并产生一个新的结点
        lch[id] = L.id;
        rch[id] = R.id;
        pq.push((node){id, L.cnt + R.cnt});
    }
    int rt = pq.top().id; /// 得到哈夫曼树的根节点
    dfs(rt); /// 通过 dfs 确定每个叶子节点对应的编码
}

/// SimpleStupid entry point: compresses the file named by argv[1] into
/// "<argv[1]>.hfm". Returns -1 when no file name is given, 0 otherwise.
int main(int argc, char* argv[]) {
    if(argc == 1) {
        printf("SimpleStupid: no input file.\n\n");
        return -1;
    }

    const char* const inputPath = argv[1];
    printf("SimpleStupid: Compressing file: %s\n\n", inputPath);

    Input(inputPath); /// count how often every byte value occurs
    Build();          /// build the Huffman tree and the code dictionary

    /// Write the dictionary and the encoded content.
    const std::string outputPath = std::string(inputPath) + ".hfm";
    Output(inputPath, outputPath.c_str());
    return 0;
}

编译为名为 SimpleStupid 的可执行文件后，使用命令行传参确定压缩文件。

> SimpleStupid fileToCompress.txt

ComplexSmart

一个用来为 .hfm 文件解压的垃圾解压工具，源代码如下：

/// 既然压缩软件叫SimpleStupid了，那么解压软件就叫 ComplexSmart 吧

#include <cstdio>
#include <map>
#include <string>
#include <cstdlib>
#include <algorithm>
using namespace std;

#define DEBUG /// 调试模式

map<string, int> redic; /// decoding dictionary: '0'/'1' code string -> byte value (filled by InputDic)

/// Expand one byte into its eight-character '0'/'1' representation,
/// most significant bit first.
std::string Load(unsigned char ctmp) {
    std::string bits(8, '0');
    /// Fill from the right: the lowest bit of ctmp belongs in the last slot.
    for(int pos = 7; pos >= 0 && ctmp != 0; -- pos) {
        if(ctmp & 1) {
            bits[pos] = '1';
        }
        ctmp >>= 1;
    }
    return bits;
}

/// Read one dictionary entry from fpin: a length byte (code length in bits)
/// followed by the code bits padded to whole bytes.
///
/// @return the code as a '0'/'1' string of exactly the stored length
///         (empty when the stored length is 0).
/// Terminates the program with exit(-1) if the file ends inside the entry.
string GetDicWord(FILE* fpin) {
    /// Keep fgetc's result in an int so that EOF is not mistaken for the
    /// length 255 (or for a 0xFF payload byte below).
    const int len = fgetc(fpin);
    if(len == EOF) {
        printf("ComplexSmart: unexpected end of file while reading the dictionary.\n\n");
        exit(-1);
    }
    const int byteCnt = (len + 7) / 8; /// bytes needed to hold len bits
    string ans = "";
    for(int i = 0; i < byteCnt; i ++) {
        const int raw = fgetc(fpin);
        if(raw == EOF) {
            printf("ComplexSmart: unexpected end of file while reading the dictionary.\n\n");
            exit(-1);
        }
        ans += Load((unsigned char)raw); /// one byte -> eight '0'/'1' characters
    }
    ans.resize(len); /// cut off the padding bits
    return ans;
}

/// Read the 256 dictionary entries from fpin and register every non-empty
/// code in redic, mapped to the byte value it stands for.
void InputDic(FILE* fpin) {
    for(int value = 0; value < 256; ++ value) {
        const string code = GetDicWord(fpin);
        if(code == "") {
            continue; /// this byte value has no code
        }
        redic[code] = value;
    }
}

/// Read the encoded data from fpin into strTo as a '0'/'1' string.
///
/// The data is stored as a native int holding the number of bits, followed
/// by the bits padded to whole bytes. strTo receives exactly that many bits.
/// Terminates the program with exit(-1) if the bit count is missing or
/// negative, or if the file ends before all announced bytes were read.
void GetText(FILE* fpin, string& strTo) {
    strTo = "";
    int len = 0;
    /// fread returns the number of complete items read; anything but 1 means
    /// len was not filled in.
    if(fread(&len, sizeof(int), 1, fpin) != 1 || len < 0) {
        printf("ComplexSmart: can not read the length of the compressed data.\n\n");
        exit(-1);
    }
    /// Bytes holding len bits; written this way so that len + 7 cannot overflow.
    const int byteCnt = len / 8 + (len % 8 != 0 ? 1 : 0);
    /// No reserve() here on purpose: len comes from the file and may be
    /// bogus; the EOF check below stops a corrupt file early instead.
    for(int i = 0; i < byteCnt; i ++) {
        const int raw = fgetc(fpin);
        if(raw == EOF) {
            printf("ComplexSmart: unexpected end of file while reading the compressed data.\n\n");
            exit(-1);
        }
        strTo += Load((unsigned char)raw);
    }
    strTo.resize(len); /// cut off the padding bits
}

/// Decode the compressed data that follows the dictionary in fpin and write
/// the recovered bytes to fileOut.
///
/// Codes are matched greedily against redic, at most 256 bits per symbol.
/// A 0xFF decoded as the very last symbol is not written: the SimpleStupid
/// compressor stores its final fgetc() result in an unsigned char, so it
/// always encodes one extra 0xFF after the real data.
///
/// Terminates the program with exit(-1) if fileOut cannot be opened or if the
/// bit stream contains a sequence that matches no dictionary entry.
void Output(FILE* fpin, const char* fileOut) {
    string files = "";
    GetText(fpin, files); /// the whole encoded bit string
    FILE* fpout = fopen(fileOut, "wb");
    if(fpout == NULL) {
        printf("ComplexSmart: can not open %s\n\n", fileOut);
        exit(-1);
    }
    typedef map<string, int>::iterator ITR;
    const string::size_type total = files.length();
    string::size_type i = 0;
    while(i < total) {
        string stmp = "";
        ITR hit = redic.end();
        string::size_type j = 0;
        /// Extend the candidate one bit at a time, never reading past the
        /// end of the bit string.
        for(; j < 256 && i + j < total; j ++) {
            stmp += files[i + j];
            hit = redic.find(stmp);
            if(hit != redic.end()) {
                break; /// found a matching code
            }
        }
        if(hit == redic.end()) { /// no code matches the bits starting at i
            printf("ComplexSmart: Input file is not decompressiable. i = %d, j = %d\n\n", (int)i, (int)j);
            exit(-1);
        }
        i += j + 1;
        const bool isEndMarker = hit -> second == 255 && i >= total;
        if(!isEndMarker) {
            fputc((char)hit -> second, fpout); /// emit the decoded byte
        }
    }
    fclose(fpout);
}

/// ComplexSmart entry point: decompresses the .hfm file named by argv[1] into
/// "<argv[1]>.txt". Returns -1 when no file name is given or the file cannot
/// be opened, 0 otherwise.
int main(int argc, char* argv[]) {
    if(argc == 1) {
        printf("ComplexSmart: no input file.\n\n");
        return -1;
    }

    const char* const archivePath = argv[1];
    printf("ComplexSmart: Decompressing file %s\n\n", archivePath);

    FILE* fpin = fopen(archivePath, "rb");
    if(fpin == NULL) { /// the compressed file cannot be read
        printf("ComplexSmart: can not read file %s\n\n", archivePath);
        return -1;
    }

    InputDic(fpin); /// dictionary first ...
    const std::string restoredPath = std::string(archivePath) + ".txt";
    Output(fpin, restoredPath.c_str()); /// ... then the encoded content
    fclose(fpin);
    return 0;
}

编译为名为 ComplexSmart 的可执行文件后，采用命令行传参确定解压文件：

> ComplexSmart fileToDecompress.hfm

后记：心路历程

根据学校大作业的要求实现了一个极简而低效的压缩工具，我将其命名为 SimpleStupid，原理简单得可怜——利用哈夫曼树对 ASCII 码字符进行重新编码。显然，这种压缩方式对英文文档比较有效，压缩率一般能达到 $60\%$ 左右。对其他文件压缩比较低效，在对二进制的程序的压缩试验中，甚至出现了压缩率高达 $99\%$ 的情况（没超过 $100\%$ 已经是万幸了）。

命名为 SimpleStupid 一方面是因为我前期一直在思考分词策略，但是分词程序的 BUG 却迟迟调不出来，后来想到了老乔说的：“Keep it simple, stupid.”，决心从极简的压缩过程入手，先丢下分词策略不管，以后再说。另一方面也是因为 SimpleStupid 的实现策略我觉得确实挺 stupid 的，而且也没有按照学校大作业的要求开发一个美观的 UI，而是用命令行传参确定输入输出文件。