c++实现霍夫曼编码

井上泷奈

已于 2022-11-27 15:56:26 修改

阅读量2.1k

点赞数 2

文章标签： c++ 算法开发语言

于 2022-07-20 21:50:01 首次发布

本文链接：https://blog.csdn.net/cris_tian/article/details/125901459

版权

c++实现霍夫曼编码，计算信源的熵、平均码长、编码效率、冗余度与压缩比

考虑到指针可能对新手不太友好，这里用的是vector容器（用法类似数组，可以动态扩容）存储树形结构，大致原理就是n号结点的左右子树分别是2n和2n+1号结点。

HuffmanCode.h

#include <stack>
#include <cmath>
#include "myhead.h"

typedef struct {
    char ch;  // 存储的字符
    int weight;  // 权重
    int parent;  // 父节点
    int lChild;
    int rChild;
} Node;

// 选择最小的权重节点
void findMin(vector<Node> ht, int n, int* s1, int* s2) {
    int min = INT_MAX;
    for (int i = 1; i <= n; i++) {
        if (ht[i].parent == 0) { // 如果没有父节点
            min = i;
            break;
        }
    }

    for (int i = 1; i <= n; i++) {
        if (ht[i].parent == 0) {
            if (ht[i].weight < ht[min].weight) {
                min = i;  // 权重比记录更小，则更新
            }
        }
    }
    *s1 = min;
    //遍历全部结点
    for (int i = 1; i <= n; i++) {
        if (ht[i].parent == 0 && i != (*s1)) {
            min = i;
            break;
        }
    }
    for (int i = 1; i <= n; i++) {
        if (ht[i].parent == 0 && i != (*s1)) {
            if (ht[i].weight < ht[min].weight) {
                min = i;  // 更小则更新
            }
        }
    }
    *s2 = min;
}

// 生成哈夫曼树
void generateTree(vector<Node>& ht, int times[], int n) {
    int m = 2 * n - 1;  // 节点总数
    int s1;
    int s2;
    Node firstNode;  // 填充下标为0
    firstNode.ch = '0';
    firstNode.weight = 0;
    firstNode.lChild = 0;
    firstNode.rChild = 0;
    firstNode.parent = 0;
    ht.push_back(firstNode);
    for (int i = 1; i <= n; i++) {
        Node temp;
        temp.ch = char(96 + i);// a的ASCII码为97
        temp.weight = times[i];
        temp.lChild = 0;
        temp.rChild = 0;
        temp.parent = 0;
        ht.push_back(temp);
    }
    for (int i = n + 1; i <= m; i++) {
        Node temp;
        temp.ch = '0';
        temp.weight = 0;
        temp.lChild = 0;
        temp.rChild = 0;
        temp.parent = 0;
        ht.push_back(temp);
    }
    for (int i = n + 1; i <= m; i++) {
        findMin(ht, i - 1, &s1, &s2);  // 选择最小的两个权重节点
        ht[s1].parent = i;
        ht[s2].parent = i;
        ht[i].lChild = s1;
        ht[i].rChild = s2;
        ht[i].weight = ht[s1].weight + ht[s2].weight;  //更新权重
    }
}

// 从叶节点到根节点找霍夫曼编码
void generateCode(vector<Node>& ht, vector<string>& code, int n) {
    code.push_back("");  // 0下标的占位
    int p;
    int c;
    for (int i = 1; i <= n; i++) {
        stack<char> st;  // 因为是逆向存储，所以使用栈
        int start = n - 1;
        for (c = i, p = ht[i].parent; p != 0; c = p, p = ht[p].parent) {
            if (ht[p].lChild == c) {
                st.push('0');  // 是左孩子则压入0
            }
            else {
                st.push('1');
            }
        }
        int s_len = st.size();
        string hfm_code = "";
        // 从栈中取出
        for (int i = 0; i < s_len; i++) {
            hfm_code += st.top();
            st.pop();
        }
        code.push_back(hfm_code);
    }
    cout << "霍夫曼编码结果如下" << endl;
    for (int i = 1; i <= n; i++)
    {
        cout << ht[i].ch << "：" << code[i] << endl;
    }
}

// 计算信源的熵
double CalEntropy(int times[], int len, int n) {
    double temp = 0;
    for (int i = 1; i < len; i++) {
        double p = double(times[i]) / n;
        temp += p * log2(p);
    }
    temp = -temp;
    return temp;
}

// 计算霍夫曼编码的平均码长
double CalAvgLen(vector<string> code, int times[], int len, int n) {
    double temp = 0;
    for (int i = 1; i < len; i++) {
        int len = code[i].length();
        double p = double(times[i]) / n;
        temp += double(len) * p;
    }
    return temp;
}

myhead.h

#include <iostream>
#include <string>
#include <vector>
#include <time.h>
using namespace std;

// 随机生成字符串，这里只使用a~z，可以使用ASCII码这样就是全部字符，但是会增加遍历时间
string rand_str(int len) {
    string str;
    char c;
    for (int i = 0; i < len; i++) {
        c = 'a' + rand() % 26;
        str.push_back(c);
    }
    return str;
}

// 生成十条随机字符串
vector<string> getChar() {
    vector<string> ten_str;
    srand(unsigned int(time(NULL)));  // 转换为无符号整型避免安全提示
    for (int i = 0; i < 10; i++) {
        int length = 1000 + rand() % 9000;
        ten_str.push_back(rand_str(length));
        // cout << length << endl;
    }
    return ten_str;
}

// 统计每个字符出现的次数
void getNum(string str) {
    int nums[26] = { 0 };
    for (unsigned int i = 0; i < str.length(); i++) {
        nums[int(str[i]) - 97] ++;  // ASCII码中a为97
    }
    cout << "a~z的出现次数分别为：";
    for (int i = 0; i < 26; i++) {
        // cout << char(i + 97) << "的次数为：" << nums[i] << endl;
        cout << nums[i] << "     ";
    }
    cout << endl;
}

#include <iostream>
#include "HuffmanCode.h"

int main() {
    string a = rand_str(5);
    vector<string> ten_str = getChar();  // 生成十条长度在1000~10000的字符串
    for (int i = 0; i < 10; i++) {
        cout << "第" << i + 1 << "条随机字符串长度为" << ten_str[i].length() << endl;
        // getNum(ten_str[i]);

        vector<Node> ht;
        vector<string> code;
        // 因为字符串非常长，所以默认随机生成的时候全部26个字符都包含了,简化算法复杂程度,如果字符串很短需要遍历查询
        int n = 26;
        int times[27] = { 0 };  // 出现的次数
        for (unsigned int j = 0; j < ten_str[i].length(); j++) {
            times[int(ten_str[i][j]) - 96] ++;  // ASCII码中a为97
        }
        cout << "a~z的出现次数分别为：";
        for (int i = 1; i < 27; i++) {
            cout << times[i] << " ";
        }
        cout << endl;
        generateTree(ht, times, n);
        generateCode(ht, code, n);

        // 计算信源的熵
        double entropy = CalEntropy(times, 27, ten_str[i].length());
        cout << "信源的熵为：" << entropy << endl;

        //计算霍夫曼编码的平均码长
        double avg_len = CalAvgLen(code, times, 27, ten_str[i].length());
        cout << "平均码长为：" << avg_len << endl;

        cout << "霍夫曼编码效率为：" << entropy / avg_len << endl;
        cout << "冗余度为：" << (1 - entropy / avg_len) * 100 << "%" << endl;
        cout << "压缩比为：" << 5 / avg_len << endl;
        cout << endl;
    }
    return 0;
}