算法 - Huffman编码

最新推荐文章于 2022-10-17 10:13:23 发布

sunshine2285

最新推荐文章于 2022-10-17 10:13:23 发布

阅读量1.1k

点赞数 1

分类专栏：算法与数据结构文章标签： Huffman编码 Huffman算法

本文链接：https://blog.csdn.net/sunshine2285/article/details/90806967

版权

算法与数据结构专栏收录该内容

24 篇文章 0 订阅

订阅专栏

一、需求分析

统计一篇文章中各个字符出现的频率，按照统计所得的频率使用Huffman算法编码文章，输出编码之后的文件以及Huffman编码
再次解码之前的编码文件，判断编解码前后的文件是否一致，以判断算法的正确性

二、Huffman算法描述

Huffman编码法被称为最优编码法，所获得的编码为无重复前缀码，编码的依据是被编码的字符在整个原文中出现的频率
Huffman树的生成过程：
1. 首先统计出各个字符出现的频率作为权值，然后每个字符对应创建一个节点，节点保存字符值、权值。
2. 取出所有节点中权值最小的两个节点，合并成为一个新节点，该节点的权值为两个子节点权值的和，其左右子节点分别为刚取出来的两个最小节点。将该新节点重新放入所有节点中。
3. 重复 2 步骤，直到剩下一个节点，则所有节点构成一个Huffman树，该节点为树根
编码过程（Huffman树的遍历）：
1. DFS：可使用递归函数，默认向左为0，向右为1
2. BFS：使用队列实现，默认向左为0，向右为1
解码过程：
1. 根据Huffman树解码Huffman编码最为简单
2. 从树根开始，按Huffman编码逐位移动：0向左，1向右；到达的叶子节点便是字符所在节点，输出该字符后回到树根继续解码

三、数据表示

词频统计的过程使用 map<char, int> 保存键值对类型的数据（字符 → 出现次数）；生成的编码表使用 map<char, string> 保存（字符 → Huffman编码）
节点每次的排序使用优先队列 priority_queue<Node, vector<Node>, greater<Node>>

四、问题与解决方法

读文件时使用 while(in.get() != EOF) 判断文件结束时会多读一个字符
- 原因：其实 EOF(end of file) 这个文件结束标志是在文件的最后一个字符之后的，当读入最后一个字符的时候其实文件并没有读到 EOF 标志，只有下一次读的时候才会读到 EOF，但是在读到 EOF 的时候如果未加处理还是会把 EOF 的值看做一个字符输出，故总会多输出一个字符
- 解决方法：使用 peek() 函数，该函数获取下个字符的值但不移动文件指针，故可在最后一个字符的时候知道下个字符为 EOF
```
  while (in.peek() != EOF)
{
    char c = in.get();
    cout << c;
}
```

五、实验代码

#include <iostream>
#include <fstream>
#include <queue>
#include <map>
#include <utility>
#include <algorithm>

using namespace ::std;

/**
 * A node of the Huffman tree.
 *
 * A node built from (symbol, weight) is a leaf and has isChild == true; a node
 * built from (weight, left, right) is an internal node and keeps
 * isChild == false. A Node does not own the nodes it points to: copying a
 * Node copies the three pointers, not the subtree.
 */
struct Node
{
    bool isChild = false; // true for a leaf, i.e. a node that stands for a symbol
    char symbol = 0;      // symbol of a leaf; stays 0 for internal nodes

    std::string encode = ""; // code string of this node; empty until one is assigned

    int weight = 0; // symbol frequency, or the sum of both children's weights
    Node *parent = nullptr;
    Node *leftChild = nullptr;
    Node *rightChild = nullptr;

    /** Builds a leaf for `symbol` whose frequency is `weight`. */
    Node(char symbol, int weight) : isChild(true), symbol(symbol), weight(weight)
    {
    }

    /** Builds an internal node of the given weight above two existing nodes. */
    Node(int weight, Node *leftChild, Node *rightChild)
        : weight(weight), leftChild(leftChild), rightChild(rightChild)
    {
    }

    // Copy and move are left to the compiler: the generated memberwise copy
    // does exactly what a hand-written field-by-field copy would do, and not
    // declaring it also keeps the implicit move operations available.

    /**
     * Orders nodes by weight only, so that std::greater<Node> turns a
     * std::priority_queue into a min-heap on weight. Operands are taken by
     * reference to avoid copying two nodes (and their strings) per comparison.
     */
    friend bool operator>(const Node &n1, const Node &n2)
    {
        return n1.weight > n2.weight;
    }
};

/**
 * Counts how often each character occurs in a stream; the counts are the
 * weights from which the Huffman tree is later built.
 *
 * Reads `in` from its current position until peek() reports EOF, adds every
 * character read to `letterMap` (counts already present are incremented, not
 * reset), then prints all entries of `letterMap` to stdout.
 *
 * @param in         stream to consume
 * @param letterMap  character -> occurrence count, updated in place
 */
void letterFrequencyCount(std::ifstream &in, std::map<char, int> &letterMap)
{
    std::cout << "open file successed" << std::endl;

    // Look ahead with peek() before every get(), so the EOF value itself is
    // never stored as a character.
    while (in.peek() != EOF)
    {
        const char ch = in.get();
        ++letterMap[ch]; // a missing entry is created as 0 and then bumped to 1
    }

    for (const auto &entry : letterMap)
    {
        std::cout << '[' << '\'' << entry.first << '\'' << " - " << entry.second << ']' << std::endl;
    }
    std::cout << "Word frequency statistics successed" << std::endl;
}


/**
 * Builds a Huffman tree from a character-frequency table.
 *
 * One leaf is created per entry of `letterMap`. The two lightest nodes are
 * then repeatedly removed from a min-heap and joined under a new internal
 * node whose weight is their sum (the first node removed becomes the left
 * child), until a single node is left.
 *
 * @param letterMap  character -> frequency; only read
 * @return the root of the tree, or nullptr when `letterMap` is empty. When
 *         the table has exactly one entry the root is that entry's leaf.
 *         Every node is allocated with `new` and nothing here frees it, so
 *         releasing the tree is up to the caller.
 */
Node *getHuffmanTree(std::map<char, int> &letterMap)
{
    std::cout << "calculating huffman encode" << std::endl;

    // An empty table yields no nodes; calling top() on the empty queue below
    // would be undefined behaviour, so report "no tree" instead.
    if (letterMap.empty())
        return nullptr;

    // Comparator that makes top() the node with the smallest weight.
    struct HeavierThan
    {
        bool operator()(const Node *lhs, const Node *rhs) const
        {
            return lhs->weight > rhs->weight;
        }
    };

    // The queue holds pointers, so nodes are allocated once and never copied;
    // parent links can therefore be set at the moment two nodes are joined.
    std::priority_queue<Node *, std::vector<Node *>, HeavierThan> pending;

    for (const auto &entry : letterMap)
    {
        pending.push(new Node(entry.first, entry.second));
    }

    // Build the Huffman tree with the priority queue.
    while (pending.size() > 1)
    {
        Node *leftChild = pending.top();
        pending.pop();
        Node *rightChild = pending.top();
        pending.pop();

        Node *merged = new Node(leftChild->weight + rightChild->weight, leftChild, rightChild);
        leftChild->parent = merged;
        rightChild->parent = merged;
        pending.push(merged);
    }
    return pending.top();
}

/**
 * Assigns a code string to every node of a Huffman tree and collects the
 * codes of the leaves.
 *
 * The tree is walked breadth-first. Each internal node hands its own code
 * plus '0' to its left child and plus '1' to its right child. A leaf that has
 * no parent (a tree consisting of one leaf) gets the code "0", since its path
 * from the root would otherwise be empty. Afterwards every entry of
 * `letterKeyMap` is printed to stdout.
 *
 * @param huffmanTree   root of the tree; nullptr is accepted and adds nothing
 * @param letterKeyMap  symbol -> code; leaf codes are inserted, and an entry
 *                      that already exists for a symbol is left unchanged
 * @param len           not used; kept so existing callers keep compiling
 */
void getHuffmanCode(Node *huffmanTree, std::map<char, std::string> &letterKeyMap, int len)
{
    (void)len;

    // Breadth-first traversal.
    std::queue<Node *> pending;
    if (huffmanTree != nullptr)
        pending.push(huffmanTree);

    while (!pending.empty())
    {
        Node *node = pending.front();
        pending.pop();

        if (node->isChild)
        {
            if (node->parent == nullptr)
                node->encode = "0";
            letterKeyMap.insert(std::make_pair(node->symbol, node->encode));
            continue;
        }

        node->leftChild->encode = node->encode + '0';
        node->rightChild->encode = node->encode + '1';
        pending.push(node->leftChild);
        pending.push(node->rightChild);
    }

    for (const auto &entry : letterKeyMap)
    {
        std::cout << '[' << '\'' << entry.first << '\'' << " - " << entry.second << ']' << std::endl;
    }
}

/**
 * Writes the code of every character of `in` to `out`.
 *
 * Characters are read from the current position of `in` until peek() reports
 * EOF; for each one the string stored in `letterKeyMap` is written to `out`
 * with no separator. Progress messages go to stdout.
 *
 * @param letterKeyMap  symbol -> code string. Looked up with operator[], so a
 *                      symbol missing from the table is added with an empty
 *                      code and contributes nothing to the output.
 * @param in            stream with the text to encode
 * @param out           stream receiving the concatenated codes
 */
void huffmanEncoder(std::map<char, std::string> &letterKeyMap, std::ifstream &in, std::ofstream &out)
{
    std::cout << "Starting encoding file......" << std::endl;

    // peek() first, so the EOF value is never looked up as a symbol.
    for (int next = in.peek(); next != EOF; next = in.peek())
    {
        const char symbol = in.get();
        out << letterKeyMap[symbol];
    }

    std::cout << "Encoding file successed" << std::endl;
}

/**
 * Decodes a stream of '0'/'1' characters with a Huffman tree.
 *
 * Starting at the root, each '0' moves to the left child and each '1' to the
 * right child; on reaching a leaf its symbol is written to `out` and decoding
 * restarts at the root. If the root itself is a leaf, its symbol is written
 * once per bit read. Bits left over at the end of the stream that do not
 * reach a leaf produce no output. Progress messages go to stdout.
 *
 * @param huffmanTree  root of the tree; with nullptr nothing is read or written
 * @param in           stream holding the encoded text
 * @param out          stream receiving the decoded symbols
 */
void huffmanDecoder(Node *huffmanTree, std::ifstream &in, std::ofstream &out)
{
    std::cout << "Starting decoding file......" << std::endl;

    if (huffmanTree != nullptr)
    {
        Node *currentNode = huffmanTree;
        while (in.peek() != EOF)
        {
            const char bit = in.get();

            // Only '0' and '1' are code bits. Anything else (for example a
            // trailing newline) is skipped rather than taken as a right turn.
            if (bit != '0' && bit != '1')
                continue;

            // A tree that is a single leaf has no branches to follow.
            if (huffmanTree->isChild)
            {
                out << huffmanTree->symbol;
                continue;
            }

            currentNode = (bit == '0') ? currentNode->leftChild : currentNode->rightChild;
            if (currentNode->isChild)
            {
                out << currentNode->symbol;
                currentNode = huffmanTree;
            }
        }
    }

    std::cout << "decoding file successed" << std::endl;
}

/**
 * Round-trips "letter.dat" through the Huffman coder.
 *
 * Steps: count the character frequencies of letter.dat, build the tree and
 * the code table, write the encoded text to "letter-encode.dat", then decode
 * that file into "letter-decode.dat".
 *
 * @return 0 on success, 1 when one of the files cannot be opened
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    std::map<char, int> letterMap;
    std::map<char, std::string> letterKeyMap;

    std::ifstream in("letter.dat");
    if (!in)
    {
        std::cout << "open file failed" << std::endl;
        return 1;
    }

    letterFrequencyCount(in, letterMap);

    // An empty input has no symbols, hence no tree and no code table; the
    // encode and decode passes below then simply produce empty files.
    Node *huffmanTree = letterMap.empty() ? nullptr : getHuffmanTree(letterMap);
    if (huffmanTree != nullptr)
        getHuffmanCode(huffmanTree, letterKeyMap, static_cast<int>(letterMap.size()));

    // Counting consumed the stream: clear the EOF state and rewind to the
    // first byte before reading the text a second time.
    in.clear();
    in.seekg(0, std::ios::beg);
    std::ofstream encodeOut("letter-encode.dat");
    if (!encodeOut)
    {
        std::cout << "open encode-out file failed" << std::endl;
        return 1;
    }
    huffmanEncoder(letterKeyMap, in, encodeOut);
    encodeOut.close(); // close() flushes, so the file is complete before it is reopened
    in.close();

    std::ifstream decodeIn("letter-encode.dat");
    std::ofstream decodeOut("letter-decode.dat");
    if (!decodeIn)
    {
        std::cout << "open decode-in file failed" << std::endl;
        return 1;
    }
    if (!decodeOut)
    {
        std::cout << "open decode-out file failed" << std::endl;
        return 1;
    }
    huffmanDecoder(huffmanTree, decodeIn, decodeOut);
    decodeIn.close();
    decodeOut.close();

    // The tree is not freed here; the process ends right after this point.
    return 0;
}

sunshine2285

关注

1
点赞
踩
11

收藏

觉得还不错? 一键收藏
0
评论
算法 - Huffman编码

一、需求分析统计一篇文章中各个字符出现的频率，按照统计所得的频率使用Huffman算法编码文章，输出编码之后的文件以及Huffman编码再次解码之前的编码文件，判断编解码前后的文件是否一致，以判断算法的正确性二、Huffman算法描述Huffman编码法被称为最优编码法，所获得的编码为无重复前缀码，编码的依据是被编码的字符在整个原文中出现的频率Huffman树的生成过程：首先统...
复制链接

扫一扫