一、需求分析
- 统计一篇文章中各个字符出现的频率,按照统计所得的频率使用Huffman算法编码文章,输出编码之后的文件以及Huffman编码
- 再次解码之前的编码文件,判断编解码前后的文件是否一致,以判断算法的正确性
二、Huffman算法描述
- Huffman编码法被称为最优编码法,所获得的编码为无重复前缀码,编码的依据是被编码的字符在整个原文中出现的频率
- Huffman树的生成过程:
- 首先统计出各个字符出现的频率作为权值,然后每个字符对应创建一个节点,节点保存字符值、权值。
- 取出所有节点中权值最小的两个节点,合并成为一个新节点,该节点的权值为两个子节点权值的和,该节点的左右子节点分别为之前取出来的两个权值最小的节点。将该节点重新放入所有节点中。
- 重复上一步骤(取出、合并、放回),直到只剩下一个节点,此时所有节点构成一棵Huffman树,该节点即为树根
- 编码过程(Huffman树的遍历):
- DFS:可使用递归函数,默认向左为0,向右为1
- BFS:使用队列实现,默认向左为0,向右为1
- 解码过程:
- 根据Huffman树解码Huffman编码最为简单
- 从树根开始,根据Huffman编码逐位移动:0 向左,1 向右,到达的叶子节点便是字符所在节点;输出该字符后回到树根继续解码
三、数据表示
- 词频统计的过程使用
map<char, int>
保存「字符 - 出现次数」键值对类型的数据;各字符对应的编码使用
map<char, string>
保存
- 节点每次的排序使用优先队列(小顶堆)
priority_queue<Node, vector<Node>, greater<Node>>
四、问题与解决方法
- 读文件时使用
while(in.get() != EOF)
判断文件结束时会多读一个字符
原因:其实 EOF(end of file) 这个文件结束标志是在文件的最后一个字符之后的,当读入最后一个字符的时候其实文件并没有读到 EOF 标志,只有下一次读的时候才会读到 EOF,但是在读到 EOF 的时候如果未加处理还是会把 EOF 的值看做一个字符输出,故总会多输出一个字符
-
解决方法:使用
peek()
函数,该函数获取下一个字符的值但不移动文件指针,故可在读完最后一个字符时得知下一个字符为 EOF:
while (in.peek() != EOF) { char c = in.get(); cout << c; }
-
五、实验代码
#include <iostream>
#include <fstream>
#include <queue>
#include <map>
#include <utility>
#include <algorithm>
using namespace ::std;
/**
 * One node of the Huffman tree.
 *
 * A leaf (isChild == true) carries a symbol and its frequency; an internal
 * node carries the combined weight of its two children. The node does not
 * own the nodes it points to: copying a Node copies the pointers only.
 */
struct Node
{
    bool isChild = false;        // true for a leaf that represents a symbol
    char symbol = 0;             // the character; meaningful only for leaves
    std::string encode;          // bit string ('0'/'1' characters) from the root to this node
    int weight = 0;              // frequency of the symbol, or sum of the children's weights
    Node *parent = nullptr;
    Node *leftChild = nullptr;
    Node *rightChild = nullptr;

    /** Builds a leaf for `symbol` occurring `weight` times. */
    Node(char symbol, int weight)
        : isChild(true), symbol(symbol), weight(weight)
    {
    }

    /** Builds an internal node of the given weight above two existing subtrees. */
    Node(int weight, Node *leftChild, Node *rightChild)
        : weight(weight), leftChild(leftChild), rightChild(rightChild)
    {
    }

    // Copy and move operations are left to the compiler: a member-wise copy is
    // exactly what the former hand-written copy constructor did, and implicit
    // moves avoid copying `encode` when a container shuffles nodes.

    /**
     * Orders nodes by weight so that std::greater<Node> yields a min-heap.
     * Operands are taken by reference to avoid copying a Node (and its
     * string) on every comparison.
     */
    friend bool operator>(const Node &n1, const Node &n2)
    {
        return n1.weight > n2.weight;
    }
};
/**
 * Counts how many times each character occurs in `in`; the counts are the
 * weights later used to build the Huffman tree.
 *
 * @param in        stream to read; consumed until peek() reports EOF
 * @param letterMap receives one entry per distinct character, with existing
 *                  entries incremented rather than reset
 *
 * The resulting table is also printed to standard output.
 */
void letterFrequencyCount(std::ifstream &in, std::map<char, int> &letterMap)
{
    std::cout << "open file successed" << std::endl;

    // peek() lets us detect EOF before consuming it, so EOF is never counted.
    for (;;)
    {
        if (in.peek() == EOF)
            break;
        const char ch = in.get();
        // operator[] creates a zero count for a character seen for the first time.
        ++letterMap[ch];
    }

    for (const auto &entry : letterMap)
    {
        std::cout << '[' << '\'' << entry.first << '\'' << " - " << entry.second << ']' << std::endl;
    }
    std::cout << "Word frequency statistics successed" << std::endl;
}
/**
 * Builds a Huffman tree from a character-frequency table.
 *
 * Every entry of `letterMap` becomes a leaf; the two lightest nodes are then
 * merged repeatedly under a new internal node until a single root remains.
 * The first node taken from the queue becomes the left child.
 *
 * @param letterMap character -> occurrence count
 * @return the root of the tree, or nullptr when `letterMap` is empty. With
 *         exactly one entry the root is itself a leaf. Nodes are allocated
 *         with `new` and are not freed by this function.
 */
Node *getHuffmanTree(std::map<char, int> &letterMap)
{
    std::cout << "calculating huffman encode" << std::endl;

    // An empty table has no tree; calling top() on an empty queue is undefined.
    if (letterMap.empty())
        return nullptr;

    // Min-heap on weight. Storing pointers means each node is allocated once
    // and parent links can be set at merge time instead of being patched up
    // after copying nodes out of the queue.
    struct HeavierThan
    {
        bool operator()(const Node *lhs, const Node *rhs) const
        {
            return lhs->weight > rhs->weight;
        }
    };
    std::priority_queue<Node *, std::vector<Node *>, HeavierThan> pending;

    for (const auto &entry : letterMap)
    {
        pending.push(new Node(entry.first, entry.second));
    }

    while (pending.size() > 1)
    {
        Node *left = pending.top();
        pending.pop();
        Node *right = pending.top();
        pending.pop();

        Node *merged = new Node(left->weight + right->weight, left, right);
        left->parent = merged;
        right->parent = merged;
        pending.push(merged);
    }
    return pending.top();
}
/**
 * Assigns a bit string to every node of the tree and collects the codes of
 * the leaves.
 *
 * The tree is walked breadth-first: a left child gets its parent's code plus
 * '0', a right child the parent's code plus '1'. A tree that consists of a
 * single leaf gets the code "0", since an empty code could not be written.
 * The resulting table is also printed to standard output.
 *
 * @param huffmanTree  root of the tree; nullptr is accepted and adds nothing
 * @param letterKeyMap receives symbol -> code; existing keys are not overwritten
 * @param len          unused; kept so existing callers keep compiling
 */
void getHuffmanCode(Node *huffmanTree, std::map<char, std::string> &letterKeyMap, int len)
{
    (void)len;

    if (huffmanTree != nullptr)
    {
        std::queue<Node *> nodeQueue;
        nodeQueue.push(huffmanTree);
        while (!nodeQueue.empty())
        {
            Node *currentNode = nodeQueue.front();
            nodeQueue.pop();

            if (!currentNode->isChild)
            {
                currentNode->leftChild->encode = currentNode->encode + '0';
                currentNode->rightChild->encode = currentNode->encode + '1';
                nodeQueue.push(currentNode->leftChild);
                nodeQueue.push(currentNode->rightChild);
                continue;
            }

            // A leaf without a parent is the whole tree: give it a one-bit code.
            if (currentNode->parent == nullptr)
                currentNode->encode = "0";
            letterKeyMap.insert(std::make_pair(currentNode->symbol, currentNode->encode));
        }
    }

    for (const auto &entry : letterKeyMap)
    {
        std::cout << '[' << '\'' << entry.first << '\'' << " - " << entry.second << ']' << std::endl;
    }
}
/**
 * Encodes `in` character by character, writing each character's code to
 * `out` as a run of '0'/'1' characters.
 *
 * @param letterKeyMap symbol -> code table; it is only read
 * @param in           stream to encode; consumed until peek() reports EOF
 * @param out          stream that receives the concatenated codes
 *
 * A character with no entry in the table is skipped and reported on standard
 * error. The lookup uses find() so that an unknown character no longer adds
 * an empty entry to the table.
 */
void huffmanEncoder(std::map<char, std::string> &letterKeyMap, std::ifstream &in, std::ofstream &out)
{
    std::cout << "Starting encoding file......" << std::endl;
    while (in.peek() != EOF)
    {
        const char c = in.get();
        const std::map<char, std::string>::const_iterator code = letterKeyMap.find(c);
        if (code == letterKeyMap.end())
        {
            std::cerr << "warning: no Huffman code for byte "
                      << static_cast<int>(static_cast<unsigned char>(c))
                      << ", skipped" << std::endl;
            continue;
        }
        out << code->second;
    }
    std::cout << "Encoding file successed" << std::endl;
}
/**
 * Decodes a stream of '0'/'1' characters back into symbols by walking the
 * Huffman tree.
 *
 * @param huffmanTree root of the tree the input was encoded with
 * @param in          encoded stream; consumed until peek() reports EOF
 * @param out         stream that receives the decoded symbols
 *
 * '0' moves to the left child and any other character to the right child.
 * Reaching a leaf emits its symbol and restarts from the root. When the root
 * itself is a leaf, every input character emits that single symbol.
 */
void huffmanDecoder(Node *huffmanTree, std::ifstream &in, std::ofstream &out)
{
    std::cout << "Starting decoding file......" << std::endl;

    Node *cursor = huffmanTree;
    while (in.peek() != EOF)
    {
        const char bit = in.get();

        // Single-symbol tree: there is nowhere to descend to.
        const bool rootIsLeaf = (cursor == huffmanTree) && cursor->isChild;
        if (rootIsLeaf)
        {
            out << cursor->symbol;
        }
        else
        {
            cursor = (bit == '0') ? cursor->leftChild : cursor->rightChild;
            if (cursor->isChild)
            {
                out << cursor->symbol;
                cursor = huffmanTree; // start the next code word at the root
            }
        }
    }

    std::cout << "decoding file successed" << std::endl;
}
/**
 * Round-trips "letter.dat" through the Huffman coder.
 *
 * Steps: count character frequencies, build the tree and the code table,
 * write the encoded text to "letter-encode.dat", then decode that file into
 * "letter-decode.dat" so it can be compared with the input.
 *
 * @return 0 on success, 1 when one of the files cannot be opened
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    map<char, int> letterMap;
    map<char, string> letterKeyMap;

    ifstream in("letter.dat");
    if (!in)
    {
        cout << "open file failed" << endl;
        return 1; // report the failure instead of exiting with status 0
    }
    letterFrequencyCount(in, letterMap);

    // An empty input has no symbols and therefore no tree; the encoder and
    // decoder below then simply produce empty files.
    Node *huffmanTree = letterMap.empty() ? nullptr : getHuffmanTree(letterMap);
    if (huffmanTree != nullptr)
        getHuffmanCode(huffmanTree, letterKeyMap, static_cast<int>(letterMap.size()));

    // Counting consumed the stream: clear the EOF state and rewind. The
    // two-argument seekg states the offset and the origin explicitly.
    in.clear();
    in.seekg(0, ios::beg);

    ofstream encodeOut("letter-encode.dat");
    if (!encodeOut)
    {
        cout << "open encode-out file failed" << endl;
        return 1;
    }
    huffmanEncoder(letterKeyMap, in, encodeOut);
    encodeOut.flush();
    encodeOut.close();
    in.close();

    ifstream decodeIn("letter-encode.dat");
    ofstream decodeOut("letter-decode.dat");
    if (!decodeIn)
    {
        cout << "open decode-in file failed" << endl;
        return 1;
    }
    // This check previously re-tested decodeIn, so a failure to open the
    // output file went unnoticed.
    if (!decodeOut)
    {
        cout << "open decode-out file failed" << endl;
        return 1;
    }
    huffmanDecoder(huffmanTree, decodeIn, decodeOut);
    decodeIn.close();
    decodeOut.flush();
    decodeOut.close();
    return 0;
}