一、前言
之前做的一个数据结构作业,通过哈夫曼树实现对文本的压缩与解压,参考了很多网上的方法,因为时间有限,注释并没有写,但是代码缩进还是比较清晰。
另外,哈夫曼树我单独写了个头文件,是在之前写二叉树类的基础上改的,所以其实里面有很多用不到的代码,比如多种遍历方式,包括递归和非递归的实现,大家忽略即可。
代码用C++编写,因为用到模板,所以头文件和cpp实现没有分开,强行分开坑比较多,这里不多解释。
二、要点
1.哈夫曼树压缩的原理
简单地说就是对文本进行重新编码。因为不同的字符出现频率不同,为了节约空间,我们完全可以把经常出现的字符的编码设置得比较短,很少出现的设置得比较长,而不像很多编码那样,所有字符的编码长度都一样。所以这实际上需要先对文本中的字符频率进行统计,然后根据统计结果生成哈夫曼树。哈夫曼树特有的性质避免了前缀问题,也就是说不会因为某个编码是另一个编码的前缀而产生二义性。哈夫曼树的具体实现不再赘述。
2.编码方式
注意,编码时是二进制位级上的操作,具体方法参考代码,而不要存储一串01序列,这样反而会让文本更大。虽然看起来很滑稽,但很多人写这个程序时确实发生了这个错误。
3.字符长度
压缩过程中编码结果的总位数很有可能不是8的倍数,也就是说为了凑够最后一个字节需要填充一些二进制位,而解压时如果没有事先记录有效长度(或者像下面的代码那样在末尾写入一个结束符的编码),就会把填充的二进制位也翻译出来,造成错误。
三、作业要求
作业题目:哈夫曼编码与译码方法
哈夫曼编码是一种以哈夫曼树(最优二叉树,带权路径长度最小的二叉树)为基础的变长编码方式。其基本思想是:将使用次数多的代码转换成长度较短的编码,而使用次数少的采用较长的编码,并且保持编码的唯一可解性。在计算机信息处理中,它经常应用于数据压缩,是一种一致性编码法(又称"熵编码法"),用于数据的无损耗压缩。要求实现一个完整的哈夫曼编码与译码系统。
要求:
- 从文件中读入任意一篇英文文本文件,分别统计英文文本文件中各字符(包括标点符号和空格)的使用频率;
- 根据已统计的字符使用频率构造哈夫曼编码树,并给出每个字符的哈夫曼编码(字符集的哈夫曼编码表);
- 将文本文件利用哈夫曼树进行编码,存储成压缩文件(哈夫曼编码文件);
- 计算哈夫曼编码文件的压缩率;
- 将哈夫曼编码文件译码为文本文件,并与原文件进行比较。
以下可以不做,供思考,做了可以适当加分:
- 能否利用堆结构优化哈夫曼编码算法。
- 上述 1-5 的编码和译码是基于字符的压缩,考虑基于单词的压缩,完成上述工作,讨论并比较压缩效果。
- 上述 1-5 的编码是二进制的编码,可以采用 K 叉的哈夫曼树完成上述工作,实现“K 进制”的编码和译码,并与二进制的编码和译码进行比较。
注意:后面几个要求例如用堆优化、K叉编码并没有做,仅仅实现了基础的功能。
四、代码
main.cpp
#include <iostream>
#include <vector>
#include <fstream>
#include "HuffmanTree.h"
using namespace std;
// One row of the Huffman code table: a symbol and the bit string assigned to it.
struct Code
{
    // The character this entry stands for.
    char ch;
    // The code word, stored as one '0' or '1' character per bit, in emission order.
    std::vector<char> bits;
};
// Forward declarations of the helpers defined below main().
// Every function returning a pointer hands back an object allocated with
// `new`; the caller is responsible for releasing it.

// Reads the file at `path` character by character (whitespace is not skipped)
// into a heap-allocated vector.
vector<char> *ReadCharsFromFile(string path);
// Counts how often each distinct character occurs in `chars`; the result holds
// one (character, count) pair per distinct character, in order of first appearance.
vector<pair<char,int> > *CountCharsNumber(vector<char> chars);
// Returns the indices (first, second) of the two trees with the smallest
// weights (`data.second`) in `tree`.
pair<int,int> *SelectMin(vector<HuffmanTree<pair<char,int> > *> tree);
// Builds a Huffman tree from (character, weight) pairs by repeatedly merging
// the two trees chosen by SelectMin, and returns the root.
HuffmanTree<pair<char,int> > *CreateHuffmanTree(vector<pair<char,int> > chars_count);
// Produces the code table: one Code per leaf, '0' for a left edge and '1' for
// a right edge on the path from the root.
vector<Code> *Encoding(HuffmanTree<pair<char,int> > tree);
// Encodes the file at `path` into `path + ".huff"` using `codes` and returns
// the ratio of bytes written to read iterations counted.
float Compress(vector<Code> codes, string path);
// Decodes the file at `path` into `path + ".txt"` using `codes`; stops when the
// code of the '\0' entry is decoded.
void Extract(vector<Code> codes, string path);
// Returns true when both bit sequences have the same length and contents.
bool BitsEqual(vector<char> first ,vector<char> second);
int main()
{
string input_path = "input.txt";
vector<char> *chars = ReadCharsFromFile(input_path);
vector<pair<char,int> > *chars_count = CountCharsNumber(*chars);
pair<char,int> *end = new pair<char,int>('\0', 0);
chars_count->push_back(*end);
HuffmanTree<pair<char,int> > *huffmanTree = CreateHuffmanTree(*chars_count);
vector<Code> *codes = Encoding(*huffmanTree);
cout<<"Compress successfully."<<endl;
cout<<"Compression rate: "<<Compress(*codes, input_path) * 100<<'%'<<endl;
Extract(*codes, input_path + ".huff");
cout<<"Extract successfully."<<endl;
cout<<"The file is '"<<input_path + ".huff.txt'."<<endl;
return 0;
}
// Reads every character of the file at `path` (whitespace included) into a
// heap-allocated vector owned by the caller.
//
// Returns an empty vector when the file cannot be opened or is empty.
// The file is opened in text mode, the same mode Compress() uses to re-read it.
std::vector<char> *ReadCharsFromFile(std::string path)
{
    std::vector<char> *chars = new std::vector<char>();
    std::fstream input_file(path.c_str(), std::ios::in);
    if (!input_file)
        return chars;

    // Test the read itself rather than eof(): an eof()-controlled loop runs
    // one extra iteration and would append the last character twice.
    char symbol;
    while (input_file.get(symbol))
        chars->push_back(symbol);

    // The stream closes itself when it goes out of scope.
    return chars;
}
// Counts how many times each distinct character occurs in `chars`.
//
// Returns a heap-allocated vector (owned by the caller) with one
// (character, count) pair per distinct character, ordered by first appearance.
std::vector<std::pair<char,int> > *CountCharsNumber(std::vector<char> chars)
{
    std::vector<std::pair<char,int> > *chars_count = new std::vector<std::pair<char,int> >();

    // slot[c] is the index of character c inside *chars_count, or -1 if c has
    // not been seen yet. This replaces a linear search per input character.
    const int symbol_count = static_cast<unsigned char>(~0u) + 1;
    std::vector<int> slot(symbol_count, -1);

    const int size = static_cast<int>(chars.size());
    for (int i = 0; i < size; i++)
    {
        const unsigned char key = static_cast<unsigned char>(chars[i]);
        if (slot[key] < 0)
        {
            slot[key] = static_cast<int>(chars_count->size());
            // Push a temporary; a `new`-ed pair would be copied and then leaked.
            chars_count->push_back(std::pair<char,int>(chars[i], 1));
        }
        else
            (*chars_count)[slot[key]].second++;
    }
    return chars_count;
}
// Picks the two lightest trees in `tree`, comparing the weight `data.second`.
//
// Returns a heap-allocated pair (owned by the caller): `first` is the index of
// the lightest tree, `second` the index of the lightest among the rest.
// On equal weights the later index wins in both passes.
// The caller is expected to pass at least two trees.
std::pair<int,int> *SelectMin(std::vector<HuffmanTree<std::pair<char,int> > *> tree)
{
    const int count = static_cast<int>(tree.size());

    // Pass 1: overall minimum (a later tree replaces an equal earlier one).
    int lowest = 0;
    for (int k = 0; k < count; ++k)
    {
        if (!(tree[k]->data.second > tree[lowest]->data.second))
            lowest = k;
    }

    // Pass 2: minimum among all other trees; start from a slot that is
    // guaranteed to differ from `lowest`.
    int runner_up = (lowest == 0) ? 1 : 0;
    for (int k = 0; k < count; ++k)
    {
        if (k == lowest)
            continue;
        if (tree[k]->data.second <= tree[runner_up]->data.second)
            runner_up = k;
    }

    return new std::pair<int,int>(lowest, runner_up);
}
// Builds a Huffman tree from (character, weight) pairs.
//
// Each pair becomes a leaf; the two lightest trees (as chosen by SelectMin)
// are merged under a new internal node carrying '\0' and the summed weight
// until one tree is left. All nodes are allocated with `new` and owned by the
// caller.
//
// Returns the root, or NULL when `chars_count` is empty. A single symbol is
// hung as the left child of a synthetic root so that it still receives a
// one-bit code instead of an empty one.
HuffmanTree<std::pair<char,int> > *CreateHuffmanTree(std::vector<std::pair<char,int> > chars_count)
{
    typedef HuffmanTree<std::pair<char,int> > Node;

    const int size = static_cast<int>(chars_count.size());
    if (size == 0)
        return NULL;

    std::vector<Node *> forest;
    for (int i = 0; i < size; i++)
        forest.push_back(new Node(chars_count[i]));

    if (size == 1)
    {
        Node *root = new Node(std::pair<char,int>('\0', forest[0]->data.second));
        root->lchild = forest[0];
        forest[0]->parent = root;
        return root;
    }

    Node *parent = NULL;
    while (forest.size() > 1)
    {
        std::pair<int,int> *min = SelectMin(forest);
        const int first = min->first;
        const int second = min->second;
        delete min;

        Node *left = forest[first];
        Node *right = forest[second];
        parent = new Node(std::pair<char,int>('\0', left->data.second + right->data.second));
        parent->lchild = left;
        parent->rchild = right;
        left->parent = parent;
        right->parent = parent;

        // Remove both merged trees; erase the higher index first so the lower
        // one stays valid. The relative order of the remaining trees is kept.
        const int high = (first > second) ? first : second;
        const int low = (first > second) ? second : first;
        forest.erase(forest.begin() + high);
        forest.erase(forest.begin() + low);
        forest.push_back(parent);
    }
    return parent;
}
// Derives the code table from a Huffman tree.
//
// Walks the tree level by level from the root, extending the bit path with
// '0' for a left edge and '1' for a right edge; every leaf yields one Code.
// Paths are built top-down, so the result does not depend on parent pointers
// or on the characters stored in internal nodes.
//
// Returns a heap-allocated table (owned by the caller), leaves in level order.
// A tree that consists of a single leaf gets the one-bit code "0".
std::vector<Code> *Encoding(HuffmanTree<std::pair<char,int> > tree)
{
    typedef HuffmanTree<std::pair<char,int> > Node;
    typedef std::vector<const Node *>::size_type Index;

    std::vector<Code> *codes = new std::vector<Code>();

    // Parallel work lists: nodes[i] is reached from the root via paths[i].
    // Appending while scanning by index gives breadth-first order.
    std::vector<const Node *> nodes;
    std::vector<std::vector<char> > paths;
    nodes.push_back(&tree);
    paths.push_back(std::vector<char>());

    for (Index i = 0; i < nodes.size(); ++i)
    {
        const Node *node = nodes[i];
        if (node->lchild == NULL && node->rchild == NULL)
        {
            Code code;
            code.ch = node->data.first;
            code.bits = paths[i];
            if (code.bits.empty())
                code.bits.push_back('0');   // lone root: avoid an empty code
            codes->push_back(code);
            continue;
        }
        if (node->lchild != NULL)
        {
            // Copy before pushing: push_back may reallocate `paths`.
            std::vector<char> left_path = paths[i];
            left_path.push_back('0');
            nodes.push_back(node->lchild);
            paths.push_back(left_path);
        }
        if (node->rchild != NULL)
        {
            std::vector<char> right_path = paths[i];
            right_path.push_back('1');
            nodes.push_back(node->rchild);
            paths.push_back(right_path);
        }
    }
    return codes;
}
// Encodes the file at `path` with the code table `codes` and writes the packed
// bits to `path + ".huff"`.
//
// After the last input character the code of the '\0' entry is appended as an
// end-of-stream marker; the final byte is padded with zero bits on the right.
// The input is opened in text mode, matching ReadCharsFromFile().
//
// Returns bytes written divided by characters read, or 0 when no character
// could be read (missing or empty input). A character without a table entry
// is reported on stderr and skipped.
float Compress(std::vector<Code> codes, std::string path)
{
    // Direct lookup from character to its bit string. Later table entries
    // override earlier ones, as in a scan that keeps the last match.
    const int symbol_count = static_cast<unsigned char>(~0u) + 1;
    std::vector<const std::vector<char> *> lookup(symbol_count);
    const int size_codes = static_cast<int>(codes.size());
    for (int i = 0; i < size_codes; i++)
        lookup[static_cast<unsigned char>(codes[i].ch)] = &codes[i].bits;

    std::fstream input_file(path.c_str(), std::ios::in);
    const std::string output_path = path + ".huff";
    std::ofstream output_file(output_path.c_str(), std::ios::out | std::ios::binary);

    int count_in = 0;
    int count_out = 0;
    unsigned char char_out = 0;   // bits collected for the next output byte
    int index = 0;                // number of bits currently held in char_out

    bool more = true;
    while (more)
    {
        char char_in;
        if (input_file.get(char_in))
            count_in++;
        else
        {
            // End of input (or unreadable file): emit the end marker once and
            // stop. Testing the read avoids spinning on a failed stream.
            char_in = '\0';
            more = false;
        }

        const std::vector<char> *code = lookup[static_cast<unsigned char>(char_in)];
        if (code == NULL)
        {
            std::cerr << "Compress: no code for character value "
                      << static_cast<int>(static_cast<unsigned char>(char_in)) << std::endl;
            continue;
        }

        const int size_code = static_cast<int>(code->size());
        for (int i = 0; i < size_code; i++)
        {
            char_out <<= 1;
            if ((*code)[i] == '1')
                char_out |= 1;
            if (++index == 8)
            {
                output_file.put(char_out);
                count_out++;
                index = 0;
                char_out = 0;
            }
        }
    }

    if (index != 0)
    {
        // Left-align the remaining bits inside the last byte.
        char_out <<= (8 - index);
        output_file.put(char_out);
        count_out++;
    }

    if (count_in == 0)
        return 0.0f;
    return static_cast<float>(count_out) / count_in;
}
// Decodes the packed file at `path` with the code table `codes` and writes the
// recovered characters to `path + ".txt"`.
//
// Bits are taken from each byte starting at the most significant one and
// collected until they equal the bit string of a table entry. Decoding stops
// when the '\0' entry is matched or when the input is exhausted. If the input
// cannot be opened, an empty output file is produced.
void Extract(std::vector<Code> codes, std::string path)
{
    std::fstream input_file(path.c_str(), std::ios::in | std::ios::binary);
    const std::string output_path = path + ".txt";
    std::ofstream output_file(output_path.c_str(), std::ios::out | std::ios::binary);

    const int size_codes = static_cast<int>(codes.size());
    std::vector<char> pending;   // bits read since the last decoded symbol
    char char_in;

    // Test the read itself: an eof()-controlled loop would decode the last
    // byte twice and never terminate on a stream that failed to open.
    while (input_file.get(char_in))
    {
        for (int bit = 7; bit >= 0; bit--)
        {
            pending.push_back((char_in & (1 << bit)) ? '1' : '0');

            for (int i = 0; i < size_codes; i++)
            {
                // vector's operator== compares length and contents in place.
                if (pending != codes[i].bits)
                    continue;
                if (codes[i].ch == '\0')
                    return;   // end marker: the rest of the byte is padding
                output_file.put(codes[i].ch);
                pending.clear();
                break;
            }
        }
    }
    // Both streams are closed by their destructors.
}
// Reports whether two bit sequences are identical: same length and the same
// character at every position.
bool BitsEqual(std::vector<char> first, std::vector<char> second)
{
    // std::vector's equality operator checks the sizes first and then
    // compares the elements pairwise.
    return first == second;
}
HuffmanTree.h
/**
****************************************************************
* @file HuffmanTree.h
* @author Swocky
* @version V1.0.0
* @date 11-27-2018
* @brief
An easy implementation of huffman tree.
****************************************************************
**/
#ifndef HUFFMANTREE_H
#define HUFFMANTREE_H
#include <queue>
#include <vector>
#include <stack>
// Order in which Travel() reports the nodes of a tree.
enum TRAVEL_MODE
{
    PRE,    // node, then left subtree, then right subtree
    IN,     // left subtree, then node, then right subtree
    POST,   // left subtree, then right subtree, then node
    LEVEL   // level by level from the root, left to right
};

// Strategy Travel() uses to walk the tree.
enum TRAVEL_METHOD
{
    LOOP,       // explicit stack or queue
    RECURSION   // recursive descent (not available for LEVEL)
};

// A binary tree node that doubles as a handle to the subtree below it.
//
// The node does not own its children: copying a node copies the three link
// pointers, and there is no destructor. Whoever allocates nodes is
// responsible for releasing them.
template<typename data_type>
class HuffmanTree
{
private:
    // True while the node carries no payload (default-constructed or emptied).
    bool isEmpty;

    // Fills this node and its descendants from a pre-order sequence in which
    // `null` marks an absent subtree. `*data` is advanced past every element
    // consumed.
    void CreateFromArrayByRecursion(data_type **data, data_type null)
    {
        if (**data == null)
        {
            ++(*data);
            isEmpty = true;
            return;
        }
        this->data = **data;
        isEmpty = false;
        ++(*data);
        lchild = BuildChild(data, null);
        rchild = BuildChild(data, null);
    }

    // Builds one child subtree from the pre-order sequence; returns NULL (and
    // frees the scratch node) when the sequence holds `null` at this position.
    HuffmanTree<data_type> *BuildChild(data_type **data, const data_type &null)
    {
        HuffmanTree<data_type> *child = new HuffmanTree<data_type>();
        child->CreateFromArrayByRecursion(data, null);
        if (child->IsEmpty())
        {
            delete child;
            return NULL;
        }
        child->parent = this;
        return child;
    }

    // Fills this node and its descendants from a level-order sequence of
    // `size` elements using 1-based heap indexing: element i is the left
    // (even i) or right (odd i) child of element i / 2. `null` marks an
    // absent node; elements whose parent slot is absent are skipped.
    void CreateFromArrayByLoop(data_type **data, data_type null, unsigned size)
    {
        if (size == 0 || **data == null)
        {
            isEmpty = true;
            return;
        }

        // content[i] is the node built for element i, or NULL if absent.
        std::vector<HuffmanTree<data_type> *> content(size + 1);
        content[1] = this;
        this->data = **data;
        isEmpty = false;
        ++(*data);

        for (unsigned i = 2; i <= size; ++i)
        {
            HuffmanTree<data_type> *owner = content[i / 2];
            if (!(**data == null) && owner != NULL)
            {
                HuffmanTree<data_type> *node = new HuffmanTree<data_type>(**data);
                node->parent = owner;
                if (i % 2 == 0)
                    owner->lchild = node;
                else
                    owner->rchild = node;
                content[i] = node;
            }
            ++(*data);
        }
    }

    // Appends copies of the nodes below (and including) this one to `result`
    // in PRE, IN or POST order, recursing directly into the children so that
    // no temporary result vectors are allocated.
    void TravelByRecursion(std::vector<HuffmanTree<data_type> > *result, TRAVEL_MODE travel_mode) const
    {
        if (IsEmpty())
            return;
        if (travel_mode == PRE)
            result->push_back(*this);
        if (lchild != NULL)
            lchild->TravelByRecursion(result, travel_mode);
        if (travel_mode == IN)
            result->push_back(*this);
        if (rchild != NULL)
            rchild->TravelByRecursion(result, travel_mode);
        if (travel_mode == POST)
            result->push_back(*this);
    }

    // Appends copies of the nodes below (and including) this one to `result`
    // in the requested order without recursion. The work containers hold
    // pointers to the real nodes, so nothing refers to a popped element.
    void TravelByLoop(std::vector<HuffmanTree<data_type> > *result, TRAVEL_MODE travel_mode) const
    {
        if (IsEmpty())
            return;

        const HuffmanTree<data_type> *node = this;
        switch (travel_mode)
        {
        case PRE:
        case IN:
        {
            std::stack<const HuffmanTree<data_type> *> pending;
            while (node != NULL || !pending.empty())
            {
                // Descend along left links, remembering the way back.
                while (node != NULL)
                {
                    if (travel_mode == PRE)
                        result->push_back(*node);
                    pending.push(node);
                    node = node->lchild;
                }
                node = pending.top();
                pending.pop();
                if (travel_mode == IN)
                    result->push_back(*node);
                node = node->rchild;
            }
            break;
        }
        case POST:
        {
            // The flag records whether the right subtree was already entered.
            typedef std::pair<const HuffmanTree<data_type> *, bool> Frame;
            std::stack<Frame> pending;
            while (node != NULL || !pending.empty())
            {
                while (node != NULL)
                {
                    pending.push(Frame(node, false));
                    node = node->lchild;
                }
                while (!pending.empty() && pending.top().second)
                {
                    result->push_back(*pending.top().first);
                    pending.pop();
                }
                if (!pending.empty())
                {
                    pending.top().second = true;
                    node = pending.top().first->rchild;
                }
            }
            break;
        }
        case LEVEL:
        {
            std::queue<const HuffmanTree<data_type> *> pending;
            pending.push(node);
            while (!pending.empty())
            {
                node = pending.front();
                pending.pop();
                result->push_back(*node);
                if (node->lchild != NULL)
                    pending.push(node->lchild);
                if (node->rchild != NULL)
                    pending.push(node->rchild);
            }
            break;
        }
        }
    }

public:
    // Payload of the node.
    data_type data;
    // Links to the neighbouring nodes; NULL when absent. Not owned.
    HuffmanTree<data_type> *lchild = NULL;
    HuffmanTree<data_type> *rchild = NULL;
    HuffmanTree<data_type> *parent = NULL;

    // Creates an empty node with a value-initialized payload.
    HuffmanTree() : isEmpty(true), data()
    {
    }

    // Creates a single node holding `data`.
    HuffmanTree(data_type data) : isEmpty(false), data(data)
    {
    }

    // Builds a tree from a pre-order sequence in which `null` marks an absent
    // subtree. Descendants are allocated with `new`.
    HuffmanTree(data_type *data, data_type null) : isEmpty(true), data()
    {
        CreateFromArrayByRecursion(&data, null);
    }

    // Builds a tree from a level-order sequence of `size` elements in which
    // `null` marks an absent node. Descendants are allocated with `new`.
    HuffmanTree(data_type *data, data_type null, unsigned size) : isEmpty(true), data()
    {
        CreateFromArrayByLoop(&data, null, size);
    }

    //~HuffmanTree(){}

    // Resets this node to the empty state. The links are only cleared; nodes
    // they pointed to are not freed.
    void MakeEmpty()
    {
        data = data_type();
        lchild = NULL;
        rchild = NULL;
        parent = NULL;
        isEmpty = true;
        return;
    }

    // Returns true while the node carries no payload.
    bool IsEmpty() const
    {
        return isEmpty;
    }

    // Returns a heap-allocated vector (owned by the caller) with copies of
    // the nodes of this tree in the requested order. Each copy keeps the link
    // pointers of the node it was made from. The result is empty for an empty
    // tree and for the unsupported combination LEVEL + RECURSION.
    std::vector<HuffmanTree<data_type> > *Travel(TRAVEL_MODE travel_mode = LEVEL, TRAVEL_METHOD travel_method = LOOP) const
    {
        std::vector<HuffmanTree<data_type> > *result = new std::vector<HuffmanTree<data_type> >;
        if (travel_mode == LEVEL && travel_method == RECURSION)
            return result;
        if (travel_method == LOOP)
            TravelByLoop(result, travel_mode);
        else if (travel_method == RECURSION)
            TravelByRecursion(result, travel_mode);
        return result;
    }

    // Returns true when the tree is a complete binary tree: in level order,
    // no node appears after the first missing child position. An empty tree
    // yields false.
    bool IsCompleteHuffmanTree() const
    {
        if (IsEmpty())
            return false;

        std::queue<const HuffmanTree<data_type> *> pending;
        pending.push(this);
        bool gap_seen = false;   // a missing child was met earlier in level order
        while (!pending.empty())
        {
            const HuffmanTree<data_type> *node = pending.front();
            pending.pop();

            if (node->lchild != NULL)
            {
                if (gap_seen)
                    return false;
                pending.push(node->lchild);
            }
            else
                gap_seen = true;

            if (node->rchild != NULL)
            {
                if (gap_seen)
                    return false;
                pending.push(node->rchild);
            }
            else
                gap_seen = true;
        }
        return true;
    }

    // Returns the largest number of nodes found on any single level, or 0 for
    // an empty tree.
    int GetWidth() const
    {
        if (IsEmpty())
            return 0;

        std::queue<const HuffmanTree<data_type> *> level;
        level.push(this);
        int width = 0;
        while (!level.empty())
        {
            // Everything queued right now belongs to one level.
            const int count = static_cast<int>(level.size());
            if (count > width)
                width = count;
            for (int k = 0; k < count; ++k)
            {
                const HuffmanTree<data_type> *node = level.front();
                level.pop();
                if (node->lchild != NULL)
                    level.push(node->lchild);
                if (node->rchild != NULL)
                    level.push(node->rchild);
            }
        }
        return width;
    }
};
#endif