文件压缩
1,统计文件中字符出现的个数,来构建Huffman树。
2,根据统计出的字符出现次数构建Huffman树(最优二叉树):权值小的结点在树的最下面,路径最长。
3,将文件中出现的字符以及它们出现的次数写入配置文件,以便后续的解压缩。
4,根据配置文件读取相关信息重建Huffman树,对压缩后的文件进行译码。
代码实现:
HuffmanTree.h
#pragma once
#include "Heap.h"
#include<assert.h>
// One node of a Huffman tree: a weight plus links to both children and to
// the parent. A freshly constructed node is unlinked (all three links NULL).
template<class T>
struct HuffmanTreeNode
{
    HuffmanTreeNode* _left;    // left child
    HuffmanTreeNode* _right;   // right child
    HuffmanTreeNode* _parent;  // parent node
    T _weight;                 // weight carried by this node

    // Creates an unlinked node carrying a copy of x as its weight.
    HuffmanTreeNode(const T& x)
        : _left(NULL)
        , _right(NULL)
        , _parent(NULL)
        , _weight(x)
    {
    }
};
// Huffman (optimal binary) tree built from an array of weights.
//
// The tree owns every node it allocates and frees them in its destructor.
// Copying is disabled because a copy would share, and then double-delete,
// the same nodes.
template<class T>
class HuffmanTree
{
    typedef HuffmanTreeNode<T> Node;
public:
    // Creates an empty tree (GetRootNode() returns NULL).
    HuffmanTree()
        : _root(NULL)
    {}

    // Frees every node of the tree.
    ~HuffmanTree()
    {
        Destory(_root);
    }

    // Orders node pointers by ascending weight, so that a Heap using this
    // comparator keeps the lightest node on top.
    // It stays a template so existing `NodeCompare<X>` spellings keep
    // compiling; the parameter itself is unused. It is named U because
    // re-declaring the enclosing parameter T is ill-formed in standard C++.
    template <class U>
    struct NodeCompare
    {
        bool operator()(Node* l, Node* r) const
        {
            return l->_weight < r->_weight;
        }
    };

public:
    // Builds the tree from a[0..size), skipping entries equal to `invalid`.
    // Any tree built earlier is released first. If no entry is valid the
    // tree stays empty.
    void CreatTree(const T* a, size_t size, const T& invalid)
    {
        assert(a);

        // Rebuilding must not leak the nodes of a previous tree.
        Destory(_root);
        _root = NULL;

        Heap<Node*, NodeCompare<T> > minHeap;
        for (size_t i = 0; i < size; ++i)
        {
            if (a[i] != invalid)
            {
                minHeap.Push(new Node(a[i]));
            }
        }

        // Nothing to build: Top() on an empty heap would be invalid.
        if (minHeap.Empty())
        {
            return;
        }

        // Repeatedly merge the two lightest nodes under a new parent whose
        // weight is their sum, until a single root remains.
        while (minHeap.Size() > 1)
        {
            Node* left = minHeap.Top();
            minHeap.Pop();
            Node* right = minHeap.Top();
            minHeap.Pop();

            Node* parent = new Node(left->_weight + right->_weight);
            parent->_left = left;
            parent->_right = right;
            left->_parent = parent;
            right->_parent = parent;
            minHeap.Push(parent);
        }
        _root = minHeap.Top();
    }

    // Returns the root node, or NULL when the tree is empty. The tree keeps
    // ownership of the node.
    Node* GetRootNode()
    {
        return _root;
    }

    // Recursively deletes the subtree rooted at `root` (NULL is allowed).
    // The caller's pointer is passed by value and is therefore not reset.
    void Destory(Node* root)
    {
        if (root == NULL)
        {
            return;
        }
        Destory(root->_left);
        Destory(root->_right);
        delete root;
    }

private:
    // Non-copyable: the tree is the sole owner of its nodes.
    HuffmanTree(const HuffmanTree&) = delete;
    HuffmanTree& operator=(const HuffmanTree&) = delete;

    HuffmanTreeNode<T>* _root;
};
void TestHuffmanTree()
{
int a[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 };
HuffmanTree<int> hf;
hf.CreatTree(a, 10, -1);
}
FileCompress.h
#pragma once
#include "HuffmanTree.h"
#include<algorithm>
#include <cstdio>
#include <cstdlib>
#include <string>
#include<windows.h>
typedef long long LongType;

// Statistics for one byte value: the byte itself, how many times it has
// been counted, and its Huffman code stored as a string of '0'/'1'
// characters.
//
// The arithmetic and comparison operators look only at _count, which lets
// a FileInfo be used directly as the weight type of a HuffmanTree.
struct FileInfo
{
    unsigned char _ch;   // the byte value this record describes
    LongType _count;     // number of occurrences counted so far
    std::string _code;   // Huffman code for _ch, one character per bit

    // Creates a record for byte `ch` with a zero count and an empty code.
    FileInfo(unsigned char ch = 0)
        : _ch(ch)
        , _count(0)
    {}

    // Returns a record whose count is the sum of both counts; its _ch is 0
    // and its _code is empty.
    FileInfo operator+(const FileInfo& fi) const
    {
        FileInfo tmp;
        tmp._count = _count + fi._count;
        return tmp;
    }

    // Orders records by count only.
    bool operator<(const FileInfo& fi) const
    {
        return _count < fi._count;
    }

    // Two records differ when their counts differ; _ch and _code are ignored.
    bool operator!=(const FileInfo& fi) const
    {
        return _count != fi._count;
    }
};
template<class T>
class FileCompress
{
public:
FileCompress()
{
for (int i = 0; i < 256; ++i)
{
_infos[i]._ch = i;
}
}
public:
bool Compress(const char* filename)
{
//1.打开文件,统计文件字符出现的次数
long long Charcount = 0;
assert(filename);
FILE* fOut = fopen(filename, "rb");
assert(fOut);
char ch = fgetc(fOut);
while (ch != EOF)
{
_infos[(unsigned char)ch]._count++;
ch = fgetc(fOut);
Charcount++;
}
//2.生成对应的huffman编码
GenerateHuffmanCode();
//3.压缩文件
string compressFile = filename;
compressFile += ".compress";
FILE* fwCompress = fopen(compressFile.c_str(), "wb");
assert(fwCompress);
fseek(fOut, 0, SEEK_SET);
ch = fgetc(fOut);
char inch = 0;
int index = 0;
while (ch != EOF)
{
string& code = _infos[(unsigned char)ch]._code;
for (size_t i = 0; i < code.size(); ++i)
{
inch = inch << 1;
if (code[i] == '1')
{
inch |= 1;
}
if (++index == 8)
{
fputc(inch, fwCompress);
inch = 0;
index = 0;
}
}
ch = fgetc(fOut);
}
if (index)
{
inch = inch << (8 - index);
fputc(inch, fwCompress);
}
//4.配置文件,方便后续的解压缩
string configFile = filename;
configFile += ".config";
FILE *fconfig = fopen(configFile.c_str(), "wb");
assert(fconfig);
char CountStr[128];
_itoa(Charcount >> 32, CountStr, 10);
fputs(CountStr, fconfig);
fputc('\n', fconfig);
_itoa(Charcount & 0xffffffff, CountStr, 10);
fputs(CountStr, fconfig);
fputc('\n', fconfig);
FileInfo invalid;
for (int i = 0; i < 256; i++)
{
if (_infos[i] != invalid)
{
fputc(_infos[i]._ch, fconfig);
fputc(',', fconfig);
fputc(_infos[i]._count + '0', fconfig);
fputc('\n', fconfig);
}
}
fclose(fOut);
fclose(fwCompress);
fclose(fconfig);
return true;
}
bool UnCompresss(const char* filename)
{
string configfile = filename;
configfile += ".config";
FILE* outConfig = fopen(configfile.c_str(), "rb");
assert(outConfig);
char ch;
long long Charcount = 0;
string line = ReadLine(outConfig);
Charcount = atoi(line.c_str());
Charcount <<= 32;
line.clear();
line = ReadLine(outConfig);
Charcount += atoi(line.c_str());
line.clear();
while (feof(outConfig))
{
line = ReadLine(outConfig);
if (!line.empty())
{
ch = line[0];
_infos[(unsigned char)ch]._count = atoi(line.substr(2).c_str());
line.clear();
}
else
{
line = '\n';
}
}
HuffmanTree<FileInfo> ht;
FileInfo invalid;
ht.CreatTree(_infos, 256, invalid);
HuffmanTreeNode<FileInfo>* root = ht.GetRootNode();
string UnCompressFile = filename;
UnCompressFile += ".uncompress";
FILE* fOut = fopen(UnCompressFile.c_str(), "wb");
string CompressFile = filename;
CompressFile += ".compress";
FILE* fIn = fopen(CompressFile.c_str(), "rb");
int pos = 8;
HuffmanTreeNode<FileInfo>* cur = root;
ch = fgetc(fIn);
while ((unsigned char)ch != EOF)
{
--pos;
if ((unsigned char)ch &(1 << pos))
{
cur = cur->_right;
}
else
{
cur = cur->_left;
}
if (cur->_left == NULL && cur->_right == NULL)
{
fputc(cur->_weight._ch, fOut);
cur = root;
Charcount--;
}
if (pos == 0)
{
ch = fgetc(fIn);
pos = 8;
}
if (Charcount == 0)
{
break;
}
}
fclose(outConfig);
fclose(fIn);
fclose(fOut);
return true;
}
protected:
string ReadLine(FILE* fConfig)
{
char ch = fgetc(fConfig);
if(ch == EOF)
{
return "";
}
string line;
while (ch != '\n' && ch != EOF)
{
line += ch;
ch = fgetc(fConfig);
}
return line;
}
void GenerateHuffmanCode()
{
HuffmanTree<FileInfo> hft;
FileInfo invalid;
hft.CreatTree(_infos, 256, invalid);
_GenerateHuffmanCode(hft.GetRootNode());
}
void _GenerateHuffmanCode(HuffmanTreeNode<FileInfo>* root)
{
if (root == NULL)
{
return;
}
_GenerateHuffmanCode(root->_left);
_GenerateHuffmanCode(root->_right);
if (root->_left == NULL && root->_right == NULL)
{
HuffmanTreeNode<FileInfo>* cur = root;
HuffmanTreeNode<FileInfo>* parent = cur->_parent;
string& code = _infos[cur->_weight._ch]._code;
while (parent)
{
if (parent->_left == cur)
{
code += '0';
}
else if (parent->_right == cur)
{
code += '1';
}
cur = parent;
parent = cur->_parent;
}
reverse(code.begin(), code.end());
}
}
private:
FileInfo _infos[256];
};
void TestFileCompress()
{
FileCompress<FileInfo> fc;
int begin1 = GetTickCount();
fc.Compress("Input.BIG");
int end1 = GetTickCount();
cout << end1 - begin1 << endl;
int begin2 = GetTickCount();
fc.UnCompresss("Input.BIG");
int end2 = GetTickCount();
cout << end2 - begin2 << endl;
}
Heap.h
#pragma once
#include <vector>
#include<assert.h>
#include<iostream>
using namespace std;
// Comparator policies for Heap. The element for which the comparator
// returns true against its parent moves towards the top, so Less yields a
// min-heap and Greater a max-heap.
template<class T>
struct Less
{
    bool operator() (const T& l, const T& r) const
    {
        return l < r; // uses operator<
    }
};

template<class T>
struct Greater
{
    bool operator() (const T& l, const T& r) const
    {
        return l > r; // uses operator>
    }
};

// Binary heap stored in a vector. With the default comparator Less<T> the
// smallest element is on top.
template<class T, class Compare = Less<T> >
class Heap
{
public:
    // Creates an empty heap.
    Heap()
    {}

    // Builds a heap from the `size` elements starting at `a`.
    Heap(const T* a, size_t size)
    {
        _arrays.reserve(size);
        for (size_t i = 0; i < size; ++i)
        {
            _arrays.push_back(a[i]);
        }
        // Heapify: sift down every node that has a child, last one first.
        // Counting down from size/2 avoids the unsigned underflow that
        // (size - 2) / 2 causes for sizes 0 and 1.
        for (size_t i = _arrays.size() / 2; i > 0; --i)
        {
            AdjustDown(static_cast<int>(i - 1));
        }
    }

    // Inserts a copy of x.
    void Push(const T& x)
    {
        _arrays.push_back(x);
        AdjustUp(static_cast<int>(_arrays.size() - 1));
    }

    // Removes the top element. The heap must not be empty.
    void Pop()
    {
        assert(_arrays.size() > 0);
        using std::swap;
        swap(_arrays[0], _arrays[_arrays.size() - 1]);
        _arrays.pop_back();
        AdjustDown(0);
    }

    // Returns the top element. The heap must not be empty.
    T& Top()
    {
        assert(_arrays.size() > 0);
        return _arrays[0];
    }

    const T& Top() const
    {
        assert(_arrays.size() > 0);
        return _arrays[0];
    }

    // True when the heap holds no elements.
    bool Empty() const
    {
        return _arrays.empty();
    }

    // Number of elements currently stored.
    int Size() const
    {
        return static_cast<int>(_arrays.size());
    }

    // Sifts the element at index `root` down until neither child should be
    // above it. A negative index is ignored.
    void AdjustDown(int root)
    {
        if (root < 0)
        {
            return;
        }
        using std::swap;
        Compare com;
        const size_t count = _arrays.size();
        size_t parent = static_cast<size_t>(root);
        size_t child = parent * 2 + 1;
        while (child < count)
        {
            // Pick whichever child the comparator ranks first.
            if (child + 1 < count &&
                com(_arrays[child + 1], _arrays[child]))
            {
                ++child;
            }
            if (!com(_arrays[child], _arrays[parent]))
            {
                break;
            }
            swap(_arrays[child], _arrays[parent]);
            parent = child;
            child = 2 * parent + 1;
        }
    }

    // Sifts the element at index `child` up while the comparator ranks it
    // before its parent. An index of zero or below is ignored.
    void AdjustUp(int child)
    {
        if (child <= 0)
        {
            return;
        }
        assert(static_cast<size_t>(child) < _arrays.size());
        using std::swap;
        Compare com;
        size_t cur = static_cast<size_t>(child);
        while (cur > 0)
        {
            const size_t parent = (cur - 1) / 2;
            if (!com(_arrays[cur], _arrays[parent]))
            {
                break;
            }
            swap(_arrays[parent], _arrays[cur]);
            cur = parent;
        }
    }

    // Prints the elements in storage order, separated by spaces, followed
    // by a newline.
    void Print() const
    {
        for (size_t i = 0; i < _arrays.size(); ++i)
        {
            std::cout << _arrays[i] << " ";
        }
        std::cout << std::endl;
    }

public:
    std::vector<T> _arrays;  // heap storage; index 0 is the top
};
// Priority queue built on Heap<T> with its default comparator. Besides
// Push/Pop it exposes Top/Empty/Size, without which queued elements could
// never be read back.
template<class T>
class PriorityQueue
{
public:
    // Adds a copy of x to the queue.
    void Push(const T& x)
    {
        _hp.Push(x);
    }

    // Removes the element currently on top. The queue must not be empty.
    void Pop()
    {
        _hp.Pop();
    }

    // Returns the element currently on top. The queue must not be empty.
    T& Top()
    {
        return _hp.Top();
    }

    // True when the queue holds no elements.
    bool Empty()
    {
        return _hp.Empty();
    }

    // Number of queued elements.
    int Size()
    {
        return _hp.Size();
    }

public:
    Heap<T> _hp;  // underlying heap
};
void Test1()
{
int a[10] = { 10, 11, 13, 12, 16, 18, 15, 17, 14, 19 };
Heap<int, Greater<int> > hp1(a, 10);
hp1.Push(1);
hp1.Print();
Heap<int> hp2(a, 10);
hp2.Push(1);
hp2.Print();
//Less<int> less;
//cout<<less(1, 2)<<endl;
//Greater<int> greater;
//cout<<greater(1, 2)<<endl;
}
#include <list>
// Demo of iterator traversal: fills a vector and a list with 1..4 and
// prints each container over the half-open range [begin, end).
void Test2()
{
    std::vector<int> numbers;
    for (int value = 1; value <= 4; ++value)
    {
        numbers.push_back(value);
    }
    for (std::vector<int>::iterator pos = numbers.begin();
         pos != numbers.end(); ++pos)
    {
        std::cout << *pos << " ";
    }
    std::cout << std::endl;

    std::list<int> chain;
    for (int value = 1; value <= 4; ++value)
    {
        chain.push_back(value);
    }
    for (std::list<int>::iterator pos = chain.begin();
         pos != chain.end(); ++pos)
    {
        std::cout << *pos << " ";
    }
    std::cout << std::endl;
}
// Sifts a[root] down inside the max-heap a[0..size) until neither child is
// larger than it. A negative root is ignored.
void AdjustDown(int* a, size_t size, int root)
{
    if (root < 0)
    {
        return;
    }
    size_t parent = static_cast<size_t>(root);
    size_t child = parent * 2 + 1;
    while (child < size)
    {
        // Pick the larger of the two children.
        if (child + 1 < size && a[child + 1] > a[child])
        {
            ++child;
        }
        if (!(a[child] > a[parent]))
        {
            break;
        }
        std::swap(a[child], a[parent]);
        parent = child;
        child = 2 * parent + 1;
    }
}

// Sorts a[0..size) into ascending order in place using heap sort.
// Arrays of size 0 and 1 are left untouched.
void HeapSort(int* a, size_t size)
{
    // Build a max-heap: sift down every node that has a child, last first.
    // Counting down from size/2 avoids the unsigned underflow that
    // (size - 2) / 2 causes for sizes 0 and 1.
    for (size_t i = size / 2; i > 0; --i)
    {
        AdjustDown(a, size, static_cast<int>(i - 1));
    }

    // Repeatedly move the current maximum to the end of the unsorted
    // prefix, then restore the heap on the shortened prefix.
    for (size_t end = size; end > 1; --end)
    {
        std::swap(a[0], a[end - 1]);
        AdjustDown(a, end - 1, 0);
    }
}
// Smoke test: heap-sorts a fixed array of ten integers in place.
void TestHeapSort()
{
    int data[10] = { 5, 9, 2, 3, 0, 1, 7, 8, 4, 6 };
    const size_t count = sizeof(data) / sizeof(data[0]);
    HeapSort(data, count);
}
Test.cpp
#define _CRT_SECURE_NO_WARNINGS 1
#include <iostream>
using namespace std;
//#include "HuffmanTree.h"
#include "FileCompress.h"
// Program entry point: runs the file compression demo.
int main()
{
    // The stand-alone tree test is currently disabled:
    // TestHuffmanTree();

    TestFileCompress();

    return 0;
}
运行结果:
运行后文件夹:
压缩后:
解压后: