主要实现思路: 先统计每个字符出现的次数,利用最小堆的特性构造哈夫曼树,再遍历哈夫曼树得到每个字符的哈夫曼编码。
- 统计文件字符出现的次数,本质是利用字符出现的次数进行构造哈夫曼树; 然后通过遍历哈夫曼树获取哈夫曼编码;
- 压缩文件中有两部分信息: 先是配置信息(每条记录包含字符和该字符出现的次数),然后是按huffman code写出的压缩数据; 两部分之间用一条出现次数为0的info记录作为分隔标记;
当解压缩时,可以利用压缩文件配置信息中的内容重新构造出一棵相同的哈夫曼树,然后根据压缩数据中的哈夫曼编码还原文件信息;
- heap.h
// heap.h
#pragma once
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <utility>
#include <vector>
using namespace std;
/// Ordering functor: returns true when `left` compares strictly less than
/// `right` using the element type's own operator<.
/// The call operator is const so the functor can be used through a const
/// object or const reference.
template <class T>
struct Less
{
    bool operator()(const T& left, const T& right) const
    {
        return left < right;
    }
};

/// Partial specialization for pointer elements: instead of comparing the
/// addresses, compares the `_w` member of the pointed-to objects.
/// Both pointers are dereferenced, so neither may be null.
template <class T>
struct Less<T*>
{
    bool operator()(T* nodeleft, T* noderight) const
    {
        return nodeleft->_w < noderight->_w;
    }
};
/// Ordering functor: returns true when `left` compares strictly greater than
/// `right` using the element type's own operator>.
/// The call operator is const so the functor can be used through a const
/// object or const reference.
template <class T>
struct Greater
{
    bool operator()(const T& left, const T& right) const
    {
        return left > right;
    }
};
/// Binary heap stored in a vector.
///
/// `Compare(a, b)` returning true means `a` must sit closer to the top than
/// `b`. With the default `Less<T>` this is a min-heap (smallest element on
/// top).
template <class T, class Compare = Less<T>>
class Heap
{
public:
    /// Creates an empty heap.
    Heap()
    {}

    /// Builds a heap from the first `n` elements of `a`.
    /// `a` may only be null when `n` is 0.
    Heap(T* a, size_t n)
    {
        assert(a != nullptr || n == 0);
        _a.reserve(n);
        for (size_t i = 0; i < n; ++i)
        {
            _a.push_back(a[i]);
        }
        // Heapify bottom-up. Leaves are already valid heaps, so start at the
        // last node that has a child (index size/2 - 1) and walk to the root.
        for (size_t i = _a.size() / 2; i > 0; --i)
        {
            AdjustDown(static_cast<int>(i - 1));
        }
    }

    /// Sifts the element at index `root` down until neither child has to sit
    /// above it. Negative or out-of-range indices are ignored.
    void AdjustDown(int root)
    {
        if (root < 0)
        {
            return;
        }
        using std::swap;
        Compare com;
        size_t parent = static_cast<size_t>(root);
        size_t child = parent * 2 + 1;
        while (child < _a.size())
        {
            // Pick whichever child belongs closer to the top.
            if (child + 1 < _a.size() && com(_a[child + 1], _a[child]))
            {
                ++child;
            }
            if (!com(_a[child], _a[parent]))
            {
                break;
            }
            swap(_a[child], _a[parent]);
            parent = child;
            child = parent * 2 + 1;
        }
    }

    /// Sifts the element at index `child` up until its parent no longer has
    /// to sit below it. Negative or out-of-range indices are ignored.
    void AdjustUp(int child)
    {
        if (child < 0 || static_cast<size_t>(child) >= _a.size())
        {
            return;
        }
        using std::swap;
        Compare com;
        size_t cur = static_cast<size_t>(child);
        // Stop explicitly at the root instead of relying on (0 - 1) / 2
        // truncating to 0.
        while (cur > 0)
        {
            const size_t parent = (cur - 1) / 2;
            if (!com(_a[cur], _a[parent]))
            {
                break;
            }
            swap(_a[cur], _a[parent]);
            cur = parent;
        }
    }

    /// Number of stored elements.
    size_t size() const
    {
        return _a.size();
    }

    /// Inserts `x` and restores the heap order.
    void push(const T& x)
    {
        _a.push_back(x);
        // The new element is the last node of the tree.
        AdjustUp(static_cast<int>(_a.size() - 1));
    }

    /// Removes the top element. Does nothing when the heap is empty.
    void pop()
    {
        if (empty())
        {
            return;
        }
        using std::swap;
        swap(_a[0], _a[_a.size() - 1]);
        _a.pop_back();
        AdjustDown(0);
    }

    /// True when the heap holds no elements.
    bool empty() const
    {
        return _a.empty();
    }

    /// Checks the heap invariant: no element has to sit above its parent
    /// according to `Compare`. Empty and single-element heaps are valid.
    bool isHeap() const
    {
        Compare com;
        for (size_t child = 1; child < _a.size(); ++child)
        {
            if (com(_a[child], _a[(child - 1) / 2]))
            {
                return false;
            }
        }
        return true;
    }

    /// Returns the top element. The heap must not be empty.
    T& top()
    {
        assert(!empty());
        return _a[0];
    }

    /// Read-only access to the top element. The heap must not be empty.
    const T& top() const
    {
        assert(!empty());
        return _a[0];
    }

    /// Prints the elements in storage order, separated by spaces, followed
    /// by a newline.
    void print() const
    {
        for (size_t i = 0; i < _a.size(); ++i)
        {
            std::cout << _a[i] << " ";
        }
        std::cout << std::endl;
    }

private:
    std::vector<T> _a; // implicit binary tree: children of i are 2i+1 and 2i+2
};
// Number of random samples generated by TopK().
constexpr size_t N = 10000;
// Number of largest samples TopK() keeps and prints.
constexpr size_t K = 3;
/// Min-heap sift-down on a raw int array.
///
/// Moves heap[root] down within heap[0..n) until it is not greater than
/// either of its children. A negative `root` is ignored.
/// Declared inline because this definition lives in a header.
///
/// @param heap  array holding the heap; must not be null
/// @param n     number of elements that belong to the heap
/// @param root  index of the element to sift down
inline void AdjustDown(int* heap, int n, int root)
{
    assert(heap);
    if (root < 0)
    {
        return;
    }
    int parent = root;
    int child = parent * 2 + 1;
    while (child < n)
    {
        // Continue with the smaller of the two children.
        if (child + 1 < n && heap[child + 1] < heap[child])
        {
            ++child;
        }
        if (!(heap[child] < heap[parent]))
        {
            break;
        }
        std::swap(heap[child], heap[parent]);
        parent = child;
        child = parent * 2 + 1;
    }
}
/// Demo: prints the K largest of N sample values.
///
/// Fills an array with N values of rand() % N, overwrites three slots with
/// known values that are larger than any random sample, keeps the K largest
/// values in a K-element min-heap and prints the heap contents separated by
/// spaces (no trailing newline).
/// Declared inline because this definition lives in a header.
inline void TopK()
{
    static_assert(K >= 1 && K <= N, "TopK needs 1 <= K <= N");
    static_assert(N > 100, "TopK writes to a[100]");

    int a[N] = { 0 };
    for (size_t i = 0; i < N; ++i)
    {
        a[i] = std::rand() % N;
    }
    // Planted values, all >= N, so they are guaranteed to be the largest.
    a[0] = 11111;
    a[100] = 25025;
    a[2] = 48525;

    // Seed the heap with the first K samples.
    int heap[K] = { 0 };
    for (size_t i = 0; i < K; ++i)
    {
        heap[i] = a[i];
    }
    // Heapify from the last parent (index K/2 - 1) up to the root. Counting
    // with i > 0 avoids the unsigned underflow of (K - 2) / 2 when K < 2.
    const int heapSize = static_cast<int>(K);
    for (size_t i = K / 2; i > 0; --i)
    {
        AdjustDown(heap, heapSize, static_cast<int>(i - 1));
    }

    // heap[0] is the smallest value kept so far; any larger sample replaces it.
    for (size_t i = K; i < N; ++i)
    {
        if (a[i] > heap[0])
        {
            heap[0] = a[i];
            AdjustDown(heap, heapSize, 0);
        }
    }

    for (size_t i = 0; i < K; ++i)
    {
        std::cout << heap[i] << " ";
    }
}
/// In-place heap sort of a[0..n).
///
/// Builds a min-heap with AdjustDown and then repeatedly moves the current
/// minimum to the end of the shrinking heap, so the array ends up sorted in
/// DESCENDING order.
/// Declared inline because this definition lives in a header.
///
/// @param a  array to sort; must not be null
/// @param n  number of elements in `a`
inline void HeapSort(int* a, size_t n)
{
    assert(a);
    // Nothing to do for 0 or 1 elements; returning early also avoids the
    // unsigned underflow of (n - 2) / 2.
    if (n < 2)
    {
        return;
    }
    const int count = static_cast<int>(n);

    // Build the heap bottom-up, starting at the last parent. This is O(N).
    for (int i = (count - 2) / 2; i >= 0; --i)
    {
        AdjustDown(a, count, i);
    }

    // Selection phase, O(N log N): swap the minimum behind the heap, shrink
    // the heap by one and repair it from the root.
    for (int end = count - 1; end > 0; --end)
    {
        std::swap(a[0], a[end]);
        AdjustDown(a, end, 0);
    }
}
/// Demo: sorts a fixed ten-element array with HeapSort and prints the result
/// on one line, each value followed by a space.
/// Declared inline because this definition lives in a header.
inline void TestHeapSort()
{
    int a[] = { 10, 11, 13, 12, 16, 18, 15, 17, 14, 19 };
    HeapSort(a, sizeof(a) / sizeof(a[0]));
    for (int value : a)
    {
        std::cout << value << " ";
    }
    std::cout << std::endl;
}
- huffmantree.h
// huffmantree.h
#pragma once
#include<assert.h>
#include "Heap.h"
/// Node of a Huffman tree: a weight plus links to both children and to the
/// parent. A freshly constructed node is not linked to anything.
template <class W>
struct HuffmanTreeNode
{
    W _w;                                   // weight
    HuffmanTreeNode<W>* _left = nullptr;    // left child
    HuffmanTreeNode<W>* _right = nullptr;   // right child
    HuffmanTreeNode<W>* _parent = nullptr;  // parent

    /// Creates an unlinked node carrying a copy of `w`.
    HuffmanTreeNode(const W& w)
        : _w(w)
    {}
};
template<class T>
class HuffmanTree
{
typedef HuffmanTreeNode<T> Node;
public:
HuffmanTree()
:_root(0)
{}
HuffmanTree(T* arr, size_t size, const T& invalid)
{
_root = GreatHuffmanTree(arr, size, invalid);
}
Node* getHuffmanTree()
{
return _root;
}
private:
Node* GreatHuffmanTree(T* arr, size_t size, const T& invalid)
{
assert(arr);
//建小堆
Heap<Node*, Less<Node*>> minheap;
for (size_t i = 0; i < size; ++i)
{
if (arr[i] != invalid)
{
Node* tmp = new Node(arr[i]);
minheap.push(tmp);
}
}
//构建Huffman
if (minheap.size())
{
while (minheap.size() > 1)
{
Node* left = minheap.top();
minheap.pop();
Node* right = minheap.top();
minheap.pop();
Node* newNode = new Node(left->_w + right->_w);
newNode->_left = left;
newNode->_right = right;
left->_parent = newNode;
right->_parent = newNode;
minheap.push(newNode);
}
return minheap.top();
}
else
return NULL;
}
private:
Node* _root;
};
- file_compress.h
// file_compress.h
#pragma once
#include "HuffmanTree.h"
#include <string>
#include <assert.h>
// Counter type for byte frequencies.
typedef long long longtype;

/// Per-byte bookkeeping for Huffman coding: the byte value, how often it
/// occurs and its code as a string of '0'/'1' characters.
///
/// All comparisons look at `_count` only. The operators are const so they
/// can be used on const objects and const references.
struct charInfo
{
    unsigned char _ch = 0;   // byte value, 0-255
    longtype _count = 0;     // number of occurrences
    std::string _code;       // Huffman code

    /// Returns an entry whose count is the sum of both counts. `_ch` and
    /// `_code` of the result keep their default values.
    charInfo operator+(const charInfo& info) const
    {
        charInfo ret;
        ret._count = _count + info._count;
        return ret;
    }

    bool operator<(const charInfo& info) const
    {
        return _count < info._count;
    }

    bool operator>(const charInfo& info) const
    {
        return _count > info._count;
    }

    bool operator!=(const charInfo& info) const
    {
        return _count != info._count;
    }
};
class FileCompress
{
typedef HuffmanTreeNode<charInfo> Node;
struct ConfigInfo //写配置信息时使用
{
unsigned char _ch;
longtype _count;
};
public:
FileCompress()
{
for (int i = 0; i < 256; i++)
{
_infos[i]._ch = i;
_infos[i]._count = 0;
}
}
//文件压缩
void Compress(const char* filename)
{
assert(filename);
FILE* fout = fopen(filename,"r");
assert(fout);
// 1.统计文件中字符出现的次数
char ch = fgetc(fout);
while (ch != EOF)
{
_infos[(unsigned char)ch]._count++;
ch = fgetc(fout);
}
// 2.构建huffman tree
charInfo invalid;
invalid._count = 0;
HuffmanTree<charInfo> tree(_infos, 256, invalid);
Node* root = tree.getHuffmanTree();
// 3.获得huffman code
string code;
GenerateHuffmancode(root, code);
//GenerateHuffmanCode(root);
// 4.压缩
string compressfile = filename;
compressfile += ".huffman";
FILE* fin = fopen(compressfile.c_str(),"w");
assert(fin);
// 写配置信息(字符出现次数)
//fseek(fin, 0, SEEK_SET);
for (size_t i = 0; i < 256; ++i)
{
ConfigInfo info;
if (_infos[i]._count)
{
info._ch = _infos[i]._ch;
info._count = _infos[i]._count;
fwrite(&info, sizeof(ConfigInfo), 1, fin);
}
}
//区分配置信息和huffman编码
ConfigInfo info;
info._count = 0;
fwrite(&info, sizeof(ConfigInfo), 1, fin);
//将string code转换为二进制
fseek(fout, 0, SEEK_SET);//函数设置文件指针stream的位置 指针回到开始位置
ch = fgetc(fout);
char value = 0; //存放二进制
char pos = 0;
while (ch != EOF)
{
//"aaaabbbccd" 00001010 10111111 1100000
string& code = _infos[(unsigned char)ch]._code;
for (size_t i = 0; i < code.size(); ++i)
{
if (code[i] == '1')
{
value |= (1 << pos);
}
else // '0'
{
value &= (~(1 << pos));
}
++pos;
if (pos == 8)
{
fputc(value, fin);
value = 0;
pos = 0;
}
}
ch = fgetc(fout);
}
if (pos)//将不足8位的补齐
{
fputc(value, fin);
}
写配置信息(字符出现次数)
//fseek(fin, 0, SEEK_SET);
//for (size_t i = 0; i < 256; ++i)
//{
// ConfigInfo info;
// if (_infos[i]._count)
// {
// info._ch = _infos[i]._ch;
// info._count = _infos[i]._count;
// fwrite(&info, sizeof(ConfigInfo), 1, fin);
// }
//}
区分配置信息和huffman编码
//ConfigInfo info;
//info._count = 0;
//fwrite(&info, sizeof(ConfigInfo), 1, fin);
//关闭文件流
fclose(fin);
fclose(fout);
}
void Uncompress(const char* filename)
{
assert(filename);
string uncompressFile = filename;
size_t index = uncompressFile.rfind('.');
assert(index != string::npos);
//uncompressFile.erase(index, uncompressFile.size() - index + 1);
uncompressFile += ".unhuffman";
FILE* fin = fopen(uncompressFile.c_str(), "w");
assert(fin);
FILE* fout = fopen(filename, "r");
assert(fout);
//1.读配置信息
while (1)
{
ConfigInfo info;
info._count = 0;
fread(&info, sizeof(ConfigInfo), 1, fout);
if (info._count)
{
_infos[info._ch]._count = info._count;
}
else
{
break;
}
}
//2.重建Huffman树
charInfo invalid;
invalid._count = 0;
HuffmanTree<charInfo> tree(_infos, 256, invalid);
Node* root = tree.getHuffmanTree();
Node* cur = root;
longtype totalcount = root->_w._count;//所有字符出现总次数
char value = fgetc(fout);
size_t pos = 0;
while (value != EOF)
{
if (value&(1 << pos))
{
cur = cur->_right;
}
else
{
cur = cur->_left;
}
if ((cur->_left == NULL) && (cur->_right == NULL))
{
fputc(cur->_w._ch, fin);
cur = root;
if (--totalcount == 0)//解码完成
{
break;
}
}
++pos;
if (pos == 8)
{
value = fgetc(fout);
pos = 0;
}
}
fclose(fout);
fclose(fin);
}
//生成Huffman code
void GenerateHuffmancode(Node* cur, string code)
{
if (cur == NULL)
return;
if (cur->_left == NULL && cur->_right == NULL)
{
_infos[(unsigned char)cur->_w._ch]._code = code;
return;
}
GenerateHuffmancode(cur->_left, code + '0');
GenerateHuffmancode(cur->_right, code + '1');
}
/*void GenerateHuffmanCode(Node* cur)
{
if (cur == NULL)
return;
if (cur->_left == NULL && cur->_right == NULL)
{
Node* child = cur;
Node* parent = child->_parent;
string& code = _infos[cur->_w._ch]._code;
while (parent)
{
if (parent->_left == child)
{
code += '0';
}
else
{
code.push_back('1');
}
child = parent;
parent = child->_parent;
}
reverse(code.begin(), code.end());
}
GenerateHuffmanCode(cur->_left);
GenerateHuffmanCode(cur->_right);
}*/
private:
charInfo _infos[256];//ASCII码表中0-255
};
- test.cpp
// test.cpp
#include <iostream>
using namespace std;
#include <assert.h>
#include "Heap.h"
#include "HuffmanTree.h"
#include <string>
#include "FileCompress.h"
// Heap demo: builds a heap from an array, prints it and its size, pushes
// three values, pops the top and prints the heap again.
void test1()
{
    int a[] = { 8, 7, 13, 12, 16, 18, 15, 17, 14, 19 };
    // Element count; sizeof(a[0]) rather than an out-of-range element.
    Heap<int> hp1(a, sizeof(a) / sizeof(a[0]));
    hp1.print();
    std::cout << hp1.size() << std::endl;
    hp1.push(55);
    hp1.push(54);
    hp1.push(53);
    hp1.pop();
    hp1.print();
}
// Top-K and heap-sort demo: runs TopK(), ends its output line, then runs
// TestHeapSort().
void test2()
{
    TopK();
    std::cout << std::endl;

    TestHeapSort();
}
//文件压缩
void TestFileCompress()
{
FileCompress fc;
fc.Compress("input.txt");
cout << "FileCompress Succeed!" << endl;
}
//文件解压缩
void TestFileUncompress()
{
FileCompress fc;
fc.Uncompress("input.txt.huffman");
cout << "FileUncompress Succeed!" << endl;
}
// Entry point: runs the compression demo followed by the decompression demo.
int main()
{
    // The heap demos are switched off:
    //test1();
    //test2();

    TestFileCompress();
    TestFileUncompress();

    // Runs the shell command "pause" before exiting.
    system("pause");
    return 0;
}