什么是文件压缩?
文件压缩:采用一种机制,将大文件转换为小文件。
基本概念
在一棵二叉树中:
路径:定义从A结点到B结点所经过的分支序列为A结点到B结点的路径。
路径长度:定义从A结点到B结点所经过的分支个数为A结点到B结点的路径长度。
二叉树的路径长度:从二叉树的根结点到二叉树中所有结点的路径长度之和。
由树的定义可知:从树的根结点到达树中每一结点有且只有一条路径。若设根结点处于第1层,某结点处于第k层,则从根结点到该结点的路径长度等于该结点所处层数减1,即k-1。
PL1(完全二叉树)=0+1+1+2+2+2+2+3=13
PL2(普通二叉树)=0+1+1+2+2+2+3+3=14
设二叉树有n个带权值的叶结点,定义从二叉树的根结点到二叉树中所有叶结点的路径长度与相应权值的乘积之和为该二叉树的带权路径长度WPL,即 $WPL=\sum_{i=1}^{n} w_i l_i$,其中 $w_i$ 为第i个叶结点的权值,$l_i$ 为从根结点到该叶结点的路径长度。
对于一组具有确定权值的叶结点,可以构造出多个具有不同带权路径长度的二叉树,其中带权路径长度最小的二叉树称为Huffman树(最优二叉树)。
这4棵二叉树的带权路径长度分别为:
(a) WPL=1*2+3*2+5*2+7*2=32
(b) WPL=1*2+3*3+5*3+7*1=33
(c) WPL=7*3+5*3+3*2+1*1=43
(d) WPL=1*3+3*3+5*2+7*1=29
代码实现
HuffmanTree.h
#pragma once
#include<queue>
#include<vector>
// A single node of a Huffman tree.
// W is the weight type carried by the node; the three links start out null,
// so a freshly constructed node is detached from any tree.
template<class W>
struct HuffmanTreeNode
{
    // Creates a detached node holding a copy of `weight`
    // (a value-initialised W when no argument is given).
    HuffmanTreeNode(const W& weight = W())
        : _weight(weight)
    {}

    HuffmanTreeNode<W>* _pLeft = nullptr;   // left child
    HuffmanTreeNode<W>* _pRight = nullptr;  // right child
    HuffmanTreeNode<W>* _pParent = nullptr; // parent node
    W _weight;                              // weight stored in this node
};
// Comparison functor for node pointers.
// Returns true when the left node's weight is greater than the right node's,
// which makes std::priority_queue keep the node with the smallest weight on top.
template<class W>
class Compare
{
public:
    // Declared const so that a const comparator object can be invoked as well.
    // The pointers stay non-const because W::operator> is not required to be
    // a const member function.
    bool operator()(HuffmanTreeNode<W>* pLeft, HuffmanTreeNode<W>* pRight) const
    {
        return pLeft->_weight > pRight->_weight;
    }
};
template<class W>
class HuffmanTree
{
typedef HuffmanTreeNode<W> Node;
typedef Node* PNode;
public:
HuffmanTree(const W* array,size_t size,const W& invalid)//invalid:无效值
{
std::priority_queue<PNode,std::vector<PNode>,Compare<W>> q;
for (size_t i = 0; i < size; ++i)
{
if (array[i]!=invalid)
q.push(new Node(array[i]));//有效值的处理
}
while (q.size()>1)
{
PNode pLeft = q.top();
q.pop();
PNode pRight = q.top();
q.pop();
PNode pParent = new Node(pLeft->_weight + pRight->_weight);
pParent->_pLeft = pLeft;
pLeft->_pParent = pParent;
pParent->_pRight = pRight;
pRight->_pParent = pParent;
q.push(pParent);
}
_pRoot = q.top();
}
PNode GetRoot()
{
return _pRoot;
}
~HuffmanTree()
{
Destroy(_pRoot);
}
private:
void Destroy(PNode& pRoot)
{
if (pRoot)
{
Destroy(pRoot->_pLeft);
Destroy(pRoot->_pRight);
delete pRoot;
pRoot = 0;
}
}
private:
PNode _pRoot;
};
FileCompress.h
#pragma once
#define _CRT_SECURE_NO_WARNINGS
#include<assert.h>
#include<algorithm>
#include<cstdio>
#include<iostream>
#include<memory>
#include<stdexcept>
#include<string>
#include"HuffmanTree.h"
using namespace std;
typedef unsigned long long UINT64;

// Bookkeeping for one byte value: the byte itself, how many times it occurs
// and its Huffman code. All comparisons and the addition look only at
// _appearCount, so the type can serve directly as a Huffman tree weight.
struct CharInfo
{
    // Creates an entry with the given occurrence count.
    // _ch is zero-initialised so that objects produced by operator+ never
    // carry an indeterminate byte value.
    CharInfo(UINT64 appearCount = 0)
        : _ch(0)
        , _appearCount(appearCount)
    {}

    // Returns a new entry whose count is the sum of both counts.
    CharInfo operator+(const CharInfo& info) const
    {
        return CharInfo(_appearCount + info._appearCount);
    }

    bool operator>(const CharInfo& info) const
    {
        return _appearCount > info._appearCount;
    }

    bool operator!=(const CharInfo& info) const
    {
        return _appearCount != info._appearCount;
    }

    bool operator==(const CharInfo& info) const
    {
        return _appearCount == info._appearCount;
    }

    unsigned char _ch;      // byte value; unsigned so it is never negative
    UINT64 _appearCount;    // number of occurrences of the byte
    std::string _strCode;   // Huffman code of the byte as a string of '0'/'1'
};
class FileCompress
{
public:
FileCompress()
{
for (size_t i = 0; i < 256; ++i)
{
_fileInfo[i]._ch = i;//字符初始化(i是字符的ASCII码)
_fileInfo[i]._appearCount = 0;//字符出现次数的初始化
}
}
void CompressFile(const string& strFilePath)
{
//1.统计文件中每个字符出现的次数(把文件读一遍)
FILE* fIn = fopen(strFilePath.c_str(), "rb");//fopen的参数是char*,获取c格式的字符串
assert(fIn);
unsigned char* pReadBuff = new unsigned char[1024];//无符号的都为正数,一次读取1024个字符
while (1)
{
size_t readSize=fread(pReadBuff, 1, 1024, fIn);//从fIn中每次读1字,一次读1024个字节,
//放到pReadBuff中,若读的次数<1024,就返回读到的字符数
if (0 == readSize)//读到文件的末尾
break;
for (size_t i = 0; i < readSize; ++i)
_fileInfo[pReadBuff[i]]._appearCount++;//从所读的字符中统计某个字符出现的次数
}
//根据权值创建Huffman树
HuffmanTree<CharInfo> ht(_fileInfo, 256,CharInfo(0));
//根据Huffman树获取每个字符的编码
GenerateHuffmanCode(ht.GetRoot());
FILE* fOut = fopen("1.hzp", "w");
assert(fOut);
//解压缩需要用到的信息
//WriteComFileHeadInfo(fOut, strFilePath);
//用每个字符的编码重新改写文件
fseek(fIn, 0, SEEK_SET);//偏移量,fIn这个文件指针,从文件的起始地址 ,偏移量为0
char ch = 0;
size_t pos = 0;
char* pWriteBuff = new char[1024];
size_t writeSize = 0;
for (;;)//表示死循环
{
size_t readSize = fread(pReadBuff, 1, 1024, fIn);
if (0 == readSize)
break;
for (size_t i = 0; i < readSize; ++i)
{
string& strCode = _fileInfo[pReadBuff[i]]._strCode;
for (size_t j = 0; j < strCode.length(); ++j)
{
// 100 10010110 11011111 11111100 00000000
ch <<= 1;
if ('1' == strCode[j])//第j个字符编码为1
ch |= 1;//或等上1
++pos;
if (8 == pos)
{
pWriteBuff[writeSize++] = ch;
if (1024 == writeSize)
{
fwrite(pWriteBuff, 1, 1024, fOut);//把写缓冲区中的数据写到fOut中
writeSize = 0;
}
ch = 0;//ch8个bit位写满了,就要清零
pos = 0;//满了,就要清零
}
}
}
}
if (pos>0 && pos < 8)
{
ch <<= (8 - pos);
pWriteBuff[writeSize++] = ch;
}
fwrite(pWriteBuff, 1, writeSize, fOut);
fclose(fIn);
fclose(fOut);
delete[]pReadBuff;
delete[]pWriteBuff;
}
void UNCompressFile(const string& strComFilePath)
{
1.从压缩文件头部信息中获取解压缩需要用到的信息
FILE* fIn=fopen(strComFilePath.c_str(),"r");
assert(fIn);
获取文件后缀
string strFilePostFix;
GetLine(fIn,strFilePostFix);
获取有效字符的总行数
string strCount;
GetLine(fIn,strCount);
size_t lineCount=atoi(strCount.c_str());
获取每个字符的次数信息
strCount="";
for(size_t i=0;i<lineCount;++i)
{
GetLine(fIn,strCount);
_fileInfo[strCount[0]]._appearCount=atoi(strCount.c_str()+2);
strCount="";
}
还原Huffman树
HuffmanTree<CharInfo> ht(_fileInfo,256,CharInfo(0));
//读取压缩数据+huffman树还原源文件
string strFileName("2");
strFileName+=strFilePostFix;
FILE* fOut=fopen(strFileName.c_str(),"w");
assert(fOut);
char* pReadBuff=new char[1024];
char* pWriteBuff=new char[1024];
size_t writeSize=0;
HuffmanTreeNode<CharInfo>* pRoot=ht.GetRoot();
HuffmanTreeNode<CharInfo>* pCur=pRoot;
源文件大小
size_t fileLen=pRoot->_weight._appearCount;
size_t totalSize=0;//解压了多少字符
for(;;)
{
size_t readSize=fread(pReadBuff,1,1024,fIn);
if(0==readSize)
break;
for(size_t i=0;i<readSize;++i)
{
char ch=pReadBuff[i];
for(size_t j=0;j<8;++j)
{
if(ch&(1<<(7-j)))
pCur=pCur->_pRight;
else
pCur=pCur->_pLeft;
if(NULL==pCur->_pLeft&&NULL==pCur->_pRight)
{
pWriteBuff[writeSize++]=pCur->_weight._ch;
if(1024==writeSize)
{
fwrite(pWriteBuff,1,1024,fOut);
writeSize=0;
}
pCur=pRoot;
totalSize++;
if(totalSize==fileLen)
break;
}
}
}
}
fwrite(pWriteBuff,1,writeSize,fOut);
fclose(fIn);
fclose(fOut);
delete[]pReadBuff;
delete[]pWriteBuff;
}
private:
void GetLine(FILE* fIn,string& strCode)
{
char ch;
while(!feof(fIn))
{
ch=fgetc(fIn);
if('\n'==ch)
return;
strCode+=ch;
}
}
void WriteComFileHeadInfo(FILE* fOut,const string& strFilePath)
{
string strHeadInfo=GetFilePostFix(strFilePath);
strHeadInfo+='\n';
string strCode;
size_t lineCount=0;
char szAppearCount[32];//出现的次数
for(size_t i=0;i<256;++i)
{
if(0!=_fileInfo[i]._appearCount)
{
lineCount++;
strCode+=_fileInfo[i]._ch;
strCode+=',';
itoa(_fileInfo[i]._appearConut,szAppearCount,10);
strCode+=szAppearCount;
strCode+='\n';
}
}
itoa(lineCount,szAppearCount,10);
strHeadInfo+=szAppearCount;
strHeadInfo+='\n';
fwrite(strHeadInfo.c_str(),1,strHeadInfo.length(),fOut);
fWrite(strCode.c_str(),1,strCode.length(),fOut);
}
string GetFilePostFix(const string& strFilePath)//获取文件后缀
{
//111.txt
//F:\word\111.txt
return strFilePath.substr(strFilePath.find_last_of('.'));
}
void GenerateHuffmanCode(HuffmanTreeNode<CharInfo>* pRoot)
{
if (NULL == pRoot)
return;
GenerateHuffmanCode(pRoot->_pLeft);
GenerateHuffmanCode(pRoot->_pRight);
if (NULL == pRoot->_pLeft && NULL == pRoot->_pRight)
{
HuffmanTreeNode<CharInfo>* pCur = pRoot;//刚开始把pCur应该放到叶子结点处
HuffmanTreeNode<CharInfo>* pParent = pCur->_pParent;
string& strCode = _fileInfo[pCur->_weight._ch]._strCode;//结构体的对象用“.”
//叶子权值,权值里面对应的字符,字符的ASCII码为下标,
while (pParent)
{
if (pCur == pParent->_pLeft)
strCode += '0';
else
strCode += '1';
pCur = pParent;
pParent = pCur->_pParent;
}
reverse(strCode.begin(), strCode.end());
}
}
private:
CharInfo _fileInfo[256];//char类型范围
};
// Manual driver: compresses "1.txt" from the current directory.
// The decompression step is kept below but is disabled.
void TestFileCompress()
{
    // Compression
    FileCompress().CompressFile("1.txt");

    // Decompression (disabled)
    //FileCompress fc;
    //fc.UNCompressFile("2.hzp");
}
Test.cpp
#include"FileCompress.h"
// Program entry point: runs the compression driver and exits with status 0
// (falling off the end of main is equivalent to `return 0;`).
int main()
{
    TestFileCompress();
}