文件压缩有很多种算法,本文介绍的是基于Huffman算法的文件压缩。
对于Huffman压缩,最重要的就是建立Huffman树与重建Huffman树,本文对如何建立Huffman树不做重点讨论。
首先将源文件遍历一遍,统计其中每个字符出现的次数,并将其保存在下面的结构体中:
// Per-byte statistics record; it is the weight type stored in the Huffman tree
// (HuffmanTree<FileInfo>).
//
// Equality compares _count only, so a default-constructed record (count 0)
// compares equal to the record of any byte value that never occurs.
struct FileInfo
{
    // ch: the byte value this record describes. The count starts at zero and
    // the code starts empty.
    FileInfo(unsigned char ch = 0)
        : _ch(ch)
        , _count(0)
    {}

    // Copy and move operations are deliberately left to the compiler (Rule of
    // Zero): the generated ones copy/move all three members, and not declaring
    // a copy-assignment operator keeps the implicit move operations available,
    // so moving a record does not copy its code string.

    // Two records are equal when their occurrence counts are equal.
    bool operator==(const FileInfo& x) const
    {
        return _count == x._count;
    }

    // Two records differ when their occurrence counts differ.
    bool operator!=(const FileInfo& x) const
    {
        return _count != x._count;
    }

    unsigned char _ch; // the byte value
    long long _count;  // number of times the byte value occurs
    string coding;     // Huffman code of the byte value, as '0'/'1' characters
};
// Combines two records into a fresh one whose count is the sum of both counts.
// The result keeps the default byte value and an empty code.
FileInfo operator+(const FileInfo& left, const FileInfo& right)
{
    FileInfo merged;
    merged._count = left._count + right._count;
    return merged;
}
// Orders two records by occurrence count only; byte value and code are ignored.
bool operator<(const FileInfo& left, const FileInfo& right)
{
    return right._count > left._count;
}
文件压缩的类
// Huffman-based file compressor / decompressor.
class CompressedFile
{
public:
    // Labels each of the 256 table slots with the byte value it stands for.
    CompressedFile()
    {
        for (size_t slot = 0; slot != 256; ++slot)
        {
            _FileInfo[slot]._ch = static_cast<unsigned char>(slot);
        }
    }

    // Compresses the file named by readname.
    void Compressed(string& readname);

    // Decompresses the file named by readname.
    void UnCompressed(const string& readname);

private:
    // One record per byte value a file can contain (256 in total), indexed by
    // that byte value.
    FileInfo _FileInfo[256];
};
压缩文件
void Compressed(string& readname)
{
FILE* readfile = fopen(readname.c_str(), "rb");
assert(readfile);
unsigned char* readstr = new unsigned char[1024];
size_t readcount = 0;
readcount = fread(readstr, 1, 1024, readfile); //每次读1kb的数据减少不必要的I/O次数一提高效率
while (readcount != 0)
{
for (size_t i = 0; i < readcount; i++)
{
_FileInfo[readstr[i]]._count++; //统计每个字符出现的次数
}
readcount = fread(readstr, 1, 1024, readfile);
}
FileInfo invalue;
HuffmanTree<FileInfo> ht(_FileInfo, 256, invalue); //建立Huffman树
Coding(ht); //建立Huffman编码
fseek(readfile, 0, SEEK_SET); //是文件指针回到文件头重新遍历文件
string FileHead = GetPost(readname); //将编码的信息与源文件的后缀写在文件头
string CompressedFileName = readname.substr(0, readname.find_last_of('.'));
CompressedFileName += ".huf";
FileHead += '\n';
size_t valuecount = 0;
string strvalue;
for (size_t i = 0; i < 256; i++)
{
if (_FileInfo[i]._count)
{
string ptmp;
char strptmp[20] = { 0 };
ptmp += _FileInfo[i]._ch;
ptmp += ',';
_i64toa(_FileInfo[i]._count, strptmp, 10);
ptmp += strptmp;
ptmp += '\n';
strvalue += ptmp;
valuecount++;
}
}
char str[10] = { 0 };
_itoa(valuecount, str, 10);
FileHead += str;
FileHead += '\n';
FileHead += strvalue;
FILE* writefile = fopen(CompressedFileName.c_str(), "wb");
assert(writefile);
fwrite(FileHead.c_str(),1,FileHead.length(),writefile);
char* writestr = new char[1024];
size_t writecount = 0;
size_t idx = 0; //一次写八位用idx来标记
char value = 0; //要写入的值
readcount = fread(readstr, 1, 1024, readfile);
HuffmanNode<FileInfo>* _root = ht.GetRoot();
long long weight = _root->_weight._count / 1024;
long long arv = weight / 100;
long long k = 0;
char __arr[102] = { 0 };
while (readcount)
{
k++;
if (k == arv)
{
cout << '*'; //模拟实现简单的进度条
k = 0;
}
for (size_t i = 0; i < readcount; i++)
{
string coding = _FileInfo[readstr[i]].coding;
for (size_t j = 0; j < coding.length(); ++j)
{
value <<= 1;
if (coding[j] == '1')
value |= 1;
if (++idx == 8) //将idx value 清零
{
writestr[writecount++] = value;
if (writecount == 1024)
{
fwrite(writestr, 1, 1024, writefile);
writecount = 0;
}
idx = 0;
value = 0;
}
}
}
readcount = fread(readstr, 1, 1024, readfile);
}
if (idx) //循环退出时该字节没有写满将其高位补齐
{
value <<= (8 - idx);
writestr[writecount++] = value;
}
if (writecount)
{
fwrite(writestr, 1, writecount, writefile);
}
delete[] readstr;
delete[] writestr;
fclose(readfile);
fclose(writefile);
}
// Derives the Huffman code of every leaf in ht and prints how many leaves
// (distinct byte values) were found.
void Coding(HuffmanTree<FileInfo>& ht)
{
    size_t leafTotal = 0;
    GetLeaf(ht.GetRoot(), leafTotal); // only leaf nodes receive a code
    cout << "有效字符个数为count = " << leafTotal << endl;
}
// Walks the subtree rooted at root and stores, for every leaf, its Huffman
// code in _FileInfo[<leaf byte>].coding.
//
// root:  subtree to process; NULL is allowed and does nothing.
// count: incremented once for every leaf visited.
void GetLeaf(HuffmanNode<FileInfo>* root, size_t& count)
{
    if (root == NULL)
    {
        return;
    }
    GetLeaf(root->_pLeft, count);
    GetLeaf(root->_pRight, count);
    if (root->_pLeft != NULL || root->_pRight != NULL)
    {
        return; // inner node: nothing to encode
    }

    count++;
    string& coding = _FileInfo[root->_weight._ch].coding;
    coding.clear(); // drop any code left over from a previous run

    // Record the path from the leaf up to the root:
    // '0' for a left child, '1' for a right child.
    HuffmanNode<FileInfo>* child = root;
    for (HuffmanNode<FileInfo>* parent = child->_pParent; parent != NULL;
         child = parent, parent = parent->_pParent)
    {
        coding += (child == parent->_pLeft) ? '0' : '1';
    }
    // The path was collected leaf-to-root, but a code is read root-to-leaf.
    reverse(coding.begin(), coding.end());

    // A leaf without a parent is the whole tree (input with a single distinct
    // byte value). Give it a one-bit code so the payload is not empty.
    if (coding.empty())
    {
        coding = "0";
    }
}
解压文件
void UnCompressed(const string& readname)
{
FILE* readfile = fopen(readname.c_str(), "rb");
assert(readfile);
string writename = readname.substr(0, readname.find_last_of('.')); //首先读到源文件的后缀
string strptmmp;
GetLine(readfile, strptmmp);
if (strptmmp.length())
{
writename += strptmmp;
}
int num = 0;
strptmmp = "";
GetLine(readfile,strptmmp);
if (strptmmp.length())
{
num = atoi(strptmmp.c_str());
}
for (int i = 0; i < num; ++i)
{
strptmmp = "";
GetLine(readfile,strptmmp);
unsigned char ch = strptmmp[0]; //必须强制装换为无符号型不然出现负数,负数作为下标会导致程序崩溃,但是本程序会在这里奔溃会在本函数退出是崩溃调试了好长时间都没有注意到
_FileInfo[ch]._count = atoi(strptmmp.c_str()+2); //获取源文件里每个字符出现的次数
}
FILE* writefile = fopen(writename.c_str(), "wb");
assert(writefile);
FileInfo invalue;
HuffmanTree<FileInfo> ht(_FileInfo, 256, invalue); //根据读到的源文件里的每个字符出现的次数重建Huffman树
HuffmanNode<FileInfo>* _root = ht.GetRoot();
unsigned char* readstr = new unsigned char[1024];
unsigned char* writestr = new unsigned char[1024];
size_t writecount = 0;
unsigned char ch;
int pos = 7; //标记该字符的每一位对其每一位进行处理
long long filesize = _root->_weight._count;
HuffmanNode<FileInfo> * root = _root;
size_t readcount = fread(readstr, 1, 1024, readfile);
while (readcount)
{
for (size_t i = 0; i < readcount;)
{
ch = readstr[i];
while (NULL != root->_pLeft || NULL != root->_pRight) //找到叶子节点其中的字符就是要写入的字符
{
if (ch & (1 << pos--))
root = root->_pRight;
else
root = root->_pLeft;
if (pos < 0)
{
pos = 7;
ch = readstr[++i];
break;
}
}
if (NULL == root->_pLeft && NULL == root->_pRight)
{
writestr[writecount++] = root->_weight._ch;
filesize--;
root = _root;
if (0 == filesize) //已经写入了和源文件大小相等的字符数退出该程序
{
fwrite(writestr, 1, writecount, writefile);
return;
}
if (1024 == writecount)
{
fwrite(writestr, 1, 1024, writefile);
writecount = 0;
}
}
}
readcount = fread(readstr, 1, 1024, readfile);
}
}
void GetLine(FILE* fp,string& line) //一次读取一行
{
char first;
if (!feof(fp))
{
first = fgetc(fp);
line += first;
}
while (!feof(fp))
{
char ch = fgetc(fp);
if (ch != '\n')
{
line += ch;
}
else
{
break;
}
}