Huffman编码是最流行的基于统计学的数据压缩方法,下面我们简单介绍它的实现步骤:
1. 将信源符号按照概率递减顺序排列;
2. 取两个概率最小的符号分别分配以“0”和“1”,然后把它们的概率相加,并作为一个新的符号的概率,与其它未分配符号按照(1)重新排列;
3. 重复(1)(2)过程,直至所有概率相加得1;
4. 寻找从每一个信源符号到概率为1处的路径,记录下路径上的“0”和“1”;
5. 写出每一个符号的“0”和“1”序列(从树根到信源符号节点)。
缺点:
从理论上讲,采用Huffman编码可以获得最佳编码效果,但是在实际中,由于计算机中存储和处理的最小数据单位是比特(bit),因此在某些情况下,实际的压缩编码效果往往达不到理论的压缩比。例如:信源符号{X, Y},其对应的概率为{2/3, 1/3},则根据理论计算(概率为p的符号的最佳码长为 $-\log_2 p$),符号X, Y的最佳码长分别是:X: $-\log_2(2/3) \approx 0.585$ bit,Y: $-\log_2(1/3) \approx 1.585$ bit。
这表明,要获得最佳效果,符号{X, Y}的码字长度应分别为0.585 bit和1.585 bit,而计算机不可能有非整数位出现,只能按整数位进行,即采用哈夫曼编码对{X,Y}进行编码,得{X,Y}的码字分别为0和1,也就是两符号的信息编码长度都为1。可见,对于大概率符号X并未赋予较短的码字,实际编码效果没有达到理论编码效果。由上述分析可见,Huffman编码的主要缺点在于其编码方法是对每个符号进行编码,每个符号的码长只能是整数。为此提出算术编码,以解决计算机中必须以整数位进行编码的问题。
编码实现:
头文件
// Huffman.h: interface for the Huffman class.
//
//
#if !defined(AFX_HUFFMAN_H__08A1863A_6641_4FE9_9596_5EEBE76B53F7__INCLUDED_)
#define AFX_HUFFMAN_H__08A1863A_6641_4FE9_9596_5EEBE76B53F7__INCLUDED_
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
#include <string>
/***********************数据结构***********************/
// One node of the Huffman tree. The tree is kept in an array and nodes refer
// to each other by array index; the code in this file treats index 0 as
// "no node" (slot 0 of the array is left unused).
struct HuffTreeNode
{
    unsigned int weight;   // weight of the symbol (leaf) or sum of both children (inner node)
    unsigned int parent;   // index of the parent node, 0 while the node has no parent
    unsigned int lchild;   // index of the left child, 0 for a leaf
    unsigned int rchild;   // index of the right child, 0 for a leaf
};

// Pointer to the first element of a HuffTreeNode array.
typedef HuffTreeNode *HuffTree;
// One entry of the symbol table: a character, its weight, and a pointer to
// its code as a NUL-terminated string of '0'/'1' characters.
struct CharMapNode
{
    char c;                // the symbol itself
    unsigned int weight;   // weight (occurrence count or user-supplied value)
    char *code;            // code string; not set until a code table has been built
};

// Pointer to the first element of a CharMapNode array.
typedef CharMapNode *CharMap;
/*************************类定义****************************/
// Huffman coder that works on an in-memory text string and an in-memory
// code string made of '0'/'1' characters.
class Huffman
{
public:
    Huffman();
    virtual ~Huffman();

    // --- symbol table -------------------------------------------------
    void SetCharMap(CharMap m, int number);   // adopt an externally built table
    void CountCharsWeight();                  // derive symbols and weights from the text
    void InputCharsWeight();                  // read symbols and weights from standard input
    void MakeCharMap();                       // build the tree and the per-symbol codes

    // --- coding -------------------------------------------------------
    void Encode();                            // text -> code
    void Decode();                            // code -> text

    // --- file I/O -----------------------------------------------------
    void ReadTextFromFile(char *filename);
    void ReadCodeFromFile(char *filename);
    void SaveTextToFile(char *filename);
    void SaveCodeToFile(char *filename);

    // --- console output -----------------------------------------------
    void PrintText();
    void PrintCode();
    void PrintCharWeight();
    void PrintCharCode();

private:
    // Picks the two parentless nodes of smallest weight among huffTree[1..n].
    void select(int n, int &s1, int &s2);

    HuffTree huffTree;   // Huffman tree (array of nodes)
    CharMap chars;       // symbol table
    int n;               // number of symbols in the table
    std::string text;    // plain text
    std::string code;    // encoded text as '0'/'1' characters
};
#endif // !defined(AFX_HUFFMAN_H__08A1863A_6641_4FE9_9596_5EEBE76B53F7__INCLUDED_)
Huffman类
// Huffman.cpp: implementation of the Huffman class.
//
//
#include "Huffman.h"
#include <iostream>
#include <fstream>
using namespace std;
//
// Construction/Destruction
//
// Creates an empty coder: no tree, no symbol table, zero symbols.
// The text and code strings start out empty through their own constructors.
Huffman::Huffman()
    : huffTree(NULL),
      chars(NULL),
      n(0)
{
}
// Releases the Huffman tree array, which this class allocates with new[]
// in MakeCharMap.
//
// The symbol table (chars) and its code strings are deliberately NOT freed
// here: the table may have been handed in through SetCharMap and still be
// owned by the caller, and the code pointers are not guaranteed to be
// initialized.
//
// NOTE(review): the class has no copy constructor/assignment operator, so a
// copied Huffman object would share huffTree with its source. Nothing in this
// file copies a Huffman; keep it that way or add copy control.
Huffman::~Huffman()
{
    delete[] huffTree;   // delete[] on a null pointer is a no-op
    huffTree = NULL;
}
// Encodes the stored text: the code string is reset and then, for every
// character of the text, the code string of each table entry chars[1..n]
// holding that character is appended.
// The table's code strings must already be set before this is called.
void Huffman::Encode()
{
    code.clear();
    const std::string::size_type length = text.size();
    for (std::string::size_type pos = 0; pos < length; ++pos)
    {
        const char symbol = text[pos];
        int slot = 1;
        while (slot <= n)
        {
            if (symbol == chars[slot].c)
                code.append(chars[slot].code);   // code is a NUL-terminated C string
            ++slot;
        }
    }
}
// Installs an externally built symbol table.
// Only the pointer is stored, no copy is made. The other members of this
// class read entries m[1]..m[number]; m[0] is not used.
void Huffman::SetCharMap(CharMap m, int number)
{
    n = number;
    chars = m;
}
//在huffTree[1..n]中选择parent为0且weight最小的两个节点,其序号为s1,s2
void Huffman::select(int n, int &s1, int &s2)
{
s1 = s2 = 0;
for (int i = 1; i <= n; ++i)
{
if (huffTree[i].parent != 0)
continue;
if (s1 == 0)
s1 = i;
else if (s2 == 0)
{
//此处采用的策略,使得整个过程中s1的权值小于s2的权值
if (huffTree[i].weight < huffTree[s1].weight)
{
s2 = s1;
s1 = i;
}
else
s2 = i;
}
else
{
if (huffTree[i].weight < huffTree[s1].weight)
{
s2 = s1;
s1 = i;
}
else if (huffTree[i].weight < huffTree[s2].weight)
s2 = i;
}
}
}
void Huffman::PrintCharWeight()
{
for (int i = 1; i <= n; ++i)
{
/* switch (chars[i].c)
{
case '\t':
cout << "\\t";
break;
case '\n':
cout << "\\n";
break;
default:*/
cout << chars[i].c;
// break;
//}
cout << "——" << chars[i].weight << endl;
}
}
void Huffman::PrintCharCode()
{
for (int i = 1; i <= n; ++i)
{
/*switch (chars[i].c)
{
case '\t':
cout << "\\t";
break;
case '\n':
cout << "\\n";
break;
default:*/
cout << chars[i].c;
// break;
//}
cout << "——" << chars[i].code << endl;
}
}
// Writes the stored plain text to standard output, then ends the line and
// flushes the stream.
void Huffman::PrintText()
{
    std::cout << text;
    std::cout << std::endl;
}
// Writes the stored '0'/'1' code string to standard output, then ends the
// line and flushes the stream.
void Huffman::PrintCode()
{
    std::cout << code;
    std::cout << std::endl;
}
// Builds the Huffman tree from the weights in chars[1..n] and stores a newly
// allocated, NUL-terminated '0'/'1' code string in every chars[i].code.
//
// Tree layout: huffTree[1..n] are the leaves (one per symbol, same index as
// in chars), huffTree[n+1..2n-1] are the inner nodes, slot 0 is unused and
// index 0 means "no node". A left edge contributes '0', a right edge '1'.
//
// Special cases:
//  - no symbols or no table: nothing is done;
//  - exactly one symbol: it gets the one-bit code "0" (a tree with a single
//    leaf has no edges, so it would otherwise be left without any code).
//
// Code strings assigned by an earlier call are not freed here, because a
// table entry's code pointer is not guaranteed to be initialized before the
// first call.
void Huffman::MakeCharMap()
{
    if (n <= 0 || chars == NULL)
        return;

    // Drop the tree left over from a previous call instead of leaking it.
    delete[] huffTree;
    huffTree = NULL;

    if (n == 1)
    {
        char *single = new char[2];
        single[0] = '0';
        single[1] = '\0';
        chars[1].code = single;
        return;
    }

    const int m = 2 * n - 1;              // nodes needed for n leaves
    huffTree = new HuffTreeNode[m + 1];   // slot 0 is unused

    // Leaves carry the symbol weights; inner nodes start out with weight 0.
    for (int i = 1; i <= m; ++i)
    {
        huffTree[i].weight = (i <= n) ? chars[i].weight : 0;
        huffTree[i].parent = 0;
        huffTree[i].lchild = 0;
        huffTree[i].rchild = 0;
    }

    // Repeatedly join the two lightest parentless nodes under a new node i.
    for (int i = n + 1; i <= m; ++i)
    {
        int s1 = 0;
        int s2 = 0;
        select(i - 1, s1, s2);
        huffTree[s1].parent = i;
        huffTree[s2].parent = i;
        huffTree[i].lchild = s1;
        huffTree[i].rchild = s2;
        huffTree[i].weight = huffTree[s1].weight + huffTree[s2].weight;
    }

    // Walk from each leaf up to the root collecting edge bits. The walk
    // yields the bits in leaf-to-root order, so they are written out reversed.
    std::string leafToRoot;
    for (int i = 1; i <= n; ++i)
    {
        leafToRoot.clear();
        unsigned int child = static_cast<unsigned int>(i);
        for (unsigned int up = huffTree[child].parent; up != 0; up = huffTree[up].parent)
        {
            leafToRoot += (huffTree[up].lchild == child) ? '0' : '1';
            child = up;
        }

        const std::string::size_type len = leafToRoot.size();
        char *bits = new char[len + 1];   // +1 for the terminating NUL
        for (std::string::size_type k = 0; k < len; ++k)
            bits[k] = leafToRoot[len - 1 - k];
        bits[len] = '\0';
        chars[i].code = bits;
    }
}
// Loads the plain text from the named file, character by character
// (whitespace included), replacing any text held before.
// If the file cannot be opened, an error is written to standard error and the
// current text is left untouched.
void Huffman::ReadTextFromFile(char *filename)
{
    std::ifstream infile(filename);
    if (!infile)
    {
        std::cerr << "无法打开文件!" << std::endl;
        return;
    }

    // Start from an empty string so that a second load replaces the text
    // instead of appending to it.
    text.clear();
    char c;
    while (infile.get(c))
        text += c;
}
// Writes the stored '0'/'1' code string to the named file.
// If the file cannot be opened for writing, an error is written to standard
// error and nothing is saved.
void Huffman::SaveCodeToFile(char *filename)
{
    std::ofstream target(filename);
    if (target)
    {
        target << code;
        return;
    }
    std::cerr << "保存文件出错!" << std::endl;
}
// Loads the code string from the named file. The read uses formatted string
// extraction, so it takes one whitespace-delimited token.
// If the file cannot be opened, an error is written to standard error and the
// current code string is left untouched.
void Huffman::ReadCodeFromFile(char *filename)
{
    std::ifstream source(filename);
    if (source)
    {
        source >> code;
        return;
    }
    std::cerr << "无法打开文件!" << std::endl;
}
// Decodes the stored '0'/'1' code string back into text, using the code
// strings of the symbol table chars[1..n].
//
// At each position the table entry whose code matches the upcoming bits is
// taken. If several entries match (possible only with a table that is not
// prefix-free), the shortest code wins and, among equal lengths, the entry
// with the lowest index. Entries with a null or empty code never match.
//
// If no entry matches at some position, decoding stops there, an error is
// written to standard error, and the text decoded so far is kept.
void Huffman::Decode()
{
    text = "";
    if (chars == NULL || n <= 0)
        return;

    std::string::size_type pos = 0;
    while (pos < code.size())
    {
        int match = 0;                        // table index of the best match, 0 = none
        std::string::size_type matchLen = 0;  // length of that entry's code

        for (int j = 1; j <= n; ++j)
        {
            const char *candidate = chars[j].code;
            if (candidate == NULL)
                continue;
            const std::string::size_type len = std::char_traits<char>::length(candidate);
            if (len == 0)
                continue;
            if (match != 0 && len >= matchLen)
                continue;   // an equally short or shorter match is already known

            // compare() clamps the substring to the end of the code string,
            // so a candidate longer than the remaining bits cannot match.
            if (code.compare(pos, len, candidate) == 0)
            {
                match = j;
                matchLen = len;
            }
        }

        if (match == 0)
        {
            std::cerr << "解码出错:编码串第" << pos << "位起无法匹配任何字符编码!" << std::endl;
            return;
        }

        text += chars[match].c;
        pos += matchLen;
    }
}
//统计原文中各字符的权值
void Huffman::CountCharsWeight()
{
if (text.empty())
return;
if (chars != NULL)
delete chars;
int i = 0;
n = 0;
chars = new CharMapNode[2];
chars[1].c = text[i];
chars[1].weight = 1;
++n;
for (i = 1; i != text.size(); ++i)
{
int j;
for (j = 1; j <= n; ++j) //遍历当前字符表,如果已存在该字符,权值+1
{
if (text[i] == chars[j].c)
{
++chars[j].weight;
break;
}
}
if (j > n) //该字符不存在,添加该字符
{
++n;
CharMap newchars = new CharMapNode[n + 1];
memcpy(newchars, chars, n * sizeof(CharMapNode));
delete chars;
chars = newchars;
chars[n].c = text[i];
chars[n].weight = 1;
}
}
}
//输入字符和对应权值
void Huffman::InputCharsWeight()
{
cout << "请输入字符集大小n(n>1):" << endl;
cin >> n;
if (chars != NULL)
delete chars;
chars = new CharMapNode[n+1]; //0号单元未使用
cout << "请输入字符和权值:" << endl;
for (int i = 1; i <= n; ++i)
{
cin.ignore(); //清除输入缓冲区
cin.get(chars[i].c); //输入单个字符,可以是空白符
cin >> chars[i].weight;
}
}
// Writes the stored plain text to the named file.
// If the file cannot be opened for writing, an error is written to standard
// error and nothing is saved.
void Huffman::SaveTextToFile(char *filename)
{
    std::ofstream target(filename);
    if (target)
    {
        target << text;
        return;
    }
    std::cerr << "保存文件出错!" << std::endl;
}
主函数
#include <iostream>
#include "Huffman.h"
using namespace std;
int main()
{
Huffman huffman;
huffman.ReadTextFromFile("text.txt");
/****************第一步输入字符和对应权值******************/
cout << "请选择: 1.程序自动统计字符和权值(推荐) 2.用户输入" << endl;
int r;
do
{
cin >> r;
}
while((r != 1) && (r != 2));
if (r == 1)
huffman.CountCharsWeight();
else
huffman.InputCharsWeight();
cout << "字符及对应权值:" << endl;
huffman.PrintCharWeight(); //计算每个字符对应的权值
system("pause");
cout << endl;
/****************第二步建哈弗曼树,输出字符与编码的对应关系******************/
huffman.MakeCharMap(); //实现哈弗曼编码,对应存入chars[i].c-chars[i].code
cout << "字符及对应的编码:" << endl;
huffman.PrintCharCode(); //打印每个字符及其对应的编码,即chars[i].c-chars[i].code
system("pause");
cout << endl;
/****************第三步对字符进行编码,将结果输出并存入文件******************/
cout << "对原文进行编码:" << endl;
cout << "原文:" << endl;
huffman.PrintText(); //输出文本串
huffman.Encode(); //对文本串进行编码
cout << "编码:" << endl;
huffman.PrintCode();
huffman.SaveCodeToFile("code.txt");
system("pause");
cout << endl;
/****************第四步从文件读入0、1代码串解码后输出并存入文件******************/
cout << "对编码进行解码:" << endl;
huffman.ReadCodeFromFile("code.txt");
cout << "编码:" << endl;
huffman.PrintCode();
huffman.Decode();
cout << "原文:" << endl;
huffman.PrintText();
huffman.SaveTextToFile("resulttext.txt");
cout << "\n Over ^_^" << endl;
system("pause");
return 0;
}
输出界面