哈夫曼编码是一种被广泛应用而且非常有效的无损数据压缩技术,它是一种特殊类型的前缀编码,并且是变长编码方式。哈夫曼编码是David A. Huffman在读博士时开发的算法。作为麻省理工学院的学生,他于1952年发表题为“构建最小冗余码的方法”的论文。尽管哈夫曼编码这几个字不常出现在我们的日常生活中,但是它与LZ77共同组成的DEFLATE压缩算法被zip压缩文件所使用,而zip压缩文件在生活中的许多地方起到了非常重要的作用。无论是Mac OS,Unix还是Windows系统都对zip压缩文件有原生的支持。数据在网络中传输时也常使用基于DEFLATE的压缩算法(例如gzip)。当今使用广泛的PNG,JPEG,WebP图像格式,所使用的压缩算法也包含了哈夫曼编码方法。哈夫曼编码给我们提供了一个简单有效的压缩数据的方式,在现实中使用广泛。
以下是具体代码。
//coder.cpp
#include <stdlib.h>
#include <stdio.h>
#include "Coder.h"
/*
 * Build a Huffman tree and the Huffman code of every input symbol.
 *
 * HT : receives the node array. It has 2*n-1 nodes stored in slots 1..2*n-1;
 *      slots 1..n are the leaves, in the same order as the weights in w.
 *      Slot 0 is not a tree node; its weight field stores the node count.
 * HC : receives an array of NUL-terminated '0'/'1' strings. HC[i] is the code
 *      of leaf i (1..n). HC[0] is not a string: it stores n cast to a pointer,
 *      which DestroyTree reads back to know how many strings to free.
 * w  : the n symbol weights.
 * n  : number of symbols to encode.
 *
 * When n <= 1 the function returns without touching HT or HC.
 * When an allocation fails, everything allocated so far is released and both
 * HT and HC are set to 0.
 */
void BuildHuffmanTree(HuffmanTree &HT,
                      HuffmanCode &HC,
                      unsigned int *w,
                      unsigned int n)
{
    if (n <= 1) return;

    unsigned int m = 2 * n - 1; // total number of nodes in the finished tree
    HT = (HuffmanTree)malloc((m + 1) * sizeof(HTNode)); // slot 0 is not a node
    if (!HT) {
        HC = 0;
        return;
    }
    HT->weight = m;

    // Initial state: n parentless leaves carrying the input weights, followed
    // by n-1 empty internal nodes.
    unsigned int i;
    for (i = 1; i <= m; ++i) {
        HT[i].weight = (i <= n) ? w[i - 1] : 0;
        HT[i].parent = 0;
        HT[i].lChild = 0;
        HT[i].rChild = 0;
    }

    // Repeatedly merge the two lightest parentless nodes into new node i.
    for (i = n + 1; i <= m; ++i) {
        unsigned int s1, s2;
        // Select treats its bound as exclusive (it compares j < bound), so
        // passing i makes it consider every existing node 1..i-1. Passing
        // i - 1 would leave node i-1 out of the weight comparison and could
        // produce a non-optimal tree.
        Select(HT, i, s1, s2);
        HT[s1].parent = HT[s2].parent = i;
        HT[i].lChild = s1;
        HT[i].rChild = s2;
        HT[i].weight = HT[s1].weight + HT[s2].weight;
    }

    // ---- derive each symbol's code by walking from its leaf up to the root ----
    HC = (HuffmanCode)malloc((n + 1) * sizeof(char *)); // slot 0 is not a code
    char *cd = (char *)malloc(n * sizeof(char));        // a code has at most n-1 bits
    if (!HC || !cd) {
        free(cd);
        free(HC);
        free(HT);
        HC = 0;
        HT = 0;
        return;
    }
    HC[0] = (char *)(size_t)n; // reuse the free slot 0 to remember the code count
    cd[n - 1] = '\0';

    for (i = 1; i <= n; ++i) {
        // The code is produced back to front, so fill cd from the terminator
        // towards the beginning.
        unsigned int start = n - 1;
        for (unsigned int c = i, f = HT[i].parent; f != 0; c = f, f = HT[f].parent) {
            cd[--start] = (HT[f].lChild == c) ? '0' : '1';
        }

        HC[i] = (char *)malloc((n - start) * sizeof(char)); // bits plus the NUL
        if (!HC[i]) {
            for (unsigned int k = 1; k < i; ++k) {
                free(HC[k]);
            }
            free(HC);
            free(cd);
            free(HT);
            HC = 0;
            HT = 0;
            return;
        }
        char *d = cd + start;
        strcpy(HC[i], d);
    }
    free(cd);
}
/*
 * Pick the two lightest parentless nodes of HT.
 *
 * HT : node array; a node whose parent field is 0 counts as parentless.
 * i  : upper bound of the weight comparison; candidates j satisfy j < i.
 * s1 : receives the index chosen first (the lightest candidate).
 * s2 : receives the index chosen second.
 *
 * Each pick starts from the first parentless node found by scanning upward
 * from index 1. That scan is not limited by i, so the caller must guarantee
 * that a parentless node exists within the array.
 */
void Select(HuffmanTree &HT, unsigned int i, unsigned int &s1, unsigned int &s2)
{
    // Seed with the first parentless node.
    unsigned int best = 1;
    while (HT[best].parent != 0) {
        ++best;
    }

    for (int round = 0; round < 2; ++round) {
        for (unsigned int cand = best + 1; cand < i; ++cand) {
            if (HT[cand].parent != 0) {
                continue;
            }
            if (HT[cand].weight < HT[best].weight) {
                best = cand;
            }
        }

        // Temporarily give the winner a non-zero parent so that the next
        // round skips it; any non-zero value would do.
        HT[best].parent = 1;
        (round == 0 ? s1 : s2) = best;

        // Re-seed with the first node that is still parentless.
        for (best = 1; HT[best].parent != 0; ++best) {
        }
    }

    // Undo the temporary marks.
    HT[s1].parent = HT[s2].parent = 0;
}
/*
 * Copy the NUL-terminated string sou into des, terminator included.
 * Neither pointer is modified.
 * Returns the number of characters written, counting the terminator.
 */
size_t strcpy( char *& des, //destination string
char *& sou //source string
)
{
    size_t copied = 0;
    for (;;) {
        const char ch = sou[copied];
        des[copied] = ch;
        ++copied;
        if (ch == '\0') {
            break;
        }
    }
    return copied;
}
/*
 * Walk the Huffman tree along the bits of s until a leaf is reached.
 *
 * HT : node array of the tree.
 * s  : string of '0'/'1' characters; advanced past every character consumed,
 *      so after a successful call it points just behind the decoded code.
 * n  : index of the node to start from.
 *
 * Returns the index of the leaf that was reached. Returns ERROR after
 * printing a message to stderr when the string ends (NUL or '\n') before a
 * leaf is reached, or when a character other than '0'/'1' is met; the
 * offending character has already been consumed in the latter case.
 */
unsigned int SearchTree( HuffmanTree &HT, //Huffman tree
char *&s, //bit string
unsigned int n //index of the starting node
)
{
    unsigned int node = n;

    for (;;) {
        // A node without children is a leaf: decoding of this symbol is done.
        if (HT[node].lChild + HT[node].rChild == 0) {
            return node;
        }

        const char bit = *s;
        if (bit == '\0' || bit == '\n') { // input ended in the middle of a code
            fprintf(stderr, "\n输入的编码不完整,请检查输入是否正确。\n");
            return ERROR;
        }

        ++s; // consume the character before descending
        if (bit == '0') {
            node = HT[node].lChild;
        }
        else if (bit == '1') {
            node = HT[node].rChild;
        }
        else {
            fprintf(stderr, "\n发现非法编码值。\n");
            return ERROR;
        }
    }
}
/*
 * Release the memory allocated while building the Huffman tree.
 *
 * HT : node array; freed and reset to 0.
 * HC : code array; HC[0] holds the number of code strings (stored as a
 *      pointer-sized integer), HC[1..count] are freed one by one, then the
 *      array itself is freed and HC is reset to 0.
 *
 * Either argument may already be 0.
 */
void DestroyTree(HuffmanTree &HT, HuffmanCode &HC)
{
    if (HC) {
        // Read the count back through a pointer-sized integer: casting a
        // pointer straight to int is rejected by compilers on 64-bit targets.
        const size_t count = (size_t)HC[0];
        for (size_t i = 1; i <= count; i++) {
            free(HC[i]);
        }
        free(HC);
        HC = 0;
    }
    free(HT);
    HT = 0;
}
//coder.h
#pragma once
#include <stdlib.h>
// Failure value returned by SearchTree. Index 0 of the node array is not a
// tree node, so it cannot be confused with a decoded leaf index.
#define ERROR (0)
// One node of a Huffman tree. Nodes live in an array and refer to each other
// by index; an index of 0 means "none".
typedef struct {
unsigned int weight; //weight
unsigned int parent; //index of the parent node
unsigned int lChild; //index of the left child node
unsigned int rChild; //index of the right child node
} HTNode, *HuffmanTree;
// Array of code strings; entry i (i >= 1) is the '0'/'1' code of leaf i.
typedef char ** HuffmanCode;
// Builds the tree HT and the code table HC from the n weights in w.
void BuildHuffmanTree(HuffmanTree &HT, HuffmanCode &HC, unsigned int * w, unsigned int n);
// Stores in s1 and s2 the indices of the two lightest parentless nodes below index i.
void Select(HuffmanTree & HT, unsigned int i, unsigned int &s1, unsigned int &s2);
// Copies the string sou to des and returns the number of characters written, terminator included.
size_t strcpy(char* &des, char* &sou);
// Follows the bits of s from node n down to a leaf; returns the leaf index or ERROR, advancing s.
unsigned int SearchTree(HuffmanTree &HT, char *&s, unsigned int n);
// Frees the tree and the code table and resets both to 0.
void DestroyTree(HuffmanTree &HT, HuffmanCode &HC);
// Huffman compress.cpp: 定义控制台应用程序的入口点。
//
#include <stdlib.h>
#include <stdio.h>
#include "Coder.h"
int main()
{
const int bufferSize = 1000;
unsigned int maxSize = 100;
unsigned int n = 0, n1 = 0, //待编码的字符数量
*d = 0; //字符的权值数组
int weight; //权值
char *data = 0, //待编码字符数组
buf[bufferSize]; //控制台读入字符串缓冲区
d = (unsigned int *)malloc(sizeof(unsigned int) * maxSize);
data = (char *)malloc(sizeof(char) * maxSize);
system("title 哈夫曼编码译码器演示程序");
printf("\n\t\t哈夫曼编码译码器\n本程序仅支持对单个字符的编码,且字符数量限制为100个以内.\n\n"
"请输入字符及其对应的正整数权值,单独一个回车行结束整个输入。\n例如:\nA 2\nB 3 C 4 d 5\ne 6 \n\n--------------------\n");
//从控制台窗口接受字符及其对应的权值
for (;;) {
char c;
char s[2];
int match_len; //匹配的个数
putc('>', stdout);
fgets(buf, 20, stdin);
if ('\n' == buf[0])
if (n <= 1) {
printf("请至少输入两组数据。\n");
continue;
}
else
break;
unsigned int in = 0, ind;
while ((match_len = sscanf(buf + in, "%1s %d%n", &s, &weight, &ind)) == 2 && s[0] > ' ' && weight > 0) {
c = s[0];
in += ind;
int i = 0;
while (data[i] != c && data[i]) i++;
if (data[i]) {
fprintf(stdout, "\n一个字符仅可有一个权值!\n");
buf[in] = 'Z';
break;
}
d[n] = (unsigned int)weight;
data[n] = c;
if (++n >= maxSize) {
maxSize += 100;
void * p = 0, *q = 0;
p = realloc(d, maxSize);
q = realloc(data, maxSize);
if (!p && !q) {
d = (unsigned int *)p;
data = (char *)q;
}
else {
fprintf(stdout, "\n输入达上限!\n");
system("pause");
return 0;
}
}
}
unsigned int ii = in;
while (buf[ii] <= 32 && buf[ii] > 0)ii++;
if (ii == 0 || buf[ii] != '\0') { n = n1; printf("%s", "输入错误,请重新输入。\n"); continue; }
n1 = n;
}
HuffmanTree HT;
HuffmanCode HC;
//任务一 构造哈夫曼树
BuildHuffmanTree(HT, HC, d, n);
//任务二 输出哈夫曼编码
printf("\n%s\t%s\n", "字符", "哈夫曼编码");
for (unsigned int i = 1; i <= n; i++) {
printf("%c\t%s\n", data[i - 1], HC[i]);
}
putc('\n', stdout);
//任务三 翻译哈夫曼编码
//char buf2[1001];
char *p = buf;
char *q = buf;
unsigned int result = 1;
for (;;) {
printf("\n请输入待译码的字符串:\n>");
fgets(buf, bufferSize, stdin);
if ('\n' == buf[0]){
char c[22];
printf("确定要退出吗?(y/n)");
fgets(c, 20, stdin);
if(c[0] == 'y')
break;
c[0] = '\0';
continue;
}
//putc('>', stdout);
while (p - buf < bufferSize + 1) {
result = SearchTree(HT, p, 2 * n - 1);
if (result != ERROR) {
*q++ = data[result - 1];
//printf("%c", data[result - 1]);
/*if (*p == '\n') {
putc('\n\n', stdout);
break;
}
*/
if (*p == '\n') {
*q = '\0';
printf("原字符串为:\n%s\n\n", buf);
break;
}
}
else break;
}
p = q = buf;
}
DestroyTree(HT, HC);
system("pause");
return 0;
}
参考资料:
[1] 严蔚敏,吴伟民.数据结构.北京:清华大学出版社,2008
[2] 苏仕华.数据结构课程设计.北京:机械工业出版社,2010
[3] Brian W. Kernighan,Dennis M. Ritchie著,徐宝文,李志 译,C程序设计语言.北京:机械工业出版社,2004
[4] https://en.wikipedia.org/wiki/Huffman_coding