原理
分析
- 首先制作一个输入文件,文件中包含很多字符。程序以输入文件中各字符出现的频率不同作为huffman树生成的依据
- 统计字符出现的概率。此处有两个属性:字符名和出现次数,可以使用结构体进行设计
- 构建huffman树
- 存储huffman码表和编码后的文件
难点主要是所有字符的排序、huffman树的生成以及码字的分配
统计完字符的出现次数后,需要按出现次数对其进行排序。如果使用的是C++,可以用STL提供的数据结构——优先级队列来实现;而使用C语言则需要自己编写排序算法。
输入文件
Shall I compare thee to a summer's day?
Thou art more lovely and more temperate:
Rough winds do shake the darling buds of
May,
And summer's lease hath all too short a
date:
Sometime too hot the eye of heaven shines,
And often is his gold complexion dimm'd;
And every fair from fair sometime declines,
By chance, or nature's changing course,
untrimm'd;
But thy eternal summer shall not fade
Nor lose possession of that fair thou ow'st;
Nor shall Death brag thou wander'st in his
shade,
When in eternal lines to time thou grow'st;
So long as men can breathe or eyes can
see,
So long lives this, and this gives life to thee.
字符出现次数排序功能
.C
#include "config.h"
LUX_HUFFUMAN_DataPoint_ST dataPoint;
LUX_HUFFUMAN_FilePoint_ST filePoint;
LUX_HUFFUMAN_CharFreq_ST charFreq[126] = {{0,0}};
struct stat getSize_ST;
/**
 * @description: Initialisation. Opens LUX_HUFFUMAN_InputFile, obtains its size
 *               with stat() (kept in the global getSize_ST) and reads the
 *               whole file into a freshly allocated dataPoint.fileData.
 * @param [in] void
 * @return LUX_SUCCESS when the complete file was loaded, LUX_FAILED otherwise.
 *         On failure the stream and the buffer acquired by this call are
 *         released and filePoint.inputFile / dataPoint.fileData are NULL.
 */
int LUX_HUFFUMAN_Init(void)
{
    int ret = LUX_FAILED;
    int fileSize = 0;
    size_t len = 0;
    char *data = NULL;

    do
    {
        /*inputfile*/
        filePoint.inputFile = fopen(LUX_HUFFUMAN_InputFile, "r");
        if (NULL == filePoint.inputFile)
        {
            printf("fopen LUX_HUFFUMAN_InputFile failed\n");
            perror("LUX_HUFFUMAN_InputFile");
            break;
        }
        /*get bytes of file*/
        if (-1 == stat(LUX_HUFFUMAN_InputFile, &getSize_ST))
        {
            perror("get file size failed");
            break;
        }
        fileSize = (int)getSize_ST.st_size;
        /* The size is handled as an int everywhere: reject empty files and
         * sizes that do not survive the conversion. */
        if (fileSize <= 0 || getSize_ST.st_size != fileSize)
        {
            printf("LUX_HUFFUMAN_InputFile is empty or too large\n");
            break;
        }
        /*store file*/
        data = malloc((size_t)fileSize);
        dataPoint.fileData = data;
        if (NULL == data)
        {
            printf("malloc dataPoint.fileData failed\n");
            perror("dataPoint.fileData");
            break;
        }
        len = fread(data, 1, (size_t)fileSize, filePoint.inputFile);
        /* A short read would leave the tail of the buffer uninitialised while
         * LUX_HUFFUMAN_GetInfo() still walks st_size bytes. */
        if (len != (size_t)fileSize)
        {
            printf("fread dataPoint.fileData failed\n");
            perror("why");
            break;
        }
        ret = LUX_SUCCESS;
    } while (0);

    if (LUX_SUCCESS != ret)
    {
        /* Undo only what this call acquired. */
        if (NULL != data)
        {
            free(data);
            dataPoint.fileData = NULL;
        }
        if (NULL != filePoint.inputFile)
        {
            fclose(filePoint.inputFile);
            filePoint.inputFile = NULL;
        }
    }
    return ret;
}
/**
 * @description: Sort character/frequency records in place by ascending
 *               frequency (bubble sort; records with equal frequency keep
 *               their relative order).
 * @param [in,out] arr first element of the array to sort
 * @param [in] time    number of elements in arr (the caller passes the table
 *                     length)
 * @return LUX_SUCCESS, or LUX_FAILED when arr is NULL
 */
int LUX_HUFFUMAN_Sort(LUX_HUFFUMAN_CharFreq_ST *arr, int time)
{
    int i = 0;
    int j = 0;
    LUX_HUFFUMAN_CharFreq_ST tmp;

    if (NULL == arr)
    {
        return LUX_FAILED;
    }
    for (i = 0; i < time - 1; i++)
    {
        /* arr[j + 1] is read below, so j + 1 must stay inside the part that
         * is still unsorted; the previous bound (j < time - i) read one
         * element past the end of the array on the first pass. */
        for (j = 0; j + 1 < time - i; j++)
        {
            if (arr[j].frequency > arr[j + 1].frequency)
            {
                tmp = arr[j];
                arr[j] = arr[j + 1];
                arr[j + 1] = tmp;
            }
        }
    }
    return LUX_SUCCESS;
}
/**
* @description: 统计字符及出现的次数
* @param [in] void
* @return
*/
int LUX_HUFFUMAN_GetInfo(void)
{
int ret = LUX_SUCCESS;
int hufTreeCnt = 0;
char buf;
int i, j, k = 0;
int num, cnt= 0;
int fileSize = (int)getSize_ST.st_size;
/*get info*/
for (i = 0; i < fileSize; i++)
{
buf = *(dataPoint.fileData + i);
charFreq[buf].character = buf;
charFreq[buf].frequency++;
}
/*get huf tree node num*/
for (i = 0; i < NUM_OF_ASCII; i++)
{
if (charFreq[i].frequency > 0)
{
++hufTreeCnt;//41
/*test character frquency*/
//printf("node[%d] character[%c] frequency[%d]\n", hufTreeCnt - 1, charFreq[i].character, charFreq[i].frequency);
}
}
LUX_HUFFUMAN_Sort(charFreq, NUM_OF_ASCII);
/*sort*/
LUX_HUFFUMAN_CharFreq_ST sortChar[hufTreeCnt];
memset(sortChar, 0, hufTreeCnt * sizeof(LUX_HUFFUMAN_CharFreq_ST));
LUX_HUFFUMAN_CharFreq_ST tmp;
memset(&tmp, 0, sizeof(LUX_HUFFUMAN_CharFreq_ST));
/*restore*/
for (i = 0; i < NUM_OF_ASCII + 1; i++)
{
if (charFreq[i].frequency > 0)
{
sortChar[num].frequency = charFreq[i].frequency;
sortChar[num].character = charFreq[i].character;
num++;
}
}
//TODO:TEST
for (i = 0; i < hufTreeCnt; i++)
{
printf("node[%d] character[%c] frequency[%d]\n", i, sortChar[i].character, sortChar[i].frequency);
}
/*get left and right node*/
/*conbine the smallest tow into new node*/
/*give back the new node*/
/*resort*/
return ret;
}
/**
 * @description: De-initialisation. Closes the input stream and frees the data
 *               buffers, then resets the handles so that a second call (or a
 *               call after a failed initialisation) is harmless.
 * @param [in] void
 * @return LUX_SUCCESS always
 */
int LUX_HUFFUMAN_DeInit(void)
{
    /* fclose(NULL) is undefined behaviour, unlike free(NULL) */
    if (NULL != filePoint.inputFile)
    {
        fclose(filePoint.inputFile);
        filePoint.inputFile = NULL;
    }
    free(dataPoint.fileData);
    dataPoint.fileData = NULL;
    free(dataPoint.charData);
    dataPoint.charData = NULL;
    return LUX_SUCCESS;
}
/* Entry point: load the input file, print the character statistics, clean up. */
int main(void)
{
    int ret = LUX_HUFFUMAN_Init();

    if (LUX_SUCCESS != ret)
    {
        printf("LUX_HUFFUMAN_Init error\n");
        return LUX_FAILED;
    }
    ret = LUX_HUFFUMAN_GetInfo();
    if (LUX_SUCCESS != ret)
    {
        printf("LUX_HUFFUMAN_GetInfo error\n");
        /* Init succeeded, so its stream and buffer still have to be released */
        (void)LUX_HUFFUMAN_DeInit();
        return LUX_FAILED;
    }
    ret = LUX_HUFFUMAN_DeInit();
    if (LUX_SUCCESS != ret)
    {
        printf("LUX_HUFFUMAN_DeInit error\n");
        return LUX_FAILED;
    }
    return ret;
}
.H
#include <stdio.h>
#include <error.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
/* File names used by the tool. Only LUX_HUFFUMAN_InputFile is opened by the
 * accompanying .c file; the other two are reserved for the encoded output and
 * the code table. */
#define LUX_HUFFUMAN_InputFile "inputFile"
#define LUX_HUFFUMAN_EncodeFile "encodeFile"
#define LUX_HUFFUMAN_EncodeTable "huffumanTable"
/* Status codes returned by the LUX_HUFFUMAN_* functions. The negative value
 * is parenthesised so the macro is safe inside any expression. */
#define LUX_SUCCESS 0
#define LUX_FAILED (-1)
/* Number of entries in the character counter table (character codes 0..125). */
#define NUM_OF_ASCII 126
/* Data buffers shared by the LUX_HUFFUMAN_* functions. */
typedef struct
{
char *fileData; // raw bytes of the input file (allocated and filled by LUX_HUFFUMAN_Init)
char *charData; // information about all characters; only freed in LUX_HUFFUMAN_DeInit, never allocated by the code shown here
}LUX_HUFFUMAN_DataPoint_ST;
/* Stream handles used by the tool. */
typedef struct
{
FILE *inputFile; // input file
FILE *huffumanEncode; // file holding the Huffman-encoded output
FILE *huffumanTable; // file holding the Huffman code table
}LUX_HUFFUMAN_FilePoint_ST;
/* Node of a pointer-linked Huffman tree: a character, its frequency and the
 * left/right child pointers. */
typedef struct LUX_HUFFUMAN_HuffumanTree_ST
{
char character;
int frequency;
struct LUX_HUFFUMAN_HuffumanTree_ST* pLeft;
struct LUX_HUFFUMAN_HuffumanTree_ST* pRight;
}LUX_HUFFUMAN_HufTree_ST;
/* One character together with its occurrence count. */
typedef struct
{
char character; // the character
int frequency; // number of occurrences
}LUX_HUFFUMAN_CharFreq_ST;
/**
* @description: Initialisation: opens the input file and loads it into memory
* @param [in] void
* @return LUX_SUCCESS or LUX_FAILED
*/
int LUX_HUFFUMAN_Init(void);
/**
* @description: Count the characters of the input and how often each occurs
* @param [in] void
* @return LUX_SUCCESS or LUX_FAILED
*/
int LUX_HUFFUMAN_GetInfo(void);
/**
* @description: Sort records by ascending frequency
* @param [in] arr address of the first array element
* @param [in] time count that bounds the sort loops; the caller in the .c file passes the table length (NUM_OF_ASCII)
* @return LUX_SUCCESS
*/
int LUX_HUFFUMAN_Sort(LUX_HUFFUMAN_CharFreq_ST *arr, int time);
/**
* @description: De-initialisation: closes the input file and frees the buffers
* @param [in] void
* @return LUX_SUCCESS
*/
int LUX_HUFFUMAN_DeInit(void);
效果
打印出输入文件中字符的信息,接下来就是通过获取的信息进行二叉树的构建
思考
- 当直接使用数组构成霍夫曼树时,需要额外的位置存放,这就要求数组足够大,从而会造成资源的浪费。作为开发者,需要对自己代码性能非常敏感,平时写代码就要以高标准要求自己。而且,使用数组无法进行霍夫曼码字的分配
- 使用链表操作方便,即用即创建没有空间浪费,删除方便,分配码字方便
于是,接下来的步骤就是:创建一个链表,将排序好的数组信息存到链表中。创建huf tree,只需要删除两个最小的节点,再添加一个新节点即可;分配码字,只需要依次将链表重构,分配码字即可。
未完成的代码
.c
#include "config.h"
LUX_HUFFUMAN_DataPoint_ST dataPoint;
LUX_HUFFUMAN_FilePoint_ST filePoint;
LUX_HUFFUMAN_CharFreq_ST charFreq[126] = {{0,0}};
struct stat getSize_ST;
/**
 * @description: Initialisation. Opens LUX_HUFFUMAN_InputFile, obtains its size
 *               with stat() (kept in the global getSize_ST) and reads the
 *               whole file into a freshly allocated dataPoint.fileData.
 * @param [in] void
 * @return LUX_SUCCESS when the complete file was loaded, LUX_FAILED otherwise.
 *         On failure the stream and the buffer acquired by this call are
 *         released and filePoint.inputFile / dataPoint.fileData are NULL.
 */
int LUX_HUFFUMAN_Init(void)
{
    int ret = LUX_FAILED;
    int fileSize = 0;
    size_t len = 0;
    char *data = NULL;

    do
    {
        /*inputfile*/
        filePoint.inputFile = fopen(LUX_HUFFUMAN_InputFile, "r");
        if (NULL == filePoint.inputFile)
        {
            printf("fopen LUX_HUFFUMAN_InputFile failed\n");
            perror("LUX_HUFFUMAN_InputFile");
            break;
        }
        /*get bytes of file*/
        if (-1 == stat(LUX_HUFFUMAN_InputFile, &getSize_ST))
        {
            perror("get file size failed");
            break;
        }
        fileSize = (int)getSize_ST.st_size;
        /* The size is handled as an int everywhere: reject empty files and
         * sizes that do not survive the conversion. */
        if (fileSize <= 0 || getSize_ST.st_size != fileSize)
        {
            printf("LUX_HUFFUMAN_InputFile is empty or too large\n");
            break;
        }
        /*store file*/
        data = malloc((size_t)fileSize);
        dataPoint.fileData = data;
        if (NULL == data)
        {
            printf("malloc dataPoint.fileData failed\n");
            perror("dataPoint.fileData");
            break;
        }
        len = fread(data, 1, (size_t)fileSize, filePoint.inputFile);
        /* A short read would leave the tail of the buffer uninitialised while
         * LUX_HUFFUMAN_GetInfo() still walks st_size bytes. */
        if (len != (size_t)fileSize)
        {
            printf("fread dataPoint.fileData failed\n");
            perror("why");
            break;
        }
        ret = LUX_SUCCESS;
    } while (0);

    if (LUX_SUCCESS != ret)
    {
        /* Undo only what this call acquired. */
        if (NULL != data)
        {
            free(data);
            dataPoint.fileData = NULL;
        }
        if (NULL != filePoint.inputFile)
        {
            fclose(filePoint.inputFile);
            filePoint.inputFile = NULL;
        }
    }
    return ret;
}
/**
 * @description: Sort character/frequency records in place by ascending
 *               frequency (bubble sort; records with equal frequency keep
 *               their relative order).
 * @param [in,out] arr first element of the array to sort
 * @param [in] time    number of elements in arr (the caller passes the table
 *                     length)
 * @return LUX_SUCCESS, or LUX_FAILED when arr is NULL
 */
int LUX_HUFFUMAN_Sort(LUX_HUFFUMAN_CharFreq_ST *arr, int time)
{
    int i = 0;
    int j = 0;
    LUX_HUFFUMAN_CharFreq_ST tmp;

    if (NULL == arr)
    {
        return LUX_FAILED;
    }
    for (i = 0; i < time - 1; i++)
    {
        /* arr[j + 1] is read below, so j + 1 must stay inside the part that
         * is still unsorted; the previous bound (j < time - i) read one
         * element past the end of the array on the first pass. */
        for (j = 0; j + 1 < time - i; j++)
        {
            if (arr[j].frequency > arr[j + 1].frequency)
            {
                tmp = arr[j];
                arr[j] = arr[j + 1];
                arr[j + 1] = tmp;
            }
        }
    }
    return LUX_SUCCESS;
}
/**
 * @description: Build a Huffman tree inside an index-linked node array. Each
 *               round joins the two parentless nodes with the smallest
 *               frequency under a new node stored at arr[num + round].
 * @param [in,out] arr node array with room for 2 * num - 1 entries; entries
 *                     0..num-1 are the leaves with their frequency set, and
 *                     every entry must have parent == -1 on entry
 * @param [in] num     number of leaves
 * @return LUX_SUCCESS on success; LUX_FAILED when arr is NULL, num < 1, or a
 *         round finds fewer than two parentless nodes
 */
int LUX_HUFFUMAN_BuildHufTree(LUX_HUFFUMAN_HufTree_ST* arr, int num)
{
    int i = 0;
    int j = 0;
    int min1 = -1; /* index of the smallest parentless node, -1 = none yet */
    int min2 = -1; /* index of the second smallest parentless node */

    if (NULL == arr || num < 1)
    {
        return LUX_FAILED;
    }
    /*loop build huf tree*/
    for (i = 0; i < num - 1; i++)
    {
        /* Candidates are tracked by index instead of comparing against a
         * fixed "large" frequency, so arbitrarily large counts work. */
        min1 = min2 = -1;
        for (j = 0; j < num + i; j++)
        {
            if (arr[j].parent != -1)
            {
                continue;
            }
            if (min1 < 0 || arr[j].frequency < arr[min1].frequency)
            {
                min2 = min1;
                min1 = j;
            }
            else if (min2 < 0 || arr[j].frequency < arr[min2].frequency)
            {
                min2 = j;
            }
        }
        if (min1 < 0 || min2 < 0)
        {
            /* the array was not initialised as documented */
            return LUX_FAILED;
        }
        /* link the two children to their new parent */
        arr[min1].parent = num + i;
        arr[min2].parent = num + i;
        arr[num + i].frequency = arr[min1].frequency + arr[min2].frequency;
        arr[num + i].lchild = min1;
        arr[num + i].rchild = min2;
        //TODO:TEST
        printf ("min1.frequency and min2.frequency in round %d: %d, %d\n\n",
                i+1, arr[min1].frequency, arr[min2].frequency);
    }
    return LUX_SUCCESS;
}
/**
* @description: 统计字符及出现的次数
* @param [in] void
* @return
*/
int LUX_HUFFUMAN_GetInfo(void)
{
int ret = LUX_SUCCESS;
int hufTreeCnt = 0;
char buf;
int i, j, k = 0;
int num, cnt= 0;
int fileSize = (int)getSize_ST.st_size;
/*get info*/
for (i = 0; i < fileSize; i++)
{
buf = *(dataPoint.fileData + i);
charFreq[buf].character = buf;
charFreq[buf].frequency++;
}
/*get huf tree node num*/
for (i = 0; i < NUM_OF_ASCII; i++)
{
if (charFreq[i].frequency > 0)
{
++hufTreeCnt;//41
/*test character frquency*/
//printf("node[%d] character[%c] frequency[%d]\n", hufTreeCnt - 1, charFreq[i].character, charFreq[i].frequency);
}
}
LUX_HUFFUMAN_Sort(charFreq, NUM_OF_ASCII);
/*sort*/
LUX_HUFFUMAN_CharFreq_ST sortChar[hufTreeCnt];
memset(sortChar, 0, hufTreeCnt * sizeof(LUX_HUFFUMAN_CharFreq_ST));
LUX_HUFFUMAN_CharFreq_ST tmp;
memset(&tmp, 0, sizeof(LUX_HUFFUMAN_CharFreq_ST));
/*restore*/
for (i = 0; i < NUM_OF_ASCII + 1; i++)
{
if (charFreq[i].frequency > 0)
{
sortChar[num].frequency = charFreq[i].frequency;
sortChar[num].character = charFreq[i].character;
num++;//41
}
}
/*get data after restore*/
//这里申请完这个变量后会出现段错误,目前不知道怎么解决
printf("11111111\n");
LUX_HUFFUMAN_HufTree_ST Hu;
printf("22222222\n");
#if 0
for (i = 0; i < 2 * hufTreeCnt - 1; i++)
{
HufTree[i].frequency = 0;
HufTree[i].character = ' ';
HufTree[i].parent = -1;
HufTree[i].lchild = -1;
HufTree[i].rchild = -1;
}
for (i = 0; i < hufTreeCnt; i++)
{
HufTree[i].frequency = sortChar[i].frequency;
HufTree[i].character = sortChar[i].character;
HufTree[i].parent = -1;
HufTree[i].lchild = -1;
HufTree[i].rchild = -1;
}
#endif
/*build huffuman tree*/
//LUX_HUFFUMAN_BuildHufTree(HufTree, hufTreeCnt);
#if 0
//TODO:TEST
for (i = 0; i < hufTreeCnt; i++)
{
printf("node[%d] character[%c] frequency[%d]\n", i, sortChar[i].character, sortChar[i].frequency);
}
#endif
/*get left and right node*/
/*conbine the smallest tow into new node*/
/*give back the new node*/
/*resort*/
return ret;
}
/**
 * @description: De-initialisation. Closes the input stream and frees the data
 *               buffers, then resets the handles so that a second call (or a
 *               call after a failed initialisation) is harmless.
 * @param [in] void
 * @return LUX_SUCCESS always
 */
int LUX_HUFFUMAN_DeInit(void)
{
    /* fclose(NULL) is undefined behaviour, unlike free(NULL) */
    if (NULL != filePoint.inputFile)
    {
        fclose(filePoint.inputFile);
        filePoint.inputFile = NULL;
    }
    free(dataPoint.fileData);
    dataPoint.fileData = NULL;
    free(dataPoint.charData);
    dataPoint.charData = NULL;
    return LUX_SUCCESS;
}
/* Entry point: load the input file, gather the character statistics, clean up. */
int main(void)
{
    int ret = LUX_HUFFUMAN_Init();

    if (LUX_SUCCESS != ret)
    {
        printf("LUX_HUFFUMAN_Init error\n");
        return LUX_FAILED;
    }
    ret = LUX_HUFFUMAN_GetInfo();
    if (LUX_SUCCESS != ret)
    {
        printf("LUX_HUFFUMAN_GetInfo error\n");
        /* Init succeeded, so its stream and buffer still have to be released */
        (void)LUX_HUFFUMAN_DeInit();
        return LUX_FAILED;
    }
    ret = LUX_HUFFUMAN_DeInit();
    if (LUX_SUCCESS != ret)
    {
        printf("LUX_HUFFUMAN_DeInit error\n");
        return LUX_FAILED;
    }
    return ret;
}
.h
#include <stdio.h>
#include <error.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
/* File names used by the tool. Only LUX_HUFFUMAN_InputFile is opened by the
 * accompanying .c file; the other two are reserved for the encoded output and
 * the code table. */
#define LUX_HUFFUMAN_InputFile "inputFile"
#define LUX_HUFFUMAN_EncodeFile "encodeFile"
#define LUX_HUFFUMAN_EncodeTable "huffumanTable"
/* Status codes returned by the LUX_HUFFUMAN_* functions. The negative value
 * is parenthesised so the macro is safe inside any expression. */
#define LUX_SUCCESS 0
#define LUX_FAILED (-1)
/* Number of entries in the character counter table (character codes 0..125). */
#define NUM_OF_ASCII 126
/* Data buffers shared by the LUX_HUFFUMAN_* functions. */
typedef struct
{
char *fileData; // raw bytes of the input file (allocated and filled by LUX_HUFFUMAN_Init)
char *charData; // information about all characters; only freed in LUX_HUFFUMAN_DeInit, never allocated by the code shown here
}LUX_HUFFUMAN_DataPoint_ST;
/* Stream handles used by the tool. */
typedef struct
{
FILE *inputFile; // input file
FILE *huffumanEncode; // file holding the Huffman-encoded output
FILE *huffumanTable; // file holding the Huffman code table
}LUX_HUFFUMAN_FilePoint_ST;
/* Node of an array-based Huffman tree. parent, lchild and rchild are indices
 * into the node array; LUX_HUFFUMAN_BuildHufTree treats parent == -1 as
 * "no parent yet". */
typedef struct LUX_HUFFUMAN_HuffumanTree_ST
{
int frequency;
int parent;
int lchild;
int rchild;
char character;
}LUX_HUFFUMAN_HufTree_ST;
/* Singly linked list element carrying a character and its frequency. */
typedef struct LUX_HUFFUMAN_HufLink_ST
{
char character;
int frequency;
struct LUX_HUFFUMAN_HufLink_ST* next;
}LUX_HUFFUMAN_HufLink_ST;
/* One character together with its occurrence count. */
typedef struct
{
char character; // the character
int frequency; // number of occurrences
}LUX_HUFFUMAN_CharFreq_ST;
/**
* @description: Initialisation: opens the input file and loads it into memory
* @param [in] void
* @return LUX_SUCCESS or LUX_FAILED
*/
int LUX_HUFFUMAN_Init(void);
/**
* @description: Count the characters of the input and how often each occurs
* @param [in] void
* @return LUX_SUCCESS or LUX_FAILED
*/
int LUX_HUFFUMAN_GetInfo(void);
/**
* @description: Build the Huffman tree
* @param [in] arr node array; the implementation writes the joined nodes to arr[num] and following entries
* @param [in] num number of leaf nodes at the start of arr
* @return LUX_SUCCESS
*/
int LUX_HUFFUMAN_BuildHufTree(LUX_HUFFUMAN_HufTree_ST* arr, int num);
/**
* @description: Sort records by ascending frequency
* @param [in] arr address of the first array element
* @param [in] time count that bounds the sort loops; the caller in the .c file passes the table length (NUM_OF_ASCII)
* @return LUX_SUCCESS
*/
int LUX_HUFFUMAN_Sort(LUX_HUFFUMAN_CharFreq_ST *arr, int time);
/**
* @description: De-initialisation: closes the input file and frees the buffers
* @param [in] void
* @return LUX_SUCCESS
*/
int LUX_HUFFUMAN_DeInit(void);
能够成功运行的代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//#include <iostream>
#define MAXBIT 100      /* capacity of HCodeType.bit */
#define MAXVALUE 10000  /* initial "minimum" used while searching the two smallest weights */
#define MAXLEAF 30      /* maximum number of leaf nodes */
/* A Huffman tree with MAXLEAF leaves has 2 * MAXLEAF - 1 nodes. Parenthesised
 * so the macro also expands correctly inside larger expressions. */
#define MAXNODE (MAXLEAF * 2 - 1)
/* Huffman code of one leaf. main() stores the code bits of a tree with n
 * leaves in bit[start + 1] .. bit[n - 1]. */
typedef struct
{
int bit[MAXBIT];
int start;
} HCodeType; /* code structure */
/* Node of the array-based Huffman tree. parent, lchild and rchild are indices
 * into the node array, -1 meaning "none". */
typedef struct
{
int weight;
int parent;
int lchild;
int rchild;
char value;
} HNodeType; /* node structure */
/* Build a Huffman tree.
 *
 * Resets the first 2*n-1 entries of HuffNode, reads the character and the
 * weight of each of the n leaves from stdin, then joins the two parentless
 * nodes with the smallest weight n-1 times; the node created in round i is
 * stored at index n+i, so the root ends up at index 2*n-2.
 *
 * HuffNode: node array with at least 2*n-1 entries.
 * n:        number of leaves, 1..MAXLEAF. Any other value is rejected with a
 *           message and the array is left untouched.
 *
 * If reading a leaf fails, a message is printed and the function returns with
 * all nodes still unlinked (parent == -1). */
void HuffmanTree (HNodeType HuffNode[MAXNODE], int n)
{
    /* i, j: loop counters; x1, x2: indices of the two parentless nodes with
     * the smallest weight in the current round (-1 = not found yet). */
    int i, j, x1, x2;

    if (HuffNode == NULL || n < 1 || n > MAXLEAF)
    {
        printf ("HuffmanTree: n must be between 1 and %d\n", MAXLEAF);
        return;
    }
    /* reset every node that the tree can use */
    for (i=0; i<2*n-1; i++)
    {
        HuffNode[i].weight = 0;   /* weight */
        HuffNode[i].parent =-1;
        HuffNode[i].lchild =-1;
        HuffNode[i].rchild =-1;
        HuffNode[i].value=' ';    /* payload, replaced by the leaf character below */
    }
    /* read the character of each of the n leaves */
    for (i=0; i<n; i++)
    {
        printf ("Please input char of %d leaf node: ", i);
        if (scanf ("%c",&HuffNode[i].value) != 1)
        {
            printf ("HuffmanTree: failed to read char of leaf %d\n", i);
            return;
        }
        getchar(); /* drop the newline that follows the character */
    }
    /* read the weight of each of the n leaves */
    for (i=0; i<n; i++)
    {
        printf ("Please input weight of %d leaf node: ", i);
        if (scanf ("%d",&HuffNode[i].weight) != 1)
        {
            printf ("HuffmanTree: failed to read weight of leaf %d\n", i);
            return;
        }
        getchar();
    }
    /* build the tree round by round */
    for (i=0; i<n-1; i++)
    {
        /* Candidates are tracked by index rather than compared against a
         * fixed maximum, so weights of any size are handled. Round i always
         * has n-i >= 2 parentless nodes, so both indices get set. */
        x1=x2=-1;
        for (j=0; j<n+i; j++)
        {
            if (HuffNode[j].parent != -1)
            {
                continue;
            }
            if (x1 < 0 || HuffNode[j].weight < HuffNode[x1].weight)
            {
                x2=x1;
                x1=j;
            }
            else if (x2 < 0 || HuffNode[j].weight < HuffNode[x2].weight)
            {
                x2=j;
            }
        }
        /* link the two children x1, x2 to their new parent */
        HuffNode[x1].parent = n+i;
        HuffNode[x2].parent = n+i;
        HuffNode[n+i].weight = HuffNode[x1].weight + HuffNode[x2].weight;
        HuffNode[n+i].lchild = x1;
        HuffNode[n+i].rchild = x2;
        printf ("x1.weight and x2.weight in round %d: %d, %d\n", i+1, HuffNode[x1].weight, HuffNode[x2].weight); /* trace output */
        printf ("\n");
    }
} /* end HuffmanTree */
/* Decode a string of '0'/'1' characters with a Huffman tree and print the
 * decoded characters to stdout.
 *
 * string: NUL-terminated code; '0' selects the left child, every other
 *         character the right child.
 * Buf:    node array produced by HuffmanTree(); the root is at index 2*Num-2.
 * Num:    number of leaves of the tree.
 *
 * Nothing is printed when an argument is invalid or the root has no children
 * (a tree with a single leaf has no code bits to consume). Bits left over at
 * the end of the input that do not form a complete code word are ignored. */
void decodeing(char string[],HNodeType Buf[],int Num)
{
    size_t len;
    size_t pos = 0;
    int root;
    int node;

    if (string == NULL || Buf == NULL || Num < 1)
    {
        return;
    }
    root = 2*Num-2;
    if (Buf[root].lchild == -1 || Buf[root].rchild == -1)
    {
        /* walking would never consume input and therefore never terminate */
        return;
    }
    len = strlen(string);
    while (pos < len)
    {
        node = root;
        /* descend until a leaf is reached */
        while ((Buf[node].lchild != -1) && (Buf[node].rchild != -1))
        {
            if (pos >= len)
            {
                return; /* input ended in the middle of a code word */
            }
            if (string[pos] == '0')
            {
                node = Buf[node].lchild;
            }
            else
            {
                node = Buf[node].rchild;
            }
            pos++;
        }
        printf("%c",Buf[node].value);
    }
}
/* Interactive driver: reads the number of leaves, builds the Huffman tree,
 * prints the code of every leaf and finally decodes one code string entered
 * by the user. Returns 0 on success and 1 when the leaf count is invalid. */
int main(void)
{
    HNodeType HuffNode[MAXNODE];      /* node array of the tree */
    HCodeType HuffCode[MAXLEAF], cd;  /* one code per leaf, plus a scratch code used while walking up the tree */
    int i, j, c, p, n;
    char pp[100];

    printf ("Please input n:\n");
    /* HuffNode and HuffCode only have room for MAXLEAF leaves */
    if (scanf ("%d", &n) != 1 || n < 1 || n > MAXLEAF)
    {
        printf ("n must be an integer between 1 and %d\n", MAXLEAF);
        return 1;
    }
    /* Drop the rest of the line: otherwise the newline typed after n is read
     * as the character of the first leaf by HuffmanTree(). */
    while ((c = getchar()) != '\n' && c != EOF)
    {
    }
    HuffmanTree (HuffNode, n);
    for (i=0; i < n; i++)
    {
        cd.start = n-1;
        c = i;
        p = HuffNode[c].parent;
        while (p != -1)   /* a parent exists */
        {
            if (HuffNode[p].lchild == c)
                cd.bit[cd.start] = 0;
            else
                cd.bit[cd.start] = 1;
            cd.start--;   /* next bit goes one position lower */
            c=p;
            p=HuffNode[c].parent;
        }
        /* store the code of this leaf and the position where it starts */
        for (j=cd.start+1; j<n; j++)
        { HuffCode[i].bit[j] = cd.bit[j];}
        HuffCode[i].start = cd.start;
    }
    /* print all stored codes */
    for (i=0; i<n; i++)
    {
        printf ("%d 's Huffman code is: ", i);
        for (j=HuffCode[i].start+1; j < n; j++)
        {
            printf ("%d", HuffCode[i].bit[j]);
        }
        printf(" start:%d",HuffCode[i].start);
        printf ("\n");
    }
    printf("Decoding?Please Enter code:\n");
    /* the width limit keeps the input inside pp (99 characters + NUL) */
    if (scanf("%99s",pp) == 1)
    {
        decodeing(pp,HuffNode,n);
    }
    getchar();
    return 0;
}