哈夫曼树
定义
- 路径 一个节点到另一个节点的通路,称为路径(祖先节点到子孙节点)
- 路径长度:路径上每经过一个节点,路径长度就增加1(起始节点不计入)
- 节点权值:对节点赋予的一个数值,表示该节点的权值,比如:节点元素出现的次数
- 带权路径长度:从根节点出发到该节点的路径长度 乘以 该节点的权值
- 树的带权路径长度(WPL):树中所有叶子节点的带权路径之和
- 哈夫曼树:最优二叉树,即在以给定的n个带权节点为叶子节点的所有二叉树中,带权路径长度(WPL)最小的二叉树
- 节点相同,哈夫曼树可能不唯一,但树的带权路径长度相等
- 把n个节点构成哈夫曼树,这n个节点必然作为叶子节点,需要添加n-1个分支节点
构建哈夫曼树
- 遵循的原则:权重越大离根节点越近
- 算法描述过程
- 把n个叶子看作n棵独立的树,构成森林F
- 创建一个新的节点,然后从森林F中选取两棵根节点权值最小的树作为新节点的左右子树,并且把新节点的权值设置为这两棵树根节点权值之和
- 从森林F中把刚才选取的两棵树删除,并且把新节点作为新树的根节点加入森林F
- 重复2和3的步骤,直到森林中只剩下一棵树为止
哈夫曼树特性
- 每个初始结点最终都成为叶结点,且权值越小的结点到根结点的路径长度越大
- 哈夫曼树的结点总数为2n − 1
- 哈夫曼树中不存在度为1的结点
- 哈夫曼树并不唯一,但WPL必然相同且为最优
哈夫曼编码
- 由哈夫曼树获取哈夫曼编码
- 从根节点出发,左子树为0,右子树为1,到所有叶子节点所经过的路径就构成了哈夫曼编码
文件的压缩和解压
- 利用哈夫曼树编码对文件进行压缩和解压
- 算法过程
- 读取文件中的内容,统计每一个字符出现的次数
- 根据得到的字符-对应的次数构建哈夫曼树,得到哈夫曼编码
- 对文件中的字符用哈夫曼编码进行压缩
- 为了能够解压,需要把哈夫曼树存储到文件中
哈夫曼树实现(文件压缩和解压)
结构体定义
/* Status codes returned by the compress / decompress entry points. */
#define SUCCESS 0
#define FAILURE -1

/* One symbol of the input alphabet: a raw byte. */
typedef unsigned char ElemType;

/* Size in bytes of a node's code buffer (the terminating NUL included). */
#define CODE_LEN 256

/* Capacity of the per-symbol tables in struct HfmCompress. */
#define MAX_NODES 256

/* A node of the Huffman tree. */
struct HfmNode {
    /* Symbol carried by the node. */
    ElemType data;

    /* Weight of the node (occurrence count for a leaf, sum of the children
     * for a node created while merging). */
    size_t weight;

    /* Code of the node as a NUL-terminated string of '0' / '1' characters. */
    char code[CODE_LEN];

    /* Left child (appends "0" to the code) or NULL. */
    struct HfmNode *lchild;

    /* Right child (appends "1" to the code) or NULL. */
    struct HfmNode *rchild;
};

/* Working state collected while scanning a file for compression. */
struct HfmCompress {
    /* Lookup table addressed by byte value; NULL for bytes not seen. */
    struct HfmNode *index[MAX_NODES];

    /* The nodes in the order their symbols were first seen. */
    struct HfmNode *nodes[MAX_NODES];

    /* Number of entries used in `nodes`. */
    size_t len;
};

/* A tree is referred to through a pointer to its root node. */
typedef struct HfmNode *HfmTree;

/* Number of bytes read from a file per fread() call. */
#define BUFF_LEN 1024
基本功能函数
/* Allocate one leaf per (elems[i], weights[i]) pair, i in [0, n), and build
 * a Huffman tree from those leaves. Returns the root of the tree. */
HfmTree create_hfmtree(ElemType elems[], size_t weights[], size_t n);
/* Build a Huffman tree from n existing nodes. The array `nodes` is sorted and
 * overwritten while the tree is assembled. Returns the root of the tree. */
HfmTree create_hfmtree_by_hfmnodes(struct HfmNode *nodes[], size_t n);
/* Fill in the `code` string of every descendant of `tree`: a child's code is
 * its parent's code followed by "0" (left child) or "1" (right child). */
void hfmcode_hfmtree(HfmTree tree);
基本功能实现
创建哈夫曼树 create_hfmtree
/*
 * Build a Huffman tree from parallel arrays of symbols and weights.
 *
 * elems   - the n symbols.
 * weights - weights[i] is the weight of elems[i].
 * n       - number of symbols.
 *
 * Returns the root of the new tree, or NULL when n is 0, an argument is NULL,
 * or memory cannot be allocated. The caller owns the returned tree.
 */
HfmTree create_hfmtree(ElemType elems[], size_t weights[], size_t n)
{
    if (elems == NULL || weights == NULL || n == 0) {
        return NULL;
    }

    /* Heap storage instead of a variable-length array: n may come from an
     * untrusted file header, and calloc() rejects an overflowing size. */
    struct HfmNode **nodes = calloc(n, sizeof *nodes);
    if (nodes == NULL) {
        return NULL;
    }

    for (size_t i = 0; i < n; ++i) {
        struct HfmNode *leaf = malloc(sizeof *leaf);
        if (leaf == NULL) {
            /* Undo the leaves allocated so far. */
            while (i > 0) {
                free(nodes[--i]);
            }
            free(nodes);
            return NULL;
        }
        leaf->data = elems[i];
        leaf->weight = weights[i];
        leaf->lchild = NULL;
        leaf->rchild = NULL;
        memset(leaf->code, 0, CODE_LEN);
        nodes[i] = leaf;
    }

    HfmTree tree = create_hfmtree_by_hfmnodes(nodes, n);

    /* Only the pointer array is scratch space; the nodes now belong to the tree. */
    free(nodes);
    return tree;
}
用结点创建哈夫曼树 create_hfmtree_by_hfmnodes
/*
 * qsort() comparator for an array of `struct HfmNode *`.
 * Orders the nodes by descending weight: the heavier node sorts first.
 */
int compareWeight(const void *v1, const void *v2)
{
    size_t first = (*(struct HfmNode *const *)v1)->weight;
    size_t second = (*(struct HfmNode *const *)v2)->weight;

    /* -1 when first is heavier, 1 when second is heavier, 0 when equal. */
    return (first < second) - (second < first);
}
/*
 * Build a Huffman tree from n existing nodes.
 *
 * nodes - array of n node pointers; it is sorted by descending weight and then
 *         reused as the working forest, so its contents are overwritten.
 * n     - number of entries in `nodes`.
 *
 * Returns the root of the tree, or NULL when `nodes` is NULL, n is 0, or an
 * internal node cannot be allocated. After an allocation failure the roots of
 * the partially merged forest are still reachable through the leading entries
 * of `nodes`.
 */
HfmTree create_hfmtree_by_hfmnodes(struct HfmNode *nodes[], size_t n)
{
    if (nodes == NULL || n == 0) {
        return NULL;
    }

    qsort(nodes, n, sizeof nodes[0], compareWeight);

    /* nodes[0..last] is the forest, kept in descending weight order, so the
     * two lightest trees are always the last two entries. */
    for (size_t last = n - 1; last > 0; --last) {
        struct HfmNode *parent = malloc(sizeof *parent);
        if (parent == NULL) {
            return NULL;
        }
        parent->data = 0; /* internal nodes carry no symbol */
        parent->lchild = nodes[last - 1];
        parent->rchild = nodes[last];
        parent->weight = nodes[last - 1]->weight + nodes[last]->weight;
        memset(parent->code, 0, CODE_LEN);

        /* Insert the merged tree back into the forest, shifting every
         * strictly lighter tree one slot to the right. */
        size_t slot = last - 1;
        while (slot > 0 && nodes[slot - 1]->weight < parent->weight) {
            nodes[slot] = nodes[slot - 1];
            --slot;
        }
        nodes[slot] = parent;
    }

    return nodes[0];
}
通过哈夫曼树生成哈夫曼编码 hfmcode_hfmtree
/*
 * Assign a code string to every node below `tree`.
 *
 * Each child receives its parent's code with one more character appended:
 * "0" for the left child and "1" for the right child. The code of `tree`
 * itself is used as the prefix and is not modified. A NULL tree is ignored.
 */
void hfmcode_hfmtree(HfmTree tree)
{
    if (tree == NULL) {
        return;
    }

    struct HfmNode *children[2] = { tree->lchild, tree->rchild };
    const char *const branch[2] = { "0", "1" };

    /* Left subtree first, then right. */
    for (int side = 0; side < 2; ++side) {
        struct HfmNode *child = children[side];
        if (child == NULL) {
            continue;
        }
        strcpy(child->code, tree->code);
        strcat(child->code, branch[side]);
        hfmcode_hfmtree(child);
    }
}
文件压缩和解压功能函数
/* Count the bytes of `file`, write the header (original size, number of
 * distinct symbols, then each symbol with its weight) to `dfile`, and build
 * the Huffman tree with its codes. The leaves are recorded in *phfm. */
HfmTree build_hfmtree_by_file(const char *file, const char *dfile, struct HfmCompress *phfm);
/* Pack up to 7 leading '0'/'1' characters of `codes` into the low 7 bits of a
 * byte, first character in the highest of those bits, zero-padded on the right. */
unsigned char code_to_byte(const char *codes, int len);
/* Append the Huffman-coded contents of srcFile to destFile, looking up each
 * byte's code through `index`. Returns SUCCESS or FAILURE. */
static int compress_by_hfmcode(const char *srcFile, const char *destFile,
HfmTree tree, struct HfmNode *index[]);
/* Read the header from `fp`, store the original size in *fs and rebuild the
 * Huffman tree from the stored symbols and weights. */
HfmTree get_hfmtree_by_compress_file(FILE *fp, size_t *fs);
/* Decode one symbol from the front of the '0'/'1' string `codes` into *byte.
 * Returns the number of characters consumed, or 0 if no leaf was reached. */
int code_to_char(const char *codes, unsigned char *byte, HfmTree tree);
/* Decode the payload read from `fr` and write at most `fs` bytes to `fw`.
 * Both streams are closed before returning. */
int dencompress_file(FILE *fr, FILE *fw, HfmTree tree, size_t fs);
/* Compress srcFile into destFile. Returns SUCCESS or FAILURE. */
int compress(const char *srcFile, const char *destFile);
/* Decompress srcFile into destFile. */
int dencompress(const char *srcFile, const char *destFile);
压缩解压实现
通过文件建立哈夫曼树 build_hfmtree_by_file
/*
 * Print one line describing a tree node: its symbol, weight and code.
 *
 * e    - symbol, printed as a character.
 * w    - weight of the node.
 * code - code string of the node; NULL is printed as an empty string.
 */
void EWC(unsigned char e, size_t w, char *code)
{
    /* size_t needs %zu; passing it to %d is undefined behaviour. */
    printf("%c %2zu %s\n", e, w, code != NULL ? code : "");
}
HfmTree build_hfmtree_by_file(const char *file, const char *dfile, struct HfmCompress *phfm)
{
FILE *fp = fopen(file, "rb");
if (fp == NULL) {
return NULL;
}
unsigned char buf[BUFF_LEN] = {};
size_t cnt = 0;
while ((cnt = fread(buf, 1, BUFF_LEN, fp)) > 0) {
for (int i = 0; i < cnt; i++) {
unsigned char b = buf[i];
if (phfm->index[b] == NULL) {
struct HfmNode *node = (struct HfmNode*)malloc(sizeof(struct HfmNode));
node->lchild = node->rchild = NULL;
node->data = b;
node->weight = 1;
phfm->index[b] = node;
phfm->nodes[phfm->len] = node;
++phfm->len;
} else {
++phfm->index[b]->weight;
}
}
}
size_t fs = ftell(fp);
fclose(fp);
fp = fopen(dfile, "wb");
fwrite(&fs, sizeof(fs), 1, fp);
fwrite(&phfm->len, sizeof(phfm->len), 1, fp);
for (int i = 0; i < phfm->len; ++i) {
fwrite(&phfm->nodes[i]->data, 1, 1, fp);
fwrite(&phfm->nodes[i]->weight, sizeof(size_t), 1, fp);
}
fclose(fp);
HfmTree tree = create_hfmtree_by_hfmnodes(phfm->nodes, phfm->len);
hfmcode_hfmtree(tree);
foreach_hfmtree(tree, EWC);
return tree;
}
把哈夫曼编码转换成二进制 code_to_byte
/*
 * Pack the leading characters of a '0'/'1' string into one byte.
 *
 * codes - string of code characters.
 * len   - number of characters available in `codes`; at most 7 are used.
 *
 * The result always holds 7 bit positions: the first character ends up in
 * bit 6, and positions beyond `len` are filled with 0.
 */
unsigned char code_to_byte(const char *codes, int len)
{
    unsigned char packed = 0;

    for (int pos = 0; pos < 7; ++pos) {
        packed = (unsigned char)(packed << 1);
        if (pos < len) {
            packed = (unsigned char)(packed | (codes[pos] - '0'));
        }
    }
    return packed;
}
压缩成哈夫曼编码 compress_by_hfmcode
/*
 * Append the Huffman-coded contents of srcFile to destFile.
 *
 * srcFile  - path of the file to encode.
 * destFile - path of the output file; opened in append mode so the payload
 *            follows whatever the file already contains.
 * tree     - unused; kept for interface compatibility.
 * index    - table addressed by byte value giving the node whose `code`
 *            string encodes that byte.
 *
 * The code bits are concatenated and stored 7 per output byte (first bit in
 * bit 6, bit 7 always 0); the last byte is padded with zero bits.
 *
 * Returns SUCCESS, or FAILURE when a file cannot be opened, read or written,
 * or when the source contains a byte that has no entry in `index`.
 */
static int compress_by_hfmcode(const char *srcFile, const char *destFile,
                               HfmTree tree, struct HfmNode *index[])
{
    enum { PACKED_BITS = 7 };

    (void)tree;
    if (srcFile == NULL || destFile == NULL || index == NULL) {
        return FAILURE;
    }

    FILE *fr = fopen(srcFile, "rb");
    if (fr == NULL) {
        return FAILURE;
    }
    FILE *fw = fopen(destFile, "ab");
    if (fw == NULL) {
        fclose(fr);
        return FAILURE;
    }

    unsigned char in[BUFF_LEN];
    unsigned char out[BUFF_LEN];
    size_t outLen = 0;
    size_t cnt = 0;
    unsigned char acc = 0; /* bits collected for the next output byte */
    int accBits = 0;       /* number of bits currently held in acc */
    int ret = SUCCESS;

    /* Bits are streamed straight into the output buffer, which is flushed
     * whenever it fills up, so long codes cannot overrun a fixed-size
     * per-chunk result array. */
    while (ret == SUCCESS && (cnt = fread(in, 1, BUFF_LEN, fr)) > 0) {
        for (size_t i = 0; i < cnt && ret == SUCCESS; ++i) {
            const struct HfmNode *leaf = index[in[i]];
            if (leaf == NULL) {
                ret = FAILURE;
                break;
            }
            for (const char *bit = leaf->code; *bit != '\0'; ++bit) {
                acc = (unsigned char)((acc << 1) | (*bit == '1'));
                if (++accBits < PACKED_BITS) {
                    continue;
                }
                out[outLen++] = acc;
                acc = 0;
                accBits = 0;
                if (outLen == sizeof out) {
                    if (fwrite(out, 1, outLen, fw) != outLen) {
                        ret = FAILURE;
                        break;
                    }
                    outLen = 0;
                }
            }
        }
    }
    if (ferror(fr)) {
        ret = FAILURE;
    }

    if (ret == SUCCESS && accBits > 0) {
        /* Left-align the trailing bits within the 7-bit field. */
        out[outLen++] = (unsigned char)(acc << (PACKED_BITS - accBits));
    }
    if (ret == SUCCESS && outLen > 0 && fwrite(out, 1, outLen, fw) != outLen) {
        ret = FAILURE;
    }

    fclose(fr);
    if (fclose(fw) != 0) {
        ret = FAILURE;
    }
    return ret;
}
获取压缩文件中的哈夫曼节点 并且 建立哈夫曼树 返回哈夫曼树根节点get_hfmtree_by_compress_file
HfmTree get_hfmtree_by_compress_file(FILE *fp, size_t *fs)
{
fread(fs, sizeof(*fs), 1, fp);
size_t n = 0;
fread(&n, sizeof(n), 1, fp);
unsigned char datas[n];
size_t weights[n];
size_t i;
for (int i = 0; i < n; ++i) {
fread(&datas[i], sizeof(datas[i]), 1, fp);
fread(&weights[i], sizeof(weights[i]), 1, fp);
}
return create_hfmtree(datas, weights, n);
}
把哈夫曼编码转换成原字符 code_to_char
/*
 * Decode one symbol from the front of a '0'/'1' string.
 *
 * codes - NUL-terminated string of code characters; '0' follows the left
 *         child, any other character follows the right child.
 * byte  - receives the decoded symbol when a leaf is reached.
 * tree  - root of the Huffman tree.
 *
 * Returns the number of characters consumed, or 0 when the string ends before
 * a leaf is reached, when the walk runs into a missing child (including a NULL
 * tree or a tree that is a single leaf), or when an argument is NULL.
 */
int code_to_char(const char *codes, unsigned char *byte, HfmTree tree)
{
    if (codes == NULL || byte == NULL) {
        return 0;
    }

    const struct HfmNode *node = tree;
    for (int used = 0; node != NULL && codes[used] != '\0'; ++used) {
        node = (codes[used] == '0') ? node->lchild : node->rchild;
        if (node != NULL && node->lchild == NULL && node->rchild == NULL) {
            *byte = node->data;
            return used + 1;
        }
    }
    return 0;
}
解压 哈夫曼编码->字符 dencompress_file
/*
 * Decode the Huffman payload of a compressed file.
 *
 * fr   - compressed file, positioned at the first payload byte.
 * fw   - output file.
 * tree - Huffman tree matching the payload.
 * fs   - size of the original file; exactly this many bytes are written, the
 *        symbols decoded from the padding bits are discarded.
 *
 * Each payload byte carries 7 code bits (bit 6 first). A tree that consists of
 * a single leaf has no code bits at all, so the output is `fs` copies of that
 * leaf's symbol.
 *
 * Both streams are closed before returning, whatever the outcome.
 * Returns SUCCESS when `fs` bytes were written, FAILURE otherwise.
 */
int dencompress_file(FILE *fr, FILE *fw, HfmTree tree, size_t fs)
{
    enum { PACKED_BITS = 7 };

    unsigned char buf[BUFF_LEN];
    char codes[BUFF_LEN * 8 + 1];          /* pending bits as '0'/'1' text */
    unsigned char res[BUFF_LEN * 8 + 1];   /* symbols decoded from one chunk */
    size_t codeLen = 0;
    size_t sum = 0;                        /* bytes written so far */
    size_t cnt = 0;
    int ret = SUCCESS;

    if (fr == NULL || fw == NULL) {
        ret = FAILURE;
    } else if (tree == NULL) {
        /* Without a tree only an empty original can be reproduced. */
        ret = (fs == 0) ? SUCCESS : FAILURE;
    } else if (tree->lchild == NULL && tree->rchild == NULL) {
        /* Single distinct symbol: nothing to decode, just repeat it. */
        memset(res, tree->data, sizeof res);
        while (sum < fs) {
            size_t want = fs - sum;
            if (want > sizeof res) {
                want = sizeof res;
            }
            if (fwrite(res, 1, want, fw) != want) {
                ret = FAILURE;
                break;
            }
            sum += want;
        }
    } else {
        while (sum < fs && (cnt = fread(buf, 1, BUFF_LEN, fr)) > 0) {
            if (codeLen + cnt * PACKED_BITS > sizeof codes - 1) {
                ret = FAILURE; /* pending bits would not fit the buffer */
                break;
            }
            for (size_t i = 0; i < cnt; ++i) {
                for (int bit = PACKED_BITS - 1; bit >= 0; --bit) {
                    codes[codeLen++] = (char)('0' + ((buf[i] >> bit) & 0x1));
                }
            }
            codes[codeLen] = '\0';

            /* Decode as many complete symbols as the pending bits allow. */
            size_t used = 0;
            size_t resLen = 0;
            int step = 0;
            while ((step = code_to_char(codes + used, &res[resLen], tree)) > 0) {
                used += (size_t)step;
                ++resLen;
            }

            if (resLen > fs - sum) {
                resLen = fs - sum; /* drop symbols produced by padding bits */
            }
            if (fwrite(res, 1, resLen, fw) != resLen) {
                ret = FAILURE;
                break;
            }
            sum += resLen;

            /* Keep the undecoded tail (plus its NUL) for the next chunk; the
             * regions may overlap, hence memmove. */
            codeLen -= used;
            memmove(codes, codes + used, codeLen + 1);
        }
        if (ferror(fr) || sum != fs) {
            ret = FAILURE;
        }
    }

    if (fr != NULL) {
        fclose(fr);
    }
    if (fw != NULL && fclose(fw) != 0) {
        ret = FAILURE;
    }
    return ret;
}
压缩 compress
/*
 * Compress srcFile into destFile using Huffman coding.
 *
 * Both paths must be non-NULL (asserted). Compressing a file onto itself is
 * refused. Returns FAILURE when the paths are equal or no tree could be built
 * from the source, otherwise the result of the encoding pass.
 */
int compress(const char *srcFile, const char *destFile)
{
    assert(srcFile != NULL && destFile != NULL);

    int samePath = (strcmp(srcFile, destFile) == 0);
    if (samePath) {
        return FAILURE;
    }

    /* First pass: count symbols, write the header, build tree and codes. */
    struct HfmCompress hfm = {0};
    HfmTree tree = build_hfmtree_by_file(srcFile, destFile, &hfm);

    int status = FAILURE;
    if (tree != NULL) {
        /* Second pass: append the encoded payload after the header. */
        status = compress_by_hfmcode(srcFile, destFile, tree, hfm.index);
        destroy_hfmtree(tree);
    }
    return status;
}
解压 dencompress
/*
 * Decompress srcFile into destFile.
 *
 * Both paths must be non-NULL (asserted). Decompressing a file onto itself is
 * refused, as in compress(): opening the destination for writing would
 * truncate the source before it is read.
 *
 * Returns FAILURE when the paths are equal, a file cannot be opened, or no
 * tree can be rebuilt from the header; otherwise the result of
 * dencompress_file().
 */
int dencompress(const char *srcFile, const char *destFile)
{
    assert(srcFile != NULL && destFile != NULL);
    if (strcmp(srcFile, destFile) == 0) {
        return FAILURE;
    }

    FILE *fr = fopen(srcFile, "rb");
    if (fr == NULL) {
        return FAILURE;
    }
    FILE *fw = fopen(destFile, "wb");
    if (fw == NULL) {
        fclose(fr);
        return FAILURE;
    }

    size_t fs = 0;
    HfmTree tree = get_hfmtree_by_compress_file(fr, &fs);
    if (tree == NULL) {
        /* Nothing can be decoded without a tree; the streams are still ours. */
        fclose(fr);
        fclose(fw);
        return FAILURE;
    }

    hfmcode_hfmtree(tree);
    /* dencompress_file() closes both streams. */
    int ret = dencompress_file(fr, fw, tree, fs);
    destroy_hfmtree(tree);
    return ret;
}