test3_Huffman codes

最新推荐文章于 2021-10-14 18:35:24 发布

fenmate

最新推荐文章于 2021-10-14 18:35:24 发布

阅读量266

点赞数

分类专栏：编程文章标签：编码

本文链接：https://blog.csdn.net/fenmate/article/details/73614253

版权

编程专栏收录该内容

5 篇文章 0 订阅

订阅专栏

Huffman编码的方法

（1）统计符号发生的概率。

（2）按照出现概率从小到大排序。

（3）每一次选出概率最小的两个符号作为二叉树的叶节点，将和作为它们的根节点，其频率为两个子节点频率之和，这两个叶子节点不再参与比较，再用新的根节点参与比较。
（4）重复（3）步骤，直到得到概率为1的根节点。

（5）二叉树的左节点为0，右节点为1，从上到下由根节点到叶节点得到每个叶节点的编码。

Huffman节点及Huffman码字节点的数据结构

[cpp]view plaincopy 
   
 <strong> typedef struct huffman_node_tag   
 {  
     unsigned char isLeaf; // 是否为叶节点，1是0否  
     unsigned long count; //信源中出现频数  
     struct huffman_node_tag *parent; //父节点指针  
     union{  
         struct{ //如果不是叶节点，这里为左右子节点指针  
             struct huffman_node_tag *zero, *one;  
         };  
         unsigned char symbol; //如果是叶节点，这里为一个信源符号  
     };  
 } huffman_node;  
   
 typedef struct huffman_code_tag //码字数据类型  
 {  
     unsigned long numbits; //码字长度  
     /* 码字的第1到第8比特由低到高保存在bits[0]中，第9比特到第16比特保存在bits[1]中/  
     unsigned char *bits;  
 } huffman_code;</strong>  

静态链接库

该程序文件包含两个两个工程（project），其中“Huff_run”为主工程（Win32 Console Application），其中包含程序的主函数，有“Huff_code”为库工程（Win32 Static Library）。

Huffman编码的流程

1.读入文件。

2.进行第一次扫描，统计文件中各个字符出现的频率。

3.建立huffman树。

4.将码表及其他必要信息写入输出文件。

5.第二次扫描，对源文件进行编码并输出。

Huff_code

Huffman.h

[csharp]view plaincopy 
   
 /* 
  *  huffman_coder - Encode/Decode files using Huffman encoding. 
  *  http://huffman.sourceforge.net 
  *  Copyright (C) 2003  Douglas Ryan Richardson; Gauss Interprise, Inc 
  * 
  *  This library is free software; you can redistribute it and/or 
  *  modify it under the terms of the GNU Lesser General Public 
  *  License as published by the Free Software Foundation; either 
  *  version 2.1 of the License, or (at your option) any later version. 
  * 
  *  This library is distributed in the hope that it will be useful, 
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of 
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU 
  *  Lesser General Public License for more details. 
  * 
  *  You should have received a copy of the GNU Lesser General Public 
  *  License along with this library; if not, write to the Free Software 
  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
  */  
  
 #ifndef HUFFMAN_HUFFMAN_H  
 #define HUFFMAN_HUFFMAN_H  
  
 #include <stdio.h>  
   
 int huffman_encode_file(FILE *in, FILE *out,FILE *out_Table );//step1:changed by yzhang for huffman statistics  
 int huffman_decode_file(FILE *in, FILE *out);  
 int huffman_encode_memory(const unsigned char *bufin,  
                           unsigned int bufinlen,  
                           unsigned char **pbufout,  
                           unsigned int *pbufoutlen);  
 int huffman_decode_memory(const unsigned char *bufin,  
                           unsigned int bufinlen,  
                           unsigned char **bufout,  
                           unsigned int *pbufoutlen);  
  
 #endif  

Huffman.c

1.从源文件中读取数据（本实验以ASCII字符流），统计每个符号发生的概率，并建立相应的树叶节点。

[csharp]view plaincopy 
   
 #define MAX_SYMBOLS 256  
 typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];  

[csharp]view plaincopy 
   
 <span style="font-size: 14px;">static unsigned int  
 get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)//统计文件中各个字符出现频率  
 {  
     int c;  
     unsigned int total_count = 0;//扫描的总信源符号数，初始化为0  
   
 </span><span style="font-size:12px;">    /* 将所有信源符号地址初始化为NULL，使得所有字符频率为0 */  
     init_frequencies(pSF);  
   
     /* 计算输入文件中每个符号的频率。 */  
     while((c = fgetc(in)) != EOF)//挨个读取字符</span><span style="font-size: 14px;">  
   </span><span style="font-size:12px;">  {  
         unsigned char uc = c;//将读取的字符赋给uc  
         if(!(*pSF)[uc])//如果uc不存在对应的空间，即uc是一个新的符号  
             (*pSF)[uc] = new_leaf_node(uc);//产生该字符的一个新的叶节点。</span><span style="font-size: 14px;">  
 </span><span style="font-size:12px;">        ++(*pSF)[uc]->count;//如果uc不是一个新的字符，则当前字符出现的频数累加1  
         ++total_count;//总计数值加1  
     }  
   
     return total_count;//返回值为总计数值  
 }</span>  

new_leaf_node(）

[csharp]view plaincopy 
   
 static huffman_node*  
 new_leaf_node(unsigned char symbol)/*新建一个叶节点*/  
 {  
     huffman_node *p = (huffman_node*)malloc(sizeof(huffman_node));  
     p->isLeaf = 1;//1表示是叶节点  
     p->symbol = symbol;//将新的符号的值存入symbol中  
     p->count = 0;//该节点的频数为初始化0  
     p->parent = 0;//该节点父节点初始化为0  
     return p;  
 }  

2. 构建霍夫曼树及生成霍夫曼码。

[csharp]view plaincopy 
   
 static SymbolEncoder*  
 calculate_huffman_codes(SymbolFrequencies * pSF)  
 {  
     unsigned int i = 0;  
     unsigned int n = 0;  
     huffman_node *m1 = NULL, *m2 = NULL;  
     SymbolEncoder *pSE = NULL;  
  
 #if 0  
     printf("BEFORE SORT\n");  
     print_freqs(pSF);  
 #endif  
   
     /* 按升序对符号频率数组进行排序 */  
     qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);//数组的起始地址，数组的元素数，每个元素的大小，比较函数的指针  
     //将所有的节点按照字符概率小到大排序，可使用qsort函数对节点结构体进行排序。排序的依据是SFComp，即根据每个字符发生的概率进行排序。  
 #if 0     
     printf("AFTER SORT\n");  
     print_freqs(pSF);  
 #endif  
   
     /*得到文件出现的字符种类数   */  
     for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)  
         ;  
   
     /* 
      * Construct a Huffman tree. This code is based 
      * on the algorithm given in Managing Gigabytes 
      * by Ian Witten et al, 2nd edition, page 34. 
      * Note that this implementation uses a simple 
      * count instead of probability. 
      构建霍夫曼树 
      */  
     for(i = 0; i < n - 1; ++i)  
     {  
         /* 将m1和m2设置为最小概率的两个子集。 */  
             m1 = (*pSF)[0];  
         m2 = (*pSF)[1];  
   
         /* 将m1和m2替换为一个集合{m1，m2}，其概率是m1和m2之和的概率。*/  
   
         //合并m1、m2为非叶节点，count为二者count之和    
         //并将该非叶节点的左右孩子设为m1、m2    
         //将左右孩子的父节点指向该非叶节点    
         //将(*pSF)[0]指向该非叶节点  
         (*pSF)[0] = m1->parent = m2->parent =  
             new_nonleaf_node(m1->count + m2->count, m1, m2);//  
         (*pSF)[1] = NULL;//1节点置空      
         /* 由于最小的两个频率数，进行了合并，频率大小发生改变，所以重新排序 */  
         qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);  
     }  
   
     /* Build the SymbolEncoder array from the tree. */  
     pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));  
     //定义一个指针数组，数组中每个元素是指向码节点的指针  
     memset(pSE, 0, sizeof(SymbolEncoder));  
     build_symbol_encoder((*pSF)[0], pSE);  
     return pSE;  
 }  

其中qsort函数使用到的比较函数SFComp代码如下：

[csharp]view plaincopy 
   
 static int  
 SFComp(const void *p1, const void *p2)  
 {  
     const huffman_node *hn1 = *(const huffman_node**)p1;  
     const huffman_node *hn2 = *(const huffman_node**)p2;  
   
     /* 用于将所有NULL排到最后 */  
     if(hn1 == NULL && hn2 == NULL)  
         return 0;//若两者都为空，则返回相等  
     if(hn1 == NULL)  
         return 1;//若返回值为1，大于0，则hn1排到hn2后  
     if(hn2 == NULL)  
         return -1;若返回值为-1，小于0，则hn2排到hn1后  
     /*由小到大排列*/  
     if(hn1->count > hn2->count)  
         return 1;  
     else if(hn1->count < hn2->count)  
         return -1;  
   
     return 0;  
 }  

遍历递归Huffman树，对存在的每个字符计算码字

[csharp]view plaincopy 
   
 static void  
 build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)  
 {  
     if(subtree == NULL)  
         return;//判断是否是空树, 是则说明编码结束，  
   
     if(subtree->isLeaf)//判断是否为树叶节点，是则产生新的码字  
         (*pSF)[subtree->symbol] = new_code(subtree);  
     else  
     {//  
         build_symbol_encoder(subtree->zero, pSF);//遍历左子树，调用build_symbol_encoder函数自身  
         build_symbol_encoder(subtree->one, pSF);//遍历右子数  
     }  
 }  

对每个树叶节点进行编码：

[csharp]view plaincopy 
   
 static huffman_code*  
 new_code(const huffman_node* leaf)  
 {  
     /* 通过走到根节点然后反转位来构建huffman代码， 
     因为霍夫曼代码是通过走下树来计算的。*/  
     //采用向上回溯的方法  
     unsigned long numbits = 0;//表示码长，以位为单位  
     unsigned char* bits = NULL;//表示指向码字的指针  
     huffman_code *p;  
   
     while(leaf && leaf->parent)//用来判断节点和父节点是否存在，leaf为NULL时，不进行编码；parent为NULL时，已经到达树根不在编码  
     {  
         huffman_node *parent = leaf->parent;  
         unsigned char cur_bit = (unsigned char)(numbits % 8);//current_bit为当前在bits[]的第几位  
   
         unsigned long cur_byte = numbits / 8;//current_byte  
   
         /* 如果码字长度超过一个字节，那么就在分配一个字节 */  
         if(cur_bit == 0)  
         {  
             size_t newSize = cur_byte + 1;  
             bits = (char*)realloc(bits, newSize);  
             /*realloc()函数先判断当前的指针是否有足够的连续空间，如果有，扩大bits指向的地址，并且将bits返回，如果空间不够，先按照newsize指定的大小分配空间，将原有数据从头到尾拷贝到新分配的内存区域，而后释放原来bits所指内存区域(注意:原来指针是自动释放，不需要使用free)，同时返回新分配的内存区域的首地址。即重新分配存储器块的地址。*/  
             bits[newSize - 1] = 0; /* Initialize the new byte. */  
         }  
   
 //如果是左孩子，则不用改变数值，因为初始化为0。如果是右孩子，则将该位置1  
         if(leaf == parent->one)  
             bits[cur_byte] |= 1 << cur_bit;//将1左移至cur_bit，再将其与bits[cur_byte]进行或的操作  
   
         ++numbits;//码字位数加1  
         leaf = parent;//下一位的码字在当前码字的父节点一级  
     }  
   
     if(bits)//将现有的码字进行反转  
         reverse_bits(bits, numbits);  
   
     p = (huffman_code*)malloc(sizeof(huffman_code));  
     p->numbits = numbits;//码长赋给节点的numbits  
     p->bits = bits;//码字付给节点的bits  
     return p;//返回值为码字  
 }  

码字逆序：

[csharp]view plaincopy 
   
 static void  
 reverse_bits(unsigned char* bits, unsigned long numbits)  
 {  
     unsigned long numbytes = numbytes_from_numbits(numbits);//将numbits除8后上取整得到numbytes  
     unsigned char *tmp =  
         (unsigned char*)alloca(numbytes);//alloca()是内存分配函数，在栈上申请空间，用完后马上就释放  
     unsigned long curbit;  
     long curbyte = 0;//记录即将要反转的二进制码所在的的数组下标  
   
     memset(tmp, 0, numbytes); //将数组tmp[numbytes]所有元素置为为0  
   
     for(curbit = 0; curbit < numbits; ++curbit)  
     {  
         unsigned int bitpos = curbit % 8;//表示curbit不是8的倍数时需要左移的位数  
   
         if(curbit > 0 && curbit % 8 == 0)//curbit为8的倍数时，进入下一个字节  
             ++curbyte;  
   
         tmp[curbyte] |= (get_bit(bits, numbits - curbit - 1) << bitpos);  
     }  
   
     memcpy(bits, tmp, numbytes);//将tmp临时数组内容拷贝到bits数组中  
 }  

将码表写入文件

[csharp]view plaincopy 
   
 static int  
 write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)  
 {  
     unsigned long i, count = 0;  
   
     /* 计算se中的字符种类数. */  
     for(i = 0; i < MAX_SYMBOLS; ++i)  
     {  
         if((*se)[i])  
             ++count;  
     }  
   
     /* Write the number of entries in network byte order. */  
     i = htonl(count);    //在网络传输中，采用big-endian序，对于0x0A0B0C0D ，传输顺序就是0A 0B 0C 0D ，  
     //因此big-endian作为network byte order，little-endian作为host byte order。  
     //little-endian的优势在于unsigned char/short/int/long类型转换时，存储位置无需改变  
     if(fwrite(&i, sizeof(i), 1, out) != 1)  
         return 1;//将字符种类的个数写入文件  
   
     /* Write the number of bytes that will be encoded. */  
     symbol_count = htonl(symbol_count);  
     if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1)  
         return 1;//将字符数写入文件  
   
     /* Write the entries. */  
     for(i = 0; i < MAX_SYMBOLS; ++i)  
     {  
         huffman_code *p = (*se)[i];  
         if(p)  
         {  
             unsigned int numbytes;  
             /* 写入1字节的符号 */  
             fputc((unsigned char)i, out);  
             /* 写入一字节的码长 */  
             fputc(p->numbits, out);  
             /* 写入numbytes字节的码字*/  
             numbytes = numbytes_from_numbits(p->numbits);  
             if(fwrite(p->bits, 1, numbytes, out) != numbytes)  
                 return 1;  
         }  
     }  
   
     return 0;  
 }  

第二次扫描对文件进行Huffman编码

[csharp]view plaincopy 
   
 static int  
 do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)  
 {  
     unsigned char curbyte = 0;  
     unsigned char curbit = 0;  
     int c;  
   
     while((c = fgetc(in)) != EOF)//遍历文件的每一个字符  
     {  
         unsigned char uc = (unsigned char)c;  
         huffman_code *code = (*se)[uc];//查表  
         unsigned long i;  
         /*将码字写入文件*/  
         for(i = 0; i < code->numbits; ++i)  
         {  
             /* Add the current bit to curbyte. */  
             curbyte |= get_bit(code->bits, i) << curbit;  
   
             /* If this byte is filled up then write it 
              * out and reset the curbit and curbyte. */  
             if(++curbit == 8)  
             {  
                 fputc(curbyte, out);  
                 curbyte = 0;  
                 curbit = 0;  
             }  
         }  
     }  

输出统计结果

[csharp]view plaincopy 
   
 int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st,int total_count)  
 {  
     int i,count =0;  
     for(i = 0; i < MAX_SYMBOLS; ++i)  
     {     
         if((*SF)[i])  
         {  
             st->freq[i]=(float)(*SF)[i]->count/total_count;  
             count+=(*SF)[i]->count;  
         }  
         else   
         {  
             st->freq[i]= 0;  
         }  
     }  
     if(count==total_count)  
         return 1;  
     else  
         return 0;  
 }  
   
 int huffST_getcodeword(SymbolEncoder *se, huffman_stat *st)  
 {  
     unsigned long i,j;  
   
     for(i = 0; i < MAX_SYMBOLS; ++i)  
     {  
         huffman_code *p = (*se)[i];  
         if(p)  
         {  
             unsigned int numbytes;  
             st->numbits[i] = p->numbits;  
             numbytes = numbytes_from_numbits(p->numbits);  
             for (j=0;j<numbytes;j++)  
                 st->bits[i][j] = p->bits[j];  
         }  
         else  
             st->numbits[i] =0;  
     }  
   
     return 0;  
 }  
   
 void output_huffman_statistics(huffman_stat *st,FILE *out_Table)  
 {  
     int i,j;  
     unsigned char c;  
     fprintf(out_Table,"symbol\t   freq\t   codelength\t   code\n");  
     for(i = 0; i < MAX_SYMBOLS; ++i)  
     {     
         fprintf(out_Table,"%d\t   ",i);  
         fprintf(out_Table,"%f\t   ",st->freq[i]);  
         fprintf(out_Table,"%d\t    ",st->numbits[i]);  
         if(st->numbits[i])  
         {  
             for(j = 0; j < st->numbits[i]; ++j)  
             {  
                 c =get_bit(st->bits[i], j);  
                 fprintf(out_Table,"%d",c);  
             }  
         }  
         fprintf(out_Table,"\n");  
     }  
 }  

各样本文件的概率分布图

实验结果

根据香农第一定理（无失真信源编码定理），对于二进制码信源符号，平均码长的下界为信源熵。当信源符号接近等概分布时，信源熵最大，而平均码长也没有可降低的空间了。故当文件的概率分布越不均匀，通过霍夫曼编码得到的编码效率越高。

fenmate

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
test3_Huffman codes

Huffman编码的方法（1）统计符号发生的概率。（2）按照出现概率从小到大排序。（3）每一次选出概率最小的两个符号作为二叉树的叶节点，将和作为它们的根节点，其频率为两个子节点频率之和，这两个叶子节点不再参与比较，再用新的根节点参与比较。（4）重复（3）步骤，直到得到概率为1的根节点。（5）二叉树的左节点为
复制链接

扫一扫