【C语言】huffman编码实现数据压缩

最新推荐文章于 2024-08-29 20:37:19 发布

明月清风旧

最新推荐文章于 2024-08-29 20:37:19 发布

阅读量545

点赞数 4

分类专栏：c语言算法　文章标签：c语言 算法 开发语言

本文链接：https://blog.csdn.net/Ranchaun/article/details/137109561

版权

c语言算法专栏收录该内容

7 篇文章 3 订阅

订阅专栏

原理

Huffman编码先统计数据中每个字符的出现次数，再根据出现次数为每个字符分配编码：出现次数越多的字符使用越短的编码，从而实现数据压缩的目的。

类型定义

定义Huffman树节点类型和Huffman结构体，由于一个字节最多可以表示256种数据，index_table和count_table长度最大设置为256即可满足所有数据的压缩。

// One node of the Huffman tree. Leaves carry a symbol and its count;
// internal nodes are recognised by having both left and right children.
typedef struct _huff_tree{
  uint8_t data;   // symbol byte value (meaningful for leaves)
  uint8_t pos;// branch taken from the parent: left child is 1, right child is 0
  uint32_t count; // number of occurrences of the symbol (leaves)
  struct _huff_tree *parant; // parent node; NULL at the root
  struct _huff_tree *left;   // left child
  struct _huff_tree *right;  // right child
}huff_tree;



// Working state shared by the encoder and the decoder.
typedef struct{
  huff_tree *tree;                // root of the Huffman tree
  uint32_t index_table_index;     // number of used entries in index_table
  huff_tree *index_table[256];    // leaf nodes, one per distinct byte value
  uint32_t count_table[256];      // occurrence count for each byte value
  uint8_t *out;                   // encoder: start of the payload area in the output buffer
  uint32_t out_len;               // encoder: payload bytes started so far
  const uint8_t *in;              // decoder: start of the payload in the input buffer
  uint32_t in_len;                // NOTE(review): not referenced by the functions visible in this file
  uint32_t in_bit_count;          // decoder: number of valid payload bits
  uint32_t arr_bit_index;         // encoder: bits written so far; decoder: bits not yet consumed
  /* The members below are used for debugging */
  uint32_t tree_point_num;// number of tree nodes currently allocated
}huffman_def;

完整代码

头文件定义如下：

#ifndef huffman_h__
#define huffman_h__

#include "stdint.h"

// Huffman encoding.
// Compresses in_len bytes from in; the result is returned through *out and
// its length in bytes through *out_len.
// *out must be released with free() by the caller after use.
int hm_encode(const uint8_t *in,const int in_len,uint8_t **out,int *out_len);

// Huffman decoding.
// Expands a buffer produced by hm_encode; the result is returned through
// *out and its length in bytes through *out_len.
// *out must be released with free() by the caller after use.
int hm_decode(const uint8_t* in, const int in_len, uint8_t** out, int* out_len);

#endif

源文件定义如下：


#include "stdlib.h"
#include "stdio.h"
#include "string.h"
#include "huffman_.h"
#include <stdlib.h>
// Huffman coding implementation

// Logging hooks used throughout this file; both currently expand to printf.
#define DBG_WARN printf
#define DBG_LOG printf

// One node of the Huffman tree. Leaves carry a symbol and its count;
// internal nodes are recognised by having both left and right children.
typedef struct _huff_tree{
  uint8_t data;   // symbol byte value (meaningful for leaves)
  uint8_t pos;// branch taken from the parent: left child is 1, right child is 0
  uint32_t count; // number of occurrences of the symbol (leaves)
  struct _huff_tree *parant; // parent node; NULL at the root
  struct _huff_tree *left;   // left child
  struct _huff_tree *right;  // right child
}huff_tree;



// Working state shared by the encoder and the decoder.
typedef struct{
  huff_tree *tree;                // root of the Huffman tree (set by hm_creat_tree)
  uint32_t index_table_index;     // number of used entries in index_table
  huff_tree *index_table[256];    // leaf nodes, one per distinct byte value
  uint32_t count_table[256];      // occurrence count for each byte value
  uint8_t *out;                   // encoder: start of the payload area in the output buffer
  uint32_t out_len;               // encoder: payload bytes started so far
  const uint8_t *in;              // decoder: start of the payload in the input buffer
  uint32_t in_len;                // NOTE(review): not referenced by the functions visible in this file
  uint32_t in_bit_count;          // decoder: number of valid payload bits
  uint32_t arr_bit_index;         // encoder: bits written so far; decoder: bits not yet consumed
  /* The members below are used for debugging */
  uint32_t tree_point_num;// number of tree nodes currently allocated
}huffman_def;


// Forward declarations: both helpers are called before their definitions
// further down in this file.
static int hm_calc_value_of_tree(huff_tree *t);
static int hm_calc_deep_of_child(huff_tree* t);



// Allocate one zero-initialised tree node.
// h: codec state whose live-node counter (tree_point_num) is updated.
// Returns the new node, or NULL when the allocation fails. The counter is
// incremented only on success so that it always matches the number of nodes
// that actually exist.
static huff_tree *hm_creat_tree_point(huffman_def *h)
{
  huff_tree *node = calloc(1, sizeof *node);
  if (node != NULL) {
    h->tree_point_num++;
  }
  return node;
}

// Release one tree node.
// The node is freed, and the live-node counter decremented, only while the
// counter is still above zero; otherwise the call does nothing.
static void hm_del_tree_point(huffman_def *h,huff_tree *t)
{
  if (h->tree_point_num == 0) {
    return;
  }
  h->tree_point_num -= 1;
  free(t);
}

// Sort the first num entries of table by descending subtree value
// (see hm_calc_value_of_tree).
// For every slot the remaining entries are scanned left to right, and each
// entry strictly larger than the slot's current occupant is swapped in.
static void hm_sort_index_table(huff_tree **table,int num)
{
  int slot = 0;
  while (slot < num) {
    huff_tree *held = table[slot];
    int probe = slot;
    while (probe < num) {
      huff_tree *cand = table[probe];
      if (hm_calc_value_of_tree(cand) > hm_calc_value_of_tree(held)) {
        table[slot] = cand;
        table[probe] = held;
        held = cand;
      }
      probe++;
    }
    slot++;
  }
}



// Debug helper: log every index_table entry (position, symbol, count).
static void hm_index_table_print(huffman_def *h){
  int i=0;
  DBG_LOG("-----index_table-----\n");
  while(i<h->index_table_index){
    huff_tree *entry=h->index_table[i];
    DBG_LOG("index:%d,data:%02x,count:%d\n",i,entry->data,entry->count);
    i++;
  }
}


// Debug helper: log the code of every symbol in index_table.
// The bits are printed in leaf-to-root order, one line per symbol.
static void hm_data_code_print(huffman_def *h){
  DBG_LOG("------data code------\n");
  for(int k=0;k<h->index_table_index;k++){
    huff_tree *node=h->index_table[k];
    DBG_LOG("%c:",node->data);
    for(;node->parant;node=node->parant){
      DBG_LOG("%d",node->pos);
    }
    DBG_LOG("\n");
  }
}



// Count how often each byte value occurs in d[0 .. d_len-1] and build the
// leaf table from the result.
// h:     codec state; count_table is rebuilt, and one leaf per distinct byte
//        value is appended to index_table (index_table_index is advanced).
// d:     input bytes.
// d_len: number of input bytes.
// On return index_table is sorted by descending count.
static void hm_calc_count(huffman_def *h,const uint8_t *d,const int d_len)
{
  // Clear the whole table: it holds 256 uint32_t counters, so the size must
  // be taken from the array itself rather than given as an element count.
  memset(h->count_table,0,sizeof h->count_table);
  for(int i=0;i<d_len;i++)
  {
    h->count_table[d[i]]++;
  }
  for(int value=0;value<256;value++)
  {
    if(h->count_table[value]==0){
      continue;
    }
    // NOTE(review): the result of hm_creat_tree_point is used without a
    // NULL check; this function has no way to report an allocation failure.
    huff_tree *leaf=hm_creat_tree_point(h);
    leaf->count=h->count_table[value];
    leaf->data=(uint8_t)value;
    h->index_table[h->index_table_index]=leaf;
    h->index_table_index++;
  }
  hm_sort_index_table(h->index_table,h->index_table_index);
}

// Compute the total length in bytes of the encoded output.
// index_table must be filled and the Huffman tree built before calling.
// The total is: 1 table-length byte + 1 symbol byte per entry + the count
// bytes of each entry (count/255+1) + 1 padding-count byte + the payload
// rounded up to whole bytes.
static int hm_calc_encode_len(huffman_def* h)
{
  int header_bytes = 1 + h->index_table_index;
  int payload_bits = 0;
  for (int k = 0; k < h->index_table_index; k++) {
    huff_tree *leaf = h->index_table[k];
    // bytes used to store this entry's count
    header_bytes += leaf->count/255+1;
    // bits this symbol contributes to the payload
    payload_bits += hm_calc_deep_of_child(leaf) * leaf->count;
  }
  // +1 for the padding-count byte
  int total = header_bytes + 1;
  total += (payload_bits + 7) / 8;
  DBG_LOG("data len for encode:%d\n", total);
  return total;
}


// Value of a subtree: a node without both children yields its own count,
// any other node yields the sum of the values of its two children.
static int hm_calc_value_of_tree(huff_tree *t)
{
  if(!(t->left&&t->right)){
    return t->count;
  }
  return hm_calc_value_of_tree(t->left)+hm_calc_value_of_tree(t->right);
}


// Depth of a node: the number of parent links between t and the root.
static int hm_calc_deep_of_child(huff_tree* t)
{
  int depth = 0;
  for (huff_tree *up = t->parant; up; up = up->parant) {
    depth++;
  }
  return depth;
}



// Debug helper: log the tree in pre-order (node, left subtree, right subtree).
// Internal nodes are logged with their subtree value, leaves with symbol and count.
static void hm_tree_print(huff_tree *t)
{
  if(!(t->left&&t->right)){
    DBG_LOG("data:%d,count:%d\n",t->data,t->count);
    return;
  }
  DBG_LOG("point:,count:%d\n",hm_calc_value_of_tree(t));
  hm_tree_print(t->left);
  hm_tree_print(t->right);
}




// Build the Huffman tree from the sorted leaves in h->index_table and store
// its root in h->tree.
// The two last (smallest) entries of a working copy of the table are merged
// under a new parent, the table is re-sorted, and this repeats until a single
// entry, the root, remains. With exactly one leaf that leaf is the root.
// An empty table, or a count larger than the table, leaves h->tree NULL.
// The working copy lives on the stack (at most as many pointers as
// index_table holds), so no scratch allocation can fail here.
static void hm_creat_tree(huffman_def *h)
{
  huff_tree *table[sizeof h->index_table / sizeof h->index_table[0]];
  const int capacity=(int)(sizeof table / sizeof table[0]);
  int tail=(int)h->index_table_index;

  if(tail<=0||tail>capacity){
    // Nothing to build: there is no table[0] to use as a root.
    h->tree=NULL;
    return;
  }
  memcpy(table,h->index_table,(size_t)tail*sizeof table[0]);
  while(tail>1){
    huff_tree *sub1=table[tail-1];
    huff_tree *sub2=table[tail-2];
    // NOTE(review): the new parent is used without a NULL check; this
    // function has no way to report an allocation failure.
    huff_tree *temp=hm_creat_tree_point(h);
    sub1->parant=temp;
    sub2->parant=temp;
    // Larger subtree goes left, smaller goes right; left is 1, right is 0.
    if(hm_calc_value_of_tree(sub1)>hm_calc_value_of_tree(sub2)){
      temp->left=sub1;
      sub1->pos=1;
      temp->right=sub2;
      sub2->pos=0;
    }else{
      temp->left=sub2;
      sub2->pos=1;
      temp->right=sub1;
      sub1->pos=0;
    }
    // The merged node replaces its two children in the working table.
    table[tail-2]=temp;
    tail--;
    hm_sort_index_table(table,tail);
  }
  h->tree=table[0];
}


// Free a whole (sub)tree in post-order: children first, then the node itself.
// h: codec state whose live-node counter is updated by hm_del_tree_point.
// t: root of the subtree to release; NULL is accepted and ignored, so a tree
//    that was never built can be passed safely.
static void hm_del_tree(huffman_def *h,huff_tree *t)
{
  if(t==NULL){
    return;
  }
  if(t->left&&t->right){
    hm_del_tree(h,t->left);
    hm_del_tree(h,t->right);
  }
  hm_del_tree_point(h,t);
}

// Append one bit to a byte buffer, least-significant bit first.
// d:     buffer being filled.
// d_len: number of bytes started so far; incremented when a new byte begins.
// bit:   bit value to append.
// index: number of bits written so far; always incremented.
// When the current byte is full, the bit starts a new byte (which is
// overwritten with the bit value); otherwise it is OR-ed into the last byte.
static void hm_add_bit(uint8_t *d,int *d_len,int bit,int *index)
{
  const int pos = *index;
  const int used = *d_len;
  if (pos >= used * 8) {
    d[used] = bit;
    *d_len = used + 1;
  } else {
    d[used - 1] |= bit << (pos % 8);
  }
  *index = pos + 1;
}


// Append the code of one input byte to the output payload.
// h: codec state; h->out, h->out_len and h->arr_bit_index are advanced.
// d: byte value to encode.
// The matching leaf is looked up in index_table and the branch bits are
// emitted while walking from the leaf up to the root.
// Returns 0. If no leaf matches d (including an empty table) a warning is
// logged and the process exits with status -1.
static int hm_encode_byte(huffman_def *h,uint8_t d)
{
  huff_tree *t=NULL;
  for(uint32_t i=0;i<h->index_table_index;i++)
  {
    if(h->index_table[i]->data==d){
      t=h->index_table[i];
      break;
    }
  }
  if(t==NULL){
    DBG_WARN("can not encode.\n");
    exit(-1);
  }
  // hm_add_bit works on int counters while the state stores uint32_t, so go
  // through local ints instead of passing pointers of the wrong type.
  int out_len=(int)h->out_len;
  int bit_index=(int)h->arr_bit_index;
  for(;t->parant;t=t->parant){
    hm_add_bit(h->out,&out_len,t->pos,&bit_index);
  }
  h->out_len=(uint32_t)out_len;
  h->arr_bit_index=(uint32_t)bit_index;
  return 0;
}

// Write the table header of the encoded stream into data, starting at
// *data_len, and advance *data_len past it.
// Layout: table length (1 byte), then for every entry the symbol byte
// followed by its count as count/255 bytes of 255 plus one byte count%255,
// then one byte for the padding-bit count.
// Always returns 0.
static int hm_creat_index_table(huffman_def *h,uint8_t *data,int *data_len)
{
  int pos = *data_len;

  data[pos++] = h->index_table_index;
  for (int k = 0; k < h->index_table_index; k++) {
    const huff_tree *leaf = h->index_table[k];
    int total = leaf->count;
    data[pos++] = leaf->data;
    for (int n = total / 255; n > 0; n--) {
      data[pos++] = 255;
    }
    data[pos++] = total % 255;
  }
  // padding-bit count, derived from the current bit position
  data[pos++] = 8 - (h->arr_bit_index % 8);
  *data_len = pos;
  return 0;
}

// Huffman encoding.
/*
Layout of the encoded data:
data[0]        : number of table entries (a full 256-entry table is stored as 0)
data[1 ~ n]    : table entries; each is the symbol (1 byte) followed by its
                 count, stored as zero or more 0xFF bytes and one final byte
                 below 0xFF; the count is the sum of those bytes
data[n+1]      : number of padding bits in the last payload byte
data[n+2 ~ m]  : payload (the concatenated codes)

Parameters:
in, in_len : input bytes and their number; in_len must be greater than 0,
             because a table length of 0 is read back as 256 entries and an
             empty input therefore cannot be represented
out        : receives a newly allocated buffer; the caller must free() it
out_len    : receives the length of *out in bytes

Returns 0 on success. Returns -1 on invalid arguments or allocation failure;
in that case *out is NULL and *out_len is 0 (when those pointers are given).
*/
int hm_encode(const uint8_t* in, const int in_len, uint8_t** out, int* out_len)
{
  int output_len;
  int output_index = 0;
  uint8_t *buf;
  huffman_def *h;

  if (out == NULL || out_len == NULL) {
    return -1;
  }
  *out = NULL;
  *out_len = 0;
  if (in == NULL || in_len <= 0) {
    return -1;
  }

  h = calloc(1, sizeof *h);
  if (h == NULL) {
    return -1;
  }
  hm_calc_count(h, in, in_len);
  hm_creat_tree(h);
  DBG_LOG("huffman tree point num:%d\n",h->tree_point_num);
  output_len = hm_calc_encode_len(h);
  buf = calloc((size_t)output_len + 1, sizeof *buf);
  if (buf == NULL) {
    hm_del_tree(h, h->tree);
    free(h);
    return -1;
  }
  hm_creat_index_table(h, buf, &output_index);
  DBG_LOG("output_len=%d\n", output_index);
  // The payload starts right after the table header.
  h->out = &buf[output_index];
  for (int i = 0; i < in_len; i++)
  {
    hm_encode_byte(h, in[i]);
  }
  DBG_LOG("bitcount:%d\n", h->arr_bit_index);
  // Now that the payload size is known, overwrite the padding-count byte
  // written by hm_creat_index_table with the real number of padding bits.
  buf[output_index-1] = h->out_len*8- h->arr_bit_index;
  DBG_LOG("fill with 0 by:%d\n", buf[output_index - 1]);
  hm_del_tree(h,h->tree);
  DBG_LOG("after del tree point num:%d\n",h->tree_point_num);

  DBG_LOG("lenth_in:%d,length_encode:%d\n",in_len, output_len);
  free(h);
  *out = buf;
  *out_len = output_len;
  return 0;
}


// Parse the table header of an encoded buffer and create one leaf per entry.
// h:     codec state; index_table, index_table_index, in and in_bit_count
//        are filled in.
// d:     encoded buffer.
// d_len: length of the encoded buffer in bytes.
// Returns the offset at which the payload starts, or -1 when the buffer is
// NULL, truncated, carries an impossible padding count, or a node cannot be
// allocated. On failure every leaf created so far is released and
// index_table_index is reset to 0.
static int hm_unpack_count(huffman_def *h,const uint8_t *d,int d_len)
{
  int num;
  int index=1;
  uint8_t temp;

  if(d==NULL||d_len<1){
    return -1;
  }
  // A stored table length of 0 stands for a full table of 256 entries.
  num = d[0]==0?256:d[0];
  for(int i=0;i<num;i++)
  {
    if(index>=d_len){
      goto malformed;
    }
    h->index_table[i]=hm_creat_tree_point(h);
    if(h->index_table[i]==NULL){
      goto malformed;
    }
    h->index_table_index++;
    h->index_table[i]->data=d[index];index++;
    // The count is a run of 0xFF bytes closed by one byte below 0xFF.
    do{
      if(index>=d_len){
        goto malformed;
      }
      temp= d[index];index++;
      h->index_table[i]->count+=temp;
    }while(temp==0xff);
  }
  if(index>=d_len){
    goto malformed;
  }
  temp= d[index];index++;
  // Padding can only be part of one byte and cannot exceed the payload.
  if(temp>7||(uint32_t)temp>(uint32_t)(d_len-index)*8){
    goto malformed;
  }
  h->in_bit_count=(d_len -index)*8-temp;
  h->in=&d[index];
  printf("bitcount:%d,\n",h->in_bit_count);
  return index;

malformed:
  while(h->index_table_index>0){
    h->index_table_index--;
    hm_del_tree_point(h,h->index_table[h->index_table_index]);
    h->index_table[h->index_table_index]=NULL;
  }
  return -1;
}


// Return the bit (0 or 1) at bit position index of buffer d.
// Bits are numbered from the least-significant bit of d[0] upwards.
static inline int hm_get_bit(const uint8_t *d,int index)
{
  return (d[index / 8] >> (index % 8)) & 1;
}


// Compare the input bits at the current position against the branch bits on
// the path from t up to and including the root.
// On a full match the bit position is advanced and the number of matched
// bits is returned; on the first mismatch 0 is returned and the position is
// left untouched.
static inline int hm_cmp_bits(huffman_def *h,huff_tree *t)
{
  int matched=0;
  for(huff_tree *node=t;node;node=node->parant){
    if(hm_get_bit(h->in,h->arr_bit_index+matched)!=node->pos){
      return 0;
    }
    matched++;
  }
  h->arr_bit_index+=matched;
  return matched;
}


// Decode one symbol by walking from the root down to a leaf.
// The payload is consumed backwards: each step reads the bit just below
// h->arr_bit_index and then moves the position down by one.
// h: codec state; h->arr_bit_index is decreased by the number of bits read.
// Returns the data field of the node reached. That is a leaf for well-formed
// input; if the bits run out before a leaf is reached the walk stops early
// instead of reading before the start of the payload. Returns 0 when no tree
// has been built.
static uint8_t hm_decode_byte(huffman_def *h)
{
  huff_tree *t=h->tree;
  if(t==NULL){
    return 0;
  }
  while(t->left&&t->right){
    if(h->arr_bit_index==0){
      // No bits left: index -1 would wrap around and shift by a negative amount.
      break;
    }
    h->arr_bit_index--;
    int bit=hm_get_bit(h->in,(int)h->arr_bit_index);
    t=(bit==t->left->pos)?t->left:t->right;
  }
  return t->data;
}



// Length of the decoded data: the sum of the counts of all table entries.
// The result is also written to the debug log.
static int hm_calc_decode_len(huffman_def *h)
{
  int total=0;
  int k=0;
  while(k<h->index_table_index){
    total+=h->index_table[k]->count;
    k++;
  }
  DBG_LOG("data len for decode:%d\n",total);
  return total;
}



// Huffman decoding.
/*
Parameters:
in, in_len : encoded buffer and its length in bytes
out        : receives a newly allocated buffer with the decoded bytes (one
             extra zero byte is allocated after them); the caller must free() it
out_len    : receives the number of decoded bytes

Returns 0 on success. Returns -1 on invalid arguments, when the table header
cannot be parsed, when the table describes no data, or when an allocation
fails; in that case *out is NULL and *out_len is 0 (when those pointers are
given).
*/
int hm_decode(const uint8_t* in, const int in_len, uint8_t** out, int* out_len)
{
  int decode_len;
  int decode_index;
  int rc = -1;
  uint8_t *decode_data = NULL;
  huffman_def *h;

  if (out == NULL || out_len == NULL) {
    return -1;
  }
  *out = NULL;
  *out_len = 0;
  if (in == NULL || in_len <= 0) {
    return -1;
  }

  h = calloc(1, sizeof *h);
  if (h == NULL) {
    return -1;
  }
  // A negative result from the header parser is treated as failure.
  if (hm_unpack_count(h, in, in_len) < 0) {
    goto out_free;
  }
  hm_creat_tree(h);
  DBG_LOG("huffman tree point num:%d\n",h->tree_point_num);
  if (h->tree == NULL) {
    goto out_free;
  }
  decode_len = hm_calc_decode_len(h);
  if (decode_len <= 0) {
    goto out_tree;
  }
  decode_data = calloc((size_t)decode_len + 1, sizeof *decode_data);
  if (decode_data == NULL) {
    goto out_tree;
  }
  // Symbols are decoded from the end of the payload towards its start, so
  // the output buffer is filled from its last byte down to its first.
  h->arr_bit_index = h->in_bit_count;
  for (decode_index = decode_len; decode_index > 0; decode_index--) {
    decode_data[decode_index-1] = hm_decode_byte(h);
  }
  *out = decode_data;
  *out_len = decode_len;
  rc = 0;

out_tree:
  hm_del_tree(h,h->tree);
  DBG_LOG("after del tree point num:%d\n",h->tree_point_num);
out_free:
  free(h);
  return rc;
}

实验

无重复数据的压缩情况

编写实验代码如下：


// Experiment 1: encode and decode 70 distinct byte values (no repetition),
// printing the encoded and the decoded buffers.
int main(int argc, char *argv[])
{
  const uint8_t file_data[]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,
    39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70};
  uint8_t *packed=0;
  uint8_t *unpacked=0;
  int packed_len;
  int unpacked_len;

  hm_encode(file_data,sizeof(file_data),&packed,&packed_len);
  print_data(packed,packed_len);

  hm_decode(packed,packed_len,&unpacked,&unpacked_len);
  print_data(unpacked,unpacked_len);

  // Both buffers are allocated by the codec and owned by the caller.
  free(packed);
  free(unpacked);
  return 0;
}

实验1

可以看到在输入数据没有重复性的时候压缩之后的数据反而增大了（原始数据长度为70，压缩之后的数据长度为196），扩大了接近3倍。原因是每个符号都要在索引表中占用2个字节（值和频次），仅表头就有1+70×2+1=142字节，其余54字节才是编码后的数据本身。

有重复数据的压缩情况

编写验证代码如下：


// Experiment 2: encode and decode a text with many repeated characters,
// printing the encoded buffer and then the decoded text.
int main(int argc, char *argv[])
{
  const uint8_t file_data[]="2023 5830628A000005830628A000015830628A000025830628A000035830628A000045830628A000055830628A000065830628A000075830628A000085830628A00009";
  uint8_t *packed=0;
  uint8_t *unpacked=0;
  int packed_len;
  int unpacked_len;

  // sizeof includes the terminating NUL of the literal, so it is encoded too.
  hm_encode(file_data,sizeof(file_data),&packed,&packed_len);
  print_data(packed,packed_len);

  hm_decode(packed,packed_len,&unpacked,&unpacked_len);
  printf("%s",(const char *)unpacked);

  // Both buffers are allocated by the codec and owned by the caller.
  free(packed);
  free(unpacked);
  return 0;
}

实验2
原始数据存在重复数据的时候，Huffman编码则可以大放异彩（原始数据长度136，压缩之后的数据长度76），数据量减小了接近一半。

数据中只有一种字符的情况

编写如下代码：

// Experiment 3: encode and decode 200 bytes that all hold the same value,
// printing the encoded and the decoded buffers.
int main(int argc, char *argv[])
{
  uint8_t file_data[200]={0};
  uint8_t *packed=0;
  uint8_t *unpacked=0;
  int packed_len;
  int unpacked_len;

  hm_encode(file_data,sizeof(file_data),&packed,&packed_len);
  print_data(packed,packed_len);

  hm_decode(packed,packed_len,&unpacked,&unpacked_len);
  print_data(unpacked,unpacked_len);

  // Both buffers are allocated by the codec and owned by the caller.
  free(packed);
  free(unpacked);
  return 0;
}