// Huffman

// Huffman.cpp : Defines the entry point for the console application.
//

#include "StdAfx.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define NUM_CHARS 256   // number of distinct symbols; frequency and code tables have one entry per symbol
#define MAX_SIZE 1000000   // size in bytes of the buffers allocated for reading a file


// Huffman code assigned to one symbol of the character set.
struct HuffCode
{
 bool used;              // true when the symbol has a code assigned
 unsigned char bits[2];  // code bits, stored most-significant bit first (see get_bit/set_bit)
 unsigned int size;      // number of valid bits in `bits`
};

// Node of the Huffman tree.
struct HuffNode
{
 char c;             // leaves only: the symbol this leaf stands for
 unsigned int freq;  // node frequency; for inner nodes the sum of both children
 HuffNode *left;     // left child, or NULL
 HuffNode *right;    // right child, or NULL
};


// This class is used in the construction of the Huffman tree.
// 优先级队列

class HuffNodePriorityQueue
{
public:
 HuffNode* HuffNodes[NUM_CHARS];
 unsigned int size;

 void init()
 {
  size=0;
 }

 void heapify(int i)
 {
  int l,r,smallest;
  HuffNode* tmp;
   
  l=2*i; /*left child*/
  r=2*i+1; /*right child*/
   
  if ((l < size)&&(HuffNodes[l]->freq < HuffNodes[i]->freq))
   smallest=l;
  else
   smallest=i;
  if ((r < size)&&(HuffNodes[r]->freq < HuffNodes[smallest]->freq))
   smallest=r;
   
  if (smallest!=i)
  {
   /*exchange to maintain heap property*/
   tmp=HuffNodes[smallest];
   HuffNodes[smallest]=HuffNodes[i];
   HuffNodes[i]=tmp;
   heapify(smallest);
  }
 }

 void addItem(HuffNode* node)
 {
  unsigned int i,parent; 
  size = size+1;
  i = size-1;
  parent = i/2;

  /*find the correct place to insert*/
  while ( (i > 0) && (HuffNodes[parent]->freq > node->freq) )
  {
   HuffNodes[i] = HuffNodes[parent];
   i = parent;
   parent = i/2;
  }
  HuffNodes[i]=node;
 }

 HuffNode* extractMin(void)
 {
  HuffNode* max;
  if (isEmpty())
   return 0;
  max=HuffNodes[0];
  HuffNodes[0]=HuffNodes[size-1];
  size=size-1;
  heapify(0);
  return max;
 }

 int isEmpty(void)
 {
  return size==0;
 }

 int isFull(void)
 {
  return size >= NUM_CHARS;
 }
};

 

// functions to be implemented
// (frequency counting, code-table recursion and the bit-level encode/decode loops)
void create_freq_array(unsigned int freqs[NUM_CHARS], unsigned char* string, unsigned int size);
void create_table_rec(HuffCode table[NUM_CHARS], HuffNode* node, unsigned char* code, unsigned int pos);
unsigned int compress(HuffCode table[NUM_CHARS], unsigned char* compressed, unsigned char* string, unsigned int size);
void uncompress(HuffNode* root, unsigned char* compressed, unsigned char* string, unsigned int size);

 

// functions that are already implemented
// (printing, bit access, tree and header construction, top-level compress/uncompress)
void print_freqs(unsigned int freqs[NUM_CHARS]); // print the character frequency table
void print_table(HuffCode table[NUM_CHARS]);
unsigned int get_bit(unsigned char* bits, unsigned int pos);
void set_bit(unsigned char* bits, unsigned int pos, unsigned int state);
HuffNode* build_Huffman_tree(unsigned int freqs[NUM_CHARS]);
void delete_Huffman_tree(HuffNode* root);
void create_table(HuffCode table[NUM_CHARS], HuffNode* tree);
void create_header(unsigned int freqs[NUM_CHARS], HuffCode table[NUM_CHARS], unsigned char* header, unsigned int size);
unsigned int huffman_uncompress(unsigned char* compressed, unsigned char** string);
unsigned int huffman_compress(unsigned char** compressed, unsigned char* string, unsigned int size);

// file helpers
void write_text(unsigned char* string, unsigned int size, char* filename);
void read_text(unsigned char** string, char* filename);
void write_binary(unsigned char* compressed, unsigned int size, char* filename);
void read_binary(unsigned char** compressed, char* filename);


//打印字符频率表
void print_freqs(unsigned int freqs[NUM_CHARS])
{
 int i;
 for(i = 0; i < NUM_CHARS; i++)
 {
  if(freqs[i])
   printf("char: %c freq: %-4d/n", i, freqs[i]);
 }
}


// Print the table of Huffman codes to stdout.
// For every used entry the symbol is printed followed by its code,
// one '0'/'1' digit per bit.
//   table: code table, indexed by symbol value
void print_table(HuffCode table[NUM_CHARS])
{
 printf("\ncharacter encodings:\n");
 for (unsigned int i = 0; i < NUM_CHARS; i++)
 {
  if (!table[i].used)
   continue;

  printf("char: %c code: ", (char)i);
  for (unsigned int j = 0; j < table[i].size; j++)
   printf("%u", get_bit(table[i].bits, j));
  printf("\n");
 }
 printf("\n");
}

// Build the Huffman tree from the array of symbol frequencies.
//   freqs: frequency of every symbol, indexed by symbol value; symbols with
//          frequency 0 do not get a leaf
// Returns a pointer to the root of the tree, or NULL when every frequency
// is 0. The nodes are allocated with new; release them with
// delete_Huffman_tree().
HuffNode* build_Huffman_tree(unsigned int freqs[NUM_CHARS])
{
 // fill a priority queue with one leaf per occurring symbol
 HuffNodePriorityQueue priority_queue;
 priority_queue.init();

 for (unsigned int i = 0; i < NUM_CHARS; i++)
 {
  if (freqs[i] == 0)
   continue;

  HuffNode* leaf = new HuffNode;
  leaf->c = i;
  leaf->freq = freqs[i];
  leaf->left = NULL;
  leaf->right = NULL;
  priority_queue.addItem(leaf);
 }

 printf("number of characters: %u\n", priority_queue.size);

 // repeatedly merge the two least frequent nodes under a new parent
 while (priority_queue.size > 1)
 {
  HuffNode* left = priority_queue.extractMin();
  HuffNode* right = priority_queue.extractMin();

  HuffNode* parent = new HuffNode;
  parent->c = 0;  // unused for inner nodes, but must not stay indeterminate
  parent->freq = left->freq + right->freq;
  parent->left = left;
  parent->right = right;
  priority_queue.addItem(parent);
 }

 // the single remaining node is the root (extractMin yields 0 if the queue is empty)
 return priority_queue.extractMin();
}


// Recursively free every node of the Huffman tree rooted at root.
// A NULL root is accepted and ignored: build_Huffman_tree() returns NULL
// when no symbol occurs at all.
void delete_Huffman_tree(HuffNode* root)
{
 if (root == NULL)
  return;

 // children first, then the node itself
 delete_Huffman_tree(root->left);
 delete_Huffman_tree(root->right);
 delete root;
}


// Create the table of Huffman codes for a tree.
// All entries are first reset to "unused"; create_table_rec() then does the
// actual work of filling in one entry per leaf.
//   table: receives the codes, indexed by symbol value
//   tree:  root of the Huffman tree; NULL leaves the table all unused
void create_table(HuffCode table[NUM_CHARS], HuffNode* tree)
{
 // initialize table data so that there are no undefined values
 for (unsigned int i = 0; i < NUM_CHARS; i++)
 {
  table[i].used = false;
  table[i].size = 0;
 }

 // an empty tree has no leaves and therefore no codes
 if (tree == NULL)
  return;

 // Scratch buffer holding the code of the path currently being walked.
 // It is zeroed because its bytes are copied into the table even when
 // fewer bits (possibly none, for a single-leaf tree) have been set.
 unsigned char code[2] = {0, 0};
 create_table_rec(table, tree, code, 0);
}


// Write one bit of a bit field.
//   bits:  the bit field; bit 0 is the most significant bit of bits[0]
//   pos:   index of the bit to write
//   state: zero clears the bit, any other value sets it
void set_bit(unsigned char* bits, unsigned int pos, unsigned int state)
{
 const unsigned int byte_index = pos / 8;
 const unsigned char mask = (unsigned char)(0x80u >> (pos % 8));  // 10000000 bin moved to the bit's column

 if (state)
  bits[byte_index] |= mask;
 else
  bits[byte_index] &= (unsigned char)~mask;
}


// Read one bit of a bit field.
//   bits: the bit field; bit 0 is the most significant bit of bits[0]
//   pos:  index of the bit to read
// Returns 1 when the bit is set, 0 otherwise.
unsigned int get_bit(unsigned char* bits, unsigned int pos)
{
 const unsigned int shift = 7 - (pos % 8);  // distance of the bit from the low end of its byte
 return (bits[pos / 8] >> shift) & 1u;
}


// Create the header that precedes the compressed data.
// Layout: sizeof(int) bytes holding `size`, followed by NUM_CHARS bytes
// holding the frequency of every symbol truncated to one byte.
//   freqs:  frequency of every symbol, indexed by symbol value
//   table:  not used by this function
//   header: receives sizeof(int) + NUM_CHARS bytes
//   size:   length of the original (uncompressed) string
void create_header(unsigned int freqs[NUM_CHARS], HuffCode table[NUM_CHARS], unsigned char* header, unsigned int size)
{
 unsigned char* out = header;

 // length of the original string comes first
 memcpy(out, &size, sizeof(int));
 out += sizeof(int);

 // then one byte per symbol of the character set
 for (unsigned int sym = 0; sym < NUM_CHARS; sym++)
  *out++ = (unsigned char)freqs[sym];
}


// Write the first `size` bytes of `string` to the file `filename`.
// The file is opened in text mode ("w"); nothing is written when it cannot
// be opened.
void write_text(unsigned char* string, unsigned int size, char* filename)
{
 FILE* out = fopen(filename, "w");
 if (out == NULL)
  return;

 fwrite(string, 1, size, out);
 fclose(out);
}


// read the contents of the text file filename into *string
void read_text(unsigned char** string, char* filename)
{
 unsigned char* str = new unsigned char[MAX_SIZE];

 FILE* infile;
 infile = fopen(filename, "rb");
 if (infile != NULL)
 {
  fread(str, 1, MAX_SIZE, infile);
  fclose(infile);
 }

 *string = str;
 return;
}


// Write `size` bytes of binary data from `compressed` to the file
// `filename`. The file is opened in binary mode ("wb"); nothing is written
// when it cannot be opened.
void write_binary(unsigned char* compressed, unsigned int size, char* filename)
{
 if (FILE* sink = fopen(filename, "wb"))
 {
  fwrite(compressed, 1, size, sink);
  fclose(sink);
 }
}


// read binary data from filename into *compressed
void read_binary(unsigned char** compressed, char* filename)
{
 unsigned char* comp = new unsigned char[MAX_SIZE];

 FILE* infile;
 infile = fopen(filename, "rb");
 if (infile != NULL)
 {
  fread(comp, 1, MAX_SIZE, infile);
  fclose(infile);
 }

 *compressed = comp;

 return;
}


// compress the size-many symbols in string into *compressed
// returns the size of the compressed data (in bytes)
unsigned int huffman_compress(unsigned char** compressed, unsigned char* string, unsigned int size)
{
 // create array of frequencies for all ascii characters
 unsigned int freqs[NUM_CHARS];  // frequencies of the ascii characters
 create_freq_array(freqs, string, size);

 //打印字符频率
 print_freqs(freqs);

 // create Huffman tree
 HuffNode* huff_tree = build_Huffman_tree(freqs);

 // create symbol table
 HuffCode table[NUM_CHARS];
 create_table(table, huff_tree);

 print_table(table);


 //create header
 unsigned int hsize = sizeof(int) + NUM_CHARS;  // size of the header
 unsigned char* header = new unsigned char[hsize];
 create_header(freqs, table, header, size);

 // create compressed text
 unsigned char* comp = new unsigned char[size];
 unsigned int codesize = compress(table, comp, string, size);

 printf("compressed string: (size: %d bit)/n", 8*codesize);
 for (unsigned int i = 0; i < 8*codesize; i++)
  printf("%d", get_bit(comp, i));
 printf("/n");


 // join header and compressed text
 // 将Header 和 编码结果 合并到一起(compressed_tmp)。
 unsigned char* compressed_tmp = new unsigned char[hsize + codesize];
 memcpy(compressed_tmp, header, hsize);
 memcpy(&compressed_tmp[hsize], comp, codesize);

 // free Huffman tree
 delete_Huffman_tree(huff_tree);

 // assing pointer to compressed data
 *compressed = compressed_tmp;

 // 释放临时性申请的内存
 delete[] header;
 delete[] comp;

 return hsize + codesize;
}


// uncompress the data in compressed into *string
// returns the size of the uncompressed text (in bytes)
unsigned int huffman_uncompress(unsigned char* compressed, unsigned char** string)
{
 unsigned int size = 0;  // size of the string
 unsigned int freqs[NUM_CHARS];  // frequencies of the ascii characters
 unsigned int hsize = sizeof(int) + NUM_CHARS;  // size of the header
 unsigned int i;

 // get number of symbols in compressed
 // 读取原始字符串的长度(存储在前4个字节中)
 memcpy(&size, compressed, sizeof(int));

 printf("size of compressed string: %d/n", size);

 // allocate memory
 unsigned char* str = new unsigned char[size];

 // restore frequency table
 for (i = 0; i < NUM_CHARS; i++)
  freqs[i] = compressed[sizeof(int) + i];


 // create Huffman tree
 HuffNode* huff_tree = build_Huffman_tree(freqs);

 /*
 HuffCode table[NUM_CHARS];
 create_table(table, huff_tree);
 print_table(table);
 */

 // uncompress the data
 uncompress(huff_tree, &compressed[hsize], str, size);

 // free Huffman tree
 delete_Huffman_tree(huff_tree);

 *string = str;

 printf("uncompressed string: (size: %d bit)/n", 8*sizeof(char)*size);
 for(i = 0; i < size; i++)
  printf("%c", (*string)[i]);
 printf("/n");

 return size;
}


int main(int argc, char **argv)
{
 unsigned int size = 0;
 unsigned char* string;

 // read file
 if (argc > 1)
 {
  string = new unsigned char[MAX_SIZE];
  FILE* infile;
  infile = fopen(argv[1], "r");
  size = fread(string, 1, MAX_SIZE, infile);
  fclose(infile);
 }
 else
 {
  size = 11;
  string = new unsigned char[size];
  string[0] = 't';
  string[1] = 'e';
  string[2] = 's';
  string[3] = 't';
  string[4] = ' ';
  string[5] = 's';
  string[6] = 't';
  string[7] = 'r';
  string[8] = 'i';
  string[9] = 'n';
  string[10] = 'g';
 }

 printf("size of input: %d/n", size);

 unsigned char* compressed;
 unsigned int csize = huffman_compress(&compressed, string, size);

 write_binary(compressed, csize, "testfile.bin");

 unsigned char* compressed2;
 read_binary(&compressed2, "testfile.bin");

 // uncompress text
 unsigned char* string2;
 unsigned int size2 = huffman_uncompress(compressed2, &string2);

 write_text(string2, size2, "testfile_new.txt");

 delete[] string;
 delete[] string2;
 delete[] compressed;
 delete[] compressed2;

 return 0;
}


// Count how often every symbol occurs in a string.
//   freqs:  receives the frequency of every symbol, indexed by symbol value
//   string: input symbols
//   size:   number of input symbols
// When the largest count does not fit into one byte, all counts are scaled
// so that the largest becomes 255. A symbol that does occur is never scaled
// down to 0; it keeps a frequency of at least 1.
void create_freq_array(unsigned int freqs[NUM_CHARS], unsigned char* string, unsigned int size)
{
 // start from an all-zero histogram
 for (unsigned int sym = 0; sym < NUM_CHARS; sym++)
  freqs[sym] = 0;

 // count the occurrences
 for (unsigned int k = 0; k < size; k++)
  freqs[string[k]]++;

 // find the largest count
 int peak = 0;
 for (unsigned int sym = 0; sym < NUM_CHARS; sym++)
 {
  if (freqs[sym] > (unsigned int)peak)
   peak = (int)freqs[sym];
 }

 // everything already fits into one byte: nothing to scale
 if (peak <= 0xff)
  return;

 // scale the counts into the range 1..255
 for (unsigned int sym = 0; sym < NUM_CHARS; sym++)
 {
  if (freqs[sym] == 0)
   continue;

  unsigned int scaled = (int)(freqs[sym] * 255.0 / peak + 0.5);
  freqs[sym] = (scaled == 0) ? 1 : scaled;
 }
}


// Recursive helper of create_table(): walk the Huffman tree depth-first and
// store the code of every leaf in the table entry of the leaf's symbol.
//   table: code table, indexed by symbol value
//   node:  current node; NULL is ignored
//   code:  path from the root to `node`, one bit per edge (0 = left,
//          1 = right); the first two bytes are copied into the table
//   pos:   current length of `code` in bits
// NOTE(review): HuffCode::bits holds only 16 bits and the buffer passed in by
// create_table() is 2 bytes, so a leaf deeper than 16 levels would write
// past `code` — confirm that the scaled frequencies cannot produce such a tree.
void create_table_rec(HuffCode table[NUM_CHARS], HuffNode* node, unsigned char* code, unsigned int pos)
{
 if (node == NULL)
  return;

 // leaf reached: store the code collected so far
 if (node->left == NULL && node->right == NULL)
 {
  // Index through unsigned char: plain char may be signed, which would
  // turn symbols >= 0x80 into negative (out-of-bounds) indices.
  HuffCode* entry = &table[(unsigned char)node->c];
  entry->used = true;
  entry->bits[0] = code[0];
  entry->bits[1] = code[1];
  entry->size = pos;
  return;
 }

 if (node->left)
 {
  set_bit(code, pos, 0);
  create_table_rec(table, node->left, code, pos+1);
 }

 if (node->right)
 {
  set_bit(code, pos, 1);
  create_table_rec(table, node->right, code, pos+1);
 }
}


// Huffman-encode a string: for every input symbol, append the symbol's bit
// sequence from `table` to the bit field `compressed`.
//   table:      code table, indexed by symbol value
//   compressed: receives the encoded bits; the caller provides a buffer
//               large enough for all of them
//   string:     input symbols
//   size:       number of input symbols
// Returns the size of the bit field in bytes, i.e. the number of bits
// rounded up to a whole byte (0 when no bit was written).
unsigned int compress(HuffCode table[NUM_CHARS], unsigned char* compressed, unsigned char* string, unsigned int size)
{
 unsigned int pos = 0;  // next free bit position in `compressed`

 for (unsigned int i = 0; i < size; i++)
 {
  const HuffCode* entry = &table[string[i]];

  for (unsigned int j = 0; j < entry->size; j++)
   set_bit(compressed, pos + j, get_bit((unsigned char*)entry->bits, j));

  pos += entry->size;
 }

 // Round up to whole bytes. The previous pos/8 + 1 reported one byte too
 // many whenever the bit count was an exact multiple of 8.
 return (pos + 7) / 8;
}

// Huffman-decode a bit field.
//   root:       root of the rebuilt Huffman tree; nothing is decoded when
//               it is NULL
//   compressed: the encoded bits
//   string:     receives the decoded symbols; must have room for `size`
//   size:       number of symbols to decode (length of the original string,
//               taken from the header by the caller)
void uncompress(HuffNode* root, unsigned char* compressed, unsigned char* string, unsigned int size)
{
 // without a tree there is no way to map bits to symbols
 if (root == NULL)
  return;

 unsigned int pos = 0;  // position of the next bit to read

 for (unsigned int decoded = 0; decoded < size; decoded++)
 {
  // start at the root and follow the bits down to a leaf
  HuffNode* node = root;
  while (node->left != NULL || node->right != NULL)
  {
   if (get_bit(compressed, pos++))
    node = node->right;  // 1: go to the right child
   else
    node = node->left;   // 0: go to the left child
  }

  string[decoded] = node->c;
 }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值