// Huffman.cpp : Defines the entry point for the console application.
//
#include "StdAfx.h"
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define NUM_CHARS 256 // number of characters in the character set
#define MAX_SIZE 1000000 // maximal buffer size for input from file
// Huffman code of a single symbol.
// The code is stored MSB-first in `bits`; only the first `size` bits are
// meaningful. With 256 distinct symbols a degenerate tree can be 255 levels
// deep, so the bit field must hold 255 bits; the previous 2-byte field
// overflowed as soon as any code exceeded 16 bits.
class HuffCode
{
public:
    enum { MAX_CODE_BYTES = 32 };        // 32 * 8 = 256 bits >= longest possible code

    bool used;                           // true if the symbol occurs in the input
    unsigned char bits[MAX_CODE_BYTES];  // code bits, MSB of bits[0] first
    unsigned int size;                   // code length in bits
};
// Node of the Huffman tree.
// Leaves carry a symbol in `c` and have no children; internal nodes carry the
// sum of their children's frequencies.
class HuffNode
{
public:
    // Symbol stored in a leaf. Kept as unsigned char so that byte values
    // 128..255 stay in 0..255 and can safely be used as table indices
    // (a plain `char` is signed on many platforms).
    unsigned char c;
    unsigned int freq; // frequency of the node (for non-leaves: sum of the child frequencies)
    HuffNode *left;    // pointer to left child node
    HuffNode *right;   // pointer to right child node
};
// This class is used in the construction of the Huffman tree.
// 优先级队列
class HuffNodePriorityQueue
{
public:
HuffNode* HuffNodes[NUM_CHARS];
unsigned int size;
void init()
{
size=0;
}
void heapify(int i)
{
int l,r,smallest;
HuffNode* tmp;
l=2*i; /*left child*/
r=2*i+1; /*right child*/
if ((l < size)&&(HuffNodes[l]->freq < HuffNodes[i]->freq))
smallest=l;
else
smallest=i;
if ((r < size)&&(HuffNodes[r]->freq < HuffNodes[smallest]->freq))
smallest=r;
if (smallest!=i)
{
/*exchange to maintain heap property*/
tmp=HuffNodes[smallest];
HuffNodes[smallest]=HuffNodes[i];
HuffNodes[i]=tmp;
heapify(smallest);
}
}
void addItem(HuffNode* node)
{
unsigned int i,parent;
size = size+1;
i = size-1;
parent = i/2;
/*find the correct place to insert*/
while ( (i > 0) && (HuffNodes[parent]->freq > node->freq) )
{
HuffNodes[i] = HuffNodes[parent];
i = parent;
parent = i/2;
}
HuffNodes[i]=node;
}
HuffNode* extractMin(void)
{
HuffNode* max;
if (isEmpty())
return 0;
max=HuffNodes[0];
HuffNodes[0]=HuffNodes[size-1];
size=size-1;
heapify(0);
return max;
}
int isEmpty(void)
{
return size==0;
}
int isFull(void)
{
return size >= NUM_CHARS;
}
};
// Forward declarations.
// Core Huffman routines (defined at the end of this file):
void create_freq_array(unsigned int freqs[NUM_CHARS], unsigned char* string, unsigned int size);
void create_table_rec(HuffCode table[NUM_CHARS], HuffNode* node, unsigned char* code, unsigned int pos);
unsigned int compress(HuffCode table[NUM_CHARS], unsigned char* compressed, unsigned char* string, unsigned int size);
void uncompress(HuffNode* root, unsigned char* compressed, unsigned char* string, unsigned int size);
// Printing, bit access, tree handling, header creation and file I/O helpers:
void print_freqs(unsigned int freqs[NUM_CHARS]); // print the character frequency table
void print_table(HuffCode table[NUM_CHARS]);
unsigned int get_bit(unsigned char* bits, unsigned int pos);
void set_bit(unsigned char* bits, unsigned int pos, unsigned int state);
HuffNode* build_Huffman_tree(unsigned int freqs[NUM_CHARS]);
void delete_Huffman_tree(HuffNode* root);
void create_table(HuffCode table[NUM_CHARS], HuffNode* tree);
void create_header(unsigned int freqs[NUM_CHARS], HuffCode table[NUM_CHARS], unsigned char* header, unsigned int size);
unsigned int huffman_uncompress(unsigned char* compressed, unsigned char** string);
unsigned int huffman_compress(unsigned char** compressed, unsigned char* string, unsigned int size);
void write_text(unsigned char* string, unsigned int size, char* filename);
void read_text(unsigned char** string, char* filename);
void write_binary(unsigned char* compressed, unsigned int size, char* filename);
void read_binary(unsigned char** compressed, char* filename);
// Print the frequency of every symbol that occurs at least once.
// freqs: NUM_CHARS counters indexed by symbol value.
void print_freqs(unsigned int freqs[NUM_CHARS])
{
    for (int i = 0; i < NUM_CHARS; i++)
    {
        if (freqs[i])
            printf("char: %c freq: %-4u\n", i, freqs[i]); // "\n", not the literal "/n"
    }
}
// Print the table of Huffman codes to the screen.
// Only entries marked `used` are listed; each code is printed bit by bit.
void print_table(HuffCode table[NUM_CHARS])
{
    printf("\ncharacter encodings:\n");
    for (unsigned int i = 0; i < NUM_CHARS; i++)
    {
        if (!table[i].used)
            continue;

        printf("char: %c code: ", (char)i);
        for (unsigned int j = 0; j < table[i].size; j++)
            printf("%u", get_bit(table[i].bits, j));
        printf("\n");
    }
    printf("\n");
}
// Build the Huffman tree from the array of symbol frequencies.
// freqs: NUM_CHARS counters; symbols with frequency 0 get no leaf.
// Returns the root of the tree (allocated with new; release it with
// delete_Huffman_tree), or NULL when every frequency is 0.
HuffNode* build_Huffman_tree(unsigned int freqs[NUM_CHARS])
{
    // one leaf per occurring symbol, inserted in ascending symbol order
    HuffNodePriorityQueue priority_queue;
    priority_queue.init();
    for (unsigned int i = 0; i < NUM_CHARS; i++)
    {
        if (freqs[i] == 0)
            continue;

        HuffNode* leaf = new HuffNode;
        leaf->c = (unsigned char)i;
        leaf->freq = freqs[i];
        leaf->left = NULL;
        leaf->right = NULL;
        priority_queue.addItem(leaf);
    }
    printf("number of characters: %u\n", priority_queue.size);

    // repeatedly merge the two least frequent nodes until one root remains
    while (priority_queue.size > 1)
    {
        HuffNode* left = priority_queue.extractMin();
        HuffNode* right = priority_queue.extractMin();
        HuffNode* parent = new HuffNode;
        parent->c = 0; // unused for internal nodes; set so the field is never indeterminate
        parent->freq = left->freq + right->freq;
        parent->left = left;
        parent->right = right;
        priority_queue.addItem(parent);
    }

    // the single remaining node is the root (NULL if the queue was empty)
    return priority_queue.extractMin();
}
// Recursively free all nodes of the Huffman tree.
// root: root of the (sub)tree to free; NULL is accepted and ignored, which
// covers the empty tree returned for an input without any symbols.
void delete_Huffman_tree(HuffNode* root)
{
    if (root == NULL)
        return;

    // children first, then the node itself
    delete_Huffman_tree(root->left);
    delete_Huffman_tree(root->right);
    delete root;
}
// Create the table of Huffman codes for all symbols.
// table: NUM_CHARS entries, fully reset here before being filled.
// tree:  root of the Huffman tree; NULL (empty input) leaves every entry unused.
// The actual traversal is done by create_table_rec.
void create_table(HuffCode table[NUM_CHARS], HuffNode* tree)
{
    // initialize table data so that there are no undefined values
    for (unsigned int i = 0; i < NUM_CHARS; i++)
    {
        table[i].used = false;
        table[i].size = 0;
        memset(table[i].bits, 0, sizeof(table[i].bits));
    }

    if (tree == NULL)
        return;

    // Scratch buffer holding the path from the root to the current node.
    // It is sized like HuffCode::bits so it can hold any code the table can
    // store, and zeroed so no indeterminate bytes are ever copied.
    unsigned char code[sizeof(table[0].bits)] = {0};
    create_table_rec(table, tree, code, 0);
}
// Write one bit into a bit field.
// bits:  byte array holding the bit field (bit 0 is the MSB of bits[0])
// pos:   index of the bit to change
// state: zero clears the bit, any other value sets it
void set_bit(unsigned char* bits, unsigned int pos, unsigned int state)
{
    unsigned char* const cell = bits + pos / 8;
    const unsigned char mask = (unsigned char)(0x80 >> (pos % 8)); // 10000000 bin moved to the target bit

    if (state != 0)
        *cell = (unsigned char)(*cell | mask);
    else
        *cell = (unsigned char)(*cell & ~mask);
}
// Read one bit from a bit field.
// bits: byte array holding the bit field (bit 0 is the MSB of bits[0])
// pos:  index of the bit to read
// Returns 1 if the bit is set, 0 otherwise.
unsigned int get_bit(unsigned char* bits, unsigned int pos)
{
    const unsigned int shift = 7 - (pos % 8); // distance of the target bit from the LSB
    return (bits[pos / 8] >> shift) & 1u;
}
// Create the header that precedes the compressed data.
// Layout: sizeof(int) bytes with the symbol count `size` (native byte order),
// followed by NUM_CHARS bytes, one truncated-to-a-byte frequency per symbol.
// freqs:  symbol frequencies
// table:  code table (not used by this function)
// header: output buffer of at least sizeof(int) + NUM_CHARS bytes
// size:   number of symbols in the uncompressed input
void create_header(unsigned int freqs[NUM_CHARS], HuffCode table[NUM_CHARS], unsigned char* header, unsigned int size)
{
    (void)table;

    // symbol count first
    memcpy(header, &size, sizeof(int));

    // then one byte per symbol frequency
    unsigned char* out = header + sizeof(int);
    for (unsigned int sym = 0; sym < NUM_CHARS; ++sym)
        out[sym] = (unsigned char)freqs[sym];
}
// Write `size` bytes from `string` to the file `filename` (opened in text mode).
// Failures to open, write or close the file are reported on stderr instead of
// being silently ignored; the function itself still returns nothing.
void write_text(unsigned char* string, unsigned int size, char* filename)
{
    FILE* outfile = fopen(filename, "w");
    if (outfile == NULL)
    {
        perror(filename);
        return;
    }

    if (fwrite(string, 1, size, outfile) != size)
        fprintf(stderr, "%s: could not write all %u bytes\n", filename, size);

    // fclose flushes buffered data, so it can fail too
    if (fclose(outfile) != 0)
        perror(filename);
}
// Read the contents of the file `filename` into a newly allocated buffer.
// *string receives a buffer of MAX_SIZE bytes (allocated with new[]; the
// caller releases it with delete[]). At most MAX_SIZE bytes are read; the
// buffer is zero-filled first so bytes beyond the file content, or the whole
// buffer if the file cannot be opened, have a defined value.
void read_text(unsigned char** string, char* filename)
{
    unsigned char* str = new unsigned char[MAX_SIZE](); // () zero-initializes

    FILE* infile = fopen(filename, "rb");
    if (infile == NULL)
    {
        perror(filename);
    }
    else
    {
        size_t got = fread(str, 1, MAX_SIZE, infile);
        (void)got; // the interface has no way to report the byte count
        if (ferror(infile))
            perror(filename);
        fclose(infile);
    }

    *string = str;
}
// Write `size` bytes of binary data from `compressed` to the file `filename`.
// Failures to open, write or close the file are reported on stderr instead of
// being silently ignored; the function itself still returns nothing.
void write_binary(unsigned char* compressed, unsigned int size, char* filename)
{
    FILE* outfile = fopen(filename, "wb");
    if (outfile == NULL)
    {
        perror(filename);
        return;
    }

    if (fwrite(compressed, 1, size, outfile) != size)
        fprintf(stderr, "%s: could not write all %u bytes\n", filename, size);

    // fclose flushes buffered data, so it can fail too
    if (fclose(outfile) != 0)
        perror(filename);
}
// Read binary data from the file `filename` into a newly allocated buffer.
// *compressed receives a buffer of MAX_SIZE bytes (allocated with new[]; the
// caller releases it with delete[]). At most MAX_SIZE bytes are read; the
// buffer is zero-filled first, so a missing or short file yields a header
// that describes an empty input rather than indeterminate data.
void read_binary(unsigned char** compressed, char* filename)
{
    unsigned char* comp = new unsigned char[MAX_SIZE](); // () zero-initializes

    FILE* infile = fopen(filename, "rb");
    if (infile == NULL)
    {
        perror(filename);
    }
    else
    {
        size_t got = fread(comp, 1, MAX_SIZE, infile);
        (void)got; // the interface has no way to report the byte count
        if (ferror(infile))
            perror(filename);
        fclose(infile);
    }

    *compressed = comp;
}
// compress the size-many symbols in string into *compressed
// returns the size of the compressed data (in bytes)
unsigned int huffman_compress(unsigned char** compressed, unsigned char* string, unsigned int size)
{
// create array of frequencies for all ascii characters
unsigned int freqs[NUM_CHARS]; // frequencies of the ascii characters
create_freq_array(freqs, string, size);
//打印字符频率
print_freqs(freqs);
// create Huffman tree
HuffNode* huff_tree = build_Huffman_tree(freqs);
// create symbol table
HuffCode table[NUM_CHARS];
create_table(table, huff_tree);
print_table(table);
//create header
unsigned int hsize = sizeof(int) + NUM_CHARS; // size of the header
unsigned char* header = new unsigned char[hsize];
create_header(freqs, table, header, size);
// create compressed text
unsigned char* comp = new unsigned char[size];
unsigned int codesize = compress(table, comp, string, size);
printf("compressed string: (size: %d bit)/n", 8*codesize);
for (unsigned int i = 0; i < 8*codesize; i++)
printf("%d", get_bit(comp, i));
printf("/n");
// join header and compressed text
// 将Header 和 编码结果 合并到一起(compressed_tmp)。
unsigned char* compressed_tmp = new unsigned char[hsize + codesize];
memcpy(compressed_tmp, header, hsize);
memcpy(&compressed_tmp[hsize], comp, codesize);
// free Huffman tree
delete_Huffman_tree(huff_tree);
// assing pointer to compressed data
*compressed = compressed_tmp;
// 释放临时性申请的内存
delete[] header;
delete[] comp;
return hsize + codesize;
}
// Uncompress the data in `compressed` (header followed by packed code bits).
// *string receives a new[]-allocated buffer with the decoded symbols; the
// caller releases it with delete[].
// Returns the number of decoded symbols (bytes).
unsigned int huffman_uncompress(unsigned char* compressed, unsigned char** string)
{
    const unsigned int hsize = sizeof(int) + NUM_CHARS; // size of the header
    unsigned int i;

    // the original symbol count is stored in the first sizeof(int) bytes
    unsigned int size = 0;
    memcpy(&size, compressed, sizeof(int));
    printf("size of compressed string: %u\n", size);

    // restore the frequency table and rebuild the same tree the compressor used
    unsigned int freqs[NUM_CHARS];
    for (i = 0; i < NUM_CHARS; i++)
        freqs[i] = compressed[sizeof(int) + i];
    HuffNode* huff_tree = build_Huffman_tree(freqs);

    // A header that announces symbols but has an all-zero frequency table
    // cannot be decoded; treat it as empty instead of walking a NULL tree.
    if (huff_tree == NULL && size > 0)
    {
        fprintf(stderr, "huffman_uncompress: header has no symbol frequencies\n");
        size = 0;
    }

    unsigned char* str = new unsigned char[size];
    if (huff_tree != NULL)
    {
        uncompress(huff_tree, &compressed[hsize], str, size);
        delete_Huffman_tree(huff_tree);
    }
    *string = str;

    printf("uncompressed string: (size: %u bit)\n", 8 * size);
    for (i = 0; i < size; i++)
        printf("%c", str[i]);
    printf("\n");
    return size;
}
// Entry point: compresses the file named by argv[1] (or a built-in sample
// string), writes the result to testfile.bin, reads it back, uncompresses it
// and writes the recovered text to testfile_new.txt.
// Returns 0 on success, 1 if the input file cannot be opened.
int main(int argc, char **argv)
{
    unsigned int size = 0;
    unsigned char* string = NULL;

    // read input
    if (argc > 1)
    {
        FILE* infile = fopen(argv[1], "r");
        if (infile == NULL)
        {
            perror(argv[1]);
            return 1;
        }
        string = new unsigned char[MAX_SIZE];
        size = (unsigned int)fread(string, 1, MAX_SIZE, infile);
        fclose(infile);
    }
    else
    {
        static const char sample[] = "test string";
        size = sizeof(sample) - 1; // 11 symbols, terminator excluded
        string = new unsigned char[size];
        memcpy(string, sample, size);
    }
    printf("size of input: %u\n", size);

    // The file helpers take a non-const char*, so keep the names in writable
    // arrays rather than passing string literals.
    char bin_name[] = "testfile.bin";
    char txt_name[] = "testfile_new.txt";

    // compress and store
    unsigned char* compressed;
    unsigned int csize = huffman_compress(&compressed, string, size);
    write_binary(compressed, csize, bin_name);

    // load and uncompress
    unsigned char* compressed2;
    read_binary(&compressed2, bin_name);
    unsigned char* string2;
    unsigned int size2 = huffman_uncompress(compressed2, &string2);
    write_text(string2, size2, txt_name);

    delete[] string;
    delete[] string2;
    delete[] compressed;
    delete[] compressed2;
    return 0;
}
// Count how often each symbol occurs in `string` and store the counts in `freqs`.
// If the largest count does not fit into one byte, all counts are scaled so
// that the largest becomes 255; a symbol that does occur is never scaled to 0.
// freqs:  output, NUM_CHARS counters indexed by symbol value
// string: input symbols
// size:   number of symbols in `string`
void create_freq_array(unsigned int freqs[NUM_CHARS], unsigned char* string, unsigned int size)
{
    unsigned int peak = 0; // largest count seen so far

    // start from all-zero counters
    memset(freqs, 0, NUM_CHARS * sizeof(freqs[0]));

    for (unsigned int k = 0; k < size; ++k)
    {
        const unsigned int count = ++freqs[string[k]];
        if (count > peak)
            peak = count;
    }

    // every count already fits into 0..255: nothing to scale
    if (peak <= 0xff)
        return;

    for (unsigned int sym = 0; sym < NUM_CHARS; ++sym)
    {
        if (freqs[sym] == 0)
            continue;

        // round to nearest, but keep occurring symbols at a minimum of 1
        const unsigned int scaled = (unsigned int)(freqs[sym] * 255.0 / peak + 0.5);
        freqs[sym] = (scaled == 0) ? 1 : scaled;
    }
}
// Recursive depth-first walk over the Huffman tree that stores the code of
// every leaf in the corresponding entry of `table`.
// table: code table to fill, indexed by symbol value
// node:  current node; NULL is ignored
// code:  scratch bit field tracking the path from the root to `node`
//        (0 = left, 1 = right); must hold at least sizeof(HuffCode::bits) bytes
// pos:   current length of `code` in bits, i.e. the depth of `node`
// Terminates the program with an error if a code would not fit into
// HuffCode::bits, instead of writing past the end of the bit fields.
void create_table_rec(HuffCode table[NUM_CHARS], HuffNode* node, unsigned char* code, unsigned int pos)
{
    if (node == NULL)
        return;

    // leaf reached: store the path collected so far as this symbol's code
    if (node->left == NULL && node->right == NULL)
    {
        // go through unsigned char so symbols >= 128 never yield a negative index
        HuffCode& entry = table[(unsigned char)node->c];
        entry.used = true;
        entry.size = pos;
        memset(entry.bits, 0, sizeof(entry.bits));
        memcpy(entry.bits, code, (pos + 7) / 8); // only the bytes that carry code bits
        return;
    }

    // going one level deeper needs bit `pos` to exist in the bit fields
    if (pos >= 8 * sizeof(table[0].bits))
    {
        fprintf(stderr, "create_table_rec: Huffman code longer than %u bits\n",
                (unsigned int)(8 * sizeof(table[0].bits)));
        exit(EXIT_FAILURE);
    }

    if (node->left)
    {
        set_bit(code, pos, 0);
        create_table_rec(table, node->left, code, pos + 1);
    }
    if (node->right)
    {
        set_bit(code, pos, 1);
        create_table_rec(table, node->right, code, pos + 1);
    }
}
// Huffman-encode a buffer of symbols.
// Appends, for every symbol of `string`, the symbol's code bits to the bit
// field `compressed`.
// table:      code table indexed by symbol value
// compressed: output bit field; the caller provides enough room for all bits
// string:     input symbols
// size:       number of symbols in `string`
// Returns the number of bytes of `compressed` that contain code bits, i.e. the
// bit count rounded up to whole bytes (0 when no bits were written). The
// earlier `pos/8 + 1` reported one byte too many whenever the bit count was a
// multiple of 8.
unsigned int compress(HuffCode table[NUM_CHARS], unsigned char* compressed, unsigned char* string, unsigned int size)
{
    unsigned int pos = 0; // next free bit position in `compressed`

    for (unsigned int i = 0; i < size; i++)
    {
        HuffCode& code = table[string[i]];
        for (unsigned int j = 0; j < code.size; j++)
            set_bit(compressed, pos + j, get_bit(code.bits, j));
        pos += code.size;
    }

    return (pos + 7) / 8;
}
// Huffman-decode a bit field.
// root:       root of the rebuilt Huffman tree; with a NULL root nothing is decoded
// compressed: packed code bits
// string:     output buffer for the decoded symbols (at least `size` bytes)
// size:       number of symbols to decode (the original input length, taken
//             from the header)
void uncompress(HuffNode* root, unsigned char* compressed, unsigned char* string, unsigned int size)
{
    if (root == NULL)
        return;

    unsigned int pos = 0; // read position in the bit field

    for (unsigned int decoded = 0; decoded < size; decoded++)
    {
        // walk from the root to a leaf: 1 = right child, 0 = left child.
        // A tree consisting of a single leaf consumes no bits per symbol.
        HuffNode* node = root;
        while (node->left != NULL || node->right != NULL)
            node = get_bit(compressed, pos++) ? node->right : node->left;

        string[decoded] = node->c;
    }
}