Huffman编码的数据结构
/*
 * One node of the Huffman tree.
 *
 * A node is either a leaf (isLeaf != 0) that carries a symbol, or an
 * internal node that carries two child pointers.  The two payloads share
 * storage through the anonymous union, so only the member matching
 * isLeaf is meaningful at any time.
 */
typedef struct huffman_node_tag
{
    unsigned char isLeaf;               /* non-zero when the node is a leaf */
    unsigned long count;                /* weight: accumulated symbol count of this subtree */
    struct huffman_node_tag *parent;    /* parent node */

    union
    {
        /* Internal node: children reached by emitting a 0 bit / a 1 bit. */
        struct
        {
            struct huffman_node_tag *zero;
            struct huffman_node_tag *one;
        };

        /* Leaf node: the symbol this leaf stands for. */
        unsigned char symbol;
    };
} huffman_node;
霍夫曼码结构
/*
 * The Huffman code assigned to one symbol: a bit string together with
 * its length in bits.
 */
typedef struct huffman_code_tag
{
    /* Number of bits used by this code. */
    unsigned long numbits;

    /* Pointer to the bytes that hold the code's bit string. */
    unsigned char *bits;
} huffman_code;
第一遍扫描文件:从指定文件中逐字节读取数据,统计每个符号出现的次数(概率在后面由次数除以符号总数得到),并为出现过的符号建立相应的树叶节点
/*
 * First pass over the input: count how often each byte value occurs.
 *
 * pSF  table indexed by byte value; each used entry points to the leaf
 *      node of that symbol.  It is passed through init_frequencies()
 *      first (defined elsewhere; presumably it clears the table).
 * in   stream that is read byte by byte until fgetc() reports EOF.
 *
 * Returns the total number of bytes read.
 */
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
    unsigned int symbols_read = 0;
    int ch;

    init_frequencies(pSF);

    for (ch = fgetc(in); ch != EOF; ch = fgetc(in))
    {
        unsigned char sym = ch;

        /* First occurrence of this symbol: give it a leaf node. */
        if ((*pSF)[sym] == NULL)
            (*pSF)[sym] = new_leaf_node(sym);

        ++(*pSF)[sym]->count;
        ++symbols_read;
    }

    return symbols_read;
}
======================================================================
/*
 * Allocate a leaf node for `symbol`.
 *
 * The new node is marked as a leaf, has an occurrence count of 0 and no
 * parent.  Returns NULL when the allocation fails; the caller must check
 * the result before using it.
 */
static huffman_node*
new_leaf_node(unsigned char symbol)
{
    huffman_node *p = malloc(sizeof *p);

    if (p == NULL)
        return NULL;

    p->isLeaf = 1;
    p->symbol = symbol;
    p->count = 0;       /* no occurrences counted yet */
    p->parent = NULL;   /* not linked into a tree yet */
    return p;
}
/*
 * Allocate an internal (non-leaf) node.
 *
 * count  weight of the new node; the caller passes the sum of the two
 *        children's counts.
 * zero   child reached by a 0 bit.
 * one    child reached by a 1 bit.
 *
 * The new node has no parent yet, and the children's parent pointers are
 * NOT updated here; that is left to the caller.  Returns NULL when the
 * allocation fails.
 */
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
    huffman_node *p = malloc(sizeof *p);

    if (p == NULL)
        return NULL;

    p->isLeaf = 0;
    p->count = count;
    p->zero = zero;
    p->one = one;
    p->parent = NULL;
    return p;
}
统计各符号概率
/*
 * Convert the per-symbol occurrence counts into relative frequencies.
 *
 * SF           table of leaf nodes indexed by symbol; unused entries are NULL.
 * st           receives count / total_count in st->freq[] for every symbol
 *              (0 for symbols that never occurred).
 * total_count  total number of symbols that were counted.
 *
 * Returns 1 when the individual counts add up to total_count, 0 otherwise.
 */
int huffST_getSymFrequencies(SymbolFrequencies *SF, huffman_stat *st, int total_count)
{
    int counted = 0;
    int sym;

    for (sym = 0; sym < MAX_SYMBOLS; ++sym)
    {
        /* Symbol never seen: its frequency is zero. */
        if (!(*SF)[sym])
        {
            st->freq[sym] = 0;
            continue;
        }

        st->freq[sym] = (float)(*SF)[sym]->count / total_count;
        counted += (*SF)[sym]->count;
    }

    /* Consistency check: the parts must add up to the whole. */
    return counted == total_count;
}
构建霍夫曼树及生成霍夫曼码
/*
 * Build the Huffman tree from the symbol table and derive the code of
 * every symbol.
 *
 * pSF  table of leaf nodes; it is reordered in place and, on success,
 *      (*pSF)[0] holds the root of the finished tree (NULL when the
 *      table contained no symbols).
 *
 * Returns a newly allocated SymbolEncoder, or NULL if an allocation
 * fails.  An empty table yields an encoder with no codes.
 *
 * NOTE(review): the logic relies on SFComp (defined elsewhere) ordering
 * nodes by ascending count with NULL entries last - confirm.
 */
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
    unsigned int i = 0;
    unsigned int n = 0;
    huffman_node *m1 = NULL, *m2 = NULL;
    SymbolEncoder *pSE = NULL;

#if 0
    printf("BEFORE SORT\n");
    print_freqs(pSF);
#endif

    /* Sort all nodes so the two lightest ones end up in slots 0 and 1. */
    qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);

#if 0
    printf("AFTER SORT\n");
    print_freqs(pSF);
#endif

    /* Count the symbols that are actually present. */
    for(n = 0; n < MAX_SYMBOLS && (*pSF)[n]; ++n)
        ;

    /*
     * Each pass merges the two lightest nodes into one internal node, so
     * n symbols need n - 1 passes.  The bound is written as i + 1 < n
     * because n is unsigned: "n - 1" would wrap around for n == 0 (empty
     * input) and the loop would then dereference NULL entries.
     */
    for(i = 0; i + 1 < n; ++i)
    {
        huffman_node *merged;

        m1 = (*pSF)[0];
        m2 = (*pSF)[1];

        merged = new_nonleaf_node(m1->count + m2->count, m1, m2);
        if(merged == NULL)
            return NULL;    /* table untouched by this pass; caller still owns the nodes */

        m1->parent = m2->parent = merged;
        (*pSF)[0] = merged;
        (*pSF)[1] = NULL;   /* slot 1 is now free */

        /* Restore the ordering for the next pass. */
        qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
    }

    /* Generate the codes by walking the finished tree. */
    pSE = malloc(sizeof *pSE);
    if(pSE == NULL)
        return NULL;
    memset(pSE, 0, sizeof *pSE);
    build_symbol_encoder((*pSF)[0], pSE);
    return pSE;
}
/*
 * Walk the tree rooted at `subtree` and store the code of every leaf in
 * the encoder table, indexed by the leaf's symbol.
 *
 * subtree  root of the (sub)tree to visit; NULL is allowed and ignored.
 * pSF      encoder table that receives the codes (despite its name this
 *          is a SymbolEncoder, not a frequency table).
 *
 * The function produces no value, so it is declared void explicitly
 * instead of relying on the implicit-int rule that C99 removed.
 */
void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSF)
{
    if(subtree == NULL)
        return;

    if(subtree->isLeaf)
    {
        /* Leaf reached: build the code for its symbol. */
        (*pSF)[subtree->symbol] = new_code(subtree);
        return;
    }

    /* Internal node: visit the 0 branch first, then the 1 branch. */
    build_symbol_encoder(subtree->zero, pSF);
    build_symbol_encoder(subtree->one, pSF);
}
/*
 * Build the Huffman code for a leaf by climbing from the leaf to the root.
 *
 * Every step contributes one bit: 1 when the current node is its
 * parent's `one` child, 0 otherwise.  Bits are collected leaf-to-root,
 * stored starting at the least significant bit of each byte, and the
 * buffer grows by one byte whenever a new byte is started.  The
 * collected string is finally passed to reverse_bits() (defined
 * elsewhere) so that the code is in root-to-leaf order.
 *
 * A node without a parent yields a code with numbits == 0 and
 * bits == NULL.
 *
 * Returns a newly allocated huffman_code that owns `bits`, or NULL if an
 * allocation fails (nothing is leaked in that case).
 */
static huffman_code*
new_code(const huffman_node* leaf)
{
    unsigned long numbits = 0;      /* code length so far */
    unsigned char *bits = NULL;     /* buffer holding the code bits */
    huffman_code *p;

    while(leaf && leaf->parent)
    {
        const huffman_node *parent = leaf->parent;
        unsigned char cur_bit = (unsigned char)(numbits % 8);  /* bit position inside the current byte */
        unsigned long cur_byte = numbits / 8;                   /* index of the current byte */

        /* Starting a new byte: grow the buffer by one byte and clear it. */
        if(cur_bit == 0)
        {
            size_t newSize = cur_byte + 1;
            /* Keep the old pointer until realloc succeeds, so it is not leaked on failure. */
            unsigned char *grown = realloc(bits, newSize);

            if(grown == NULL)
            {
                free(bits);
                return NULL;
            }
            bits = grown;
            bits[newSize - 1] = 0;
        }

        /* A `one` child contributes a 1 bit; a `zero` child leaves the bit cleared. */
        if(leaf == parent->one)
            bits[cur_byte] |= (unsigned char)(1u << cur_bit);

        ++numbits;
        leaf = parent;
    }

    /* The bits were gathered leaf-to-root; put them in root-to-leaf order. */
    if(bits)
        reverse_bits(bits, numbits);

    p = malloc(sizeof *p);
    if(p == NULL)
    {
        free(bits);
        return NULL;
    }
    p->numbits = numbits;
    p->bits = bits;
    return p;
}
将Huffman码表写入文件
/*
 * Excerpt from the code-table writer (the enclosing function, and the
 * declarations of i, se and out, are not part of this excerpt).
 * For every symbol that has a code, one record is written to `out`:
 *   1 byte  - the symbol value
 *   1 byte  - the code length in bits (fputc writes a single byte, so
 *             only the low 8 bits of numbits are stored)
 *   N bytes - the code bits, N = numbytes_from_numbits(numbits)
 * The enclosing function returns 1 as soon as the code bytes cannot be
 * written completely.
 */
for(i = 0; i < MAX_SYMBOLS; ++i)
{
huffman_code *p = (*se)[i];
if(p)
{
unsigned int numbytes;
// write the 1-byte symbol
fputc((unsigned char)i, out);
// write the 1-byte code length (in bits)
fputc(p->numbits, out);
// write the code bits; numbytes_from_numbits() is defined elsewhere and
// presumably rounds the bit count up to whole bytes - confirm
numbytes = numbytes_from_numbits(p->numbits);
if(fwrite(p->bits, 1, numbytes, out) != numbytes)
return 1;
}
}
将统计数据写入txt文件
/*
 * Write the per-symbol statistics as a tab-separated text table.
 *
 * st         statistics to print: freq[], numbits[] and bits[] are read
 *            for every symbol index.
 * out_Table  destination text stream.
 *
 * After a header line, one row is written per symbol index from 0 to
 * MAX_SYMBOLS - 1: the index in decimal, its frequency, its code length,
 * and the code itself as one digit per bit (nothing when the length is 0).
 */
void output_huffman_statistics(huffman_stat *st, FILE *out_Table)
{
    int sym;

    fprintf(out_Table,"symbol\t freq\t codelength\t code\n");

    for (sym = 0; sym < MAX_SYMBOLS; ++sym)
    {
        int bit;

        fprintf(out_Table,"%d\t ",sym);                 /* symbol value */
        fprintf(out_Table,"%f\t ",st->freq[sym]);       /* relative frequency */
        fprintf(out_Table,"%d\t ",st->numbits[sym]);    /* code length in bits */

        /* The loop body never runs for a zero-length code. */
        for (bit = 0; bit < st->numbits[sym]; ++bit)
        {
            unsigned char value = get_bit(st->bits[sym], bit);
            fprintf(out_Table,"%d",value);
        }

        fprintf(out_Table,"\n");
    }
}
第二次扫描:对源文件进行编码并输出
/*
 * Second pass over the input: replace every byte by its Huffman code and
 * write the resulting bit stream to `out`.
 *
 * in   source stream, read byte by byte until EOF.
 * out  destination stream.
 * se   encoder table mapping each symbol to its code.
 *
 * Bits are packed into bytes starting at the least significant bit; a
 * final, partially filled byte is written with its unused high bits
 * left at 0.
 *
 * Returns 0 on success and 1 on failure (a symbol without a code, a read
 * error on `in`, or a write error on `out`).
 */
int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
    unsigned char curbyte = 0;  /* byte currently being assembled */
    unsigned char curbit = 0;   /* number of bits already placed in curbyte */
    int c;

    while((c = fgetc(in)) != EOF)
    {
        unsigned char uc = (unsigned char)c;
        huffman_code *code = (*se)[uc];     /* table lookup */
        unsigned long i;

        /* A symbol that was never given a code cannot be encoded. */
        if(code == NULL)
            return 1;

        for(i = 0; i < code->numbits; ++i)
        {
            curbyte |= get_bit(code->bits, i) << curbit;

            /* Byte complete: flush it and start a fresh one. */
            if(++curbit == 8)
            {
                if(fputc(curbyte, out) == EOF)
                    return 1;
                curbyte = 0;
                curbit = 0;
            }
        }
    }

    /* fgetc() returns EOF for read errors as well as for end of file. */
    if(ferror(in))
        return 1;

    /* Flush the last, partially filled byte. */
    if(curbit > 0 && fputc(curbyte, out) == EOF)
        return 1;

    return 0;
}
实验结果:
avi文件概率分布图
docx文件概率分布图
jpg文件概率分布图
MP3文件概率分布图
MP4文件概率分布图
mpg文件概率分布图
pdf文件概率分布图
ppt文件概率分布图
xls文件概率分布图
zip文件概率分布图