/*
* huffman.c
* Huffman coding implementation (霍夫曼编码代码)
* Created on: Dec 12, 2016
* Author: xuenhappy
*
* sample:
*
============
input file:
============
t 2
h 1
i 2
s 2
_ 7
a 4
n 2
e 4
x 1
m 2
p 1
l 1
o 1
f 3
H 1
u 1
r 1
==================================
output:
==================================
freq file:char_freq_map.dat
--------------------------------------
source H(x)=3.76975 bit
--------------------------------------
_ |7.00 |0.19444 |000
f |3.00 |0.08333 |0010
m |2.00 |0.05556 |0011
s |2.00 |0.05556 |0100
t |2.00 |0.05556 |0101
i |2.00 |0.05556 |0110
n |2.00 |0.05556 |0111
e |4.00 |0.11111 |100
a |4.00 |0.11111 |101
h |1.00 |0.02778 |11000
r |1.00 |0.02778 |11001
x |1.00 |0.02778 |11010
u |1.00 |0.02778 |11011
l |1.00 |0.02778 |11100
H |1.00 |0.02778 |11101
p |1.00 |0.02778 |11110
o |1.00 |0.02778 |11111
--------------------------------------
huffman avg code length=3.80556 bit
*
*
*/
#include <ctype.h>
#include <math.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>
#include <map>
#include <string>
#include <vector>
/**
 * One vertex of the Huffman tree.
 * A leaf carries its symbol text in `w`; a merged (internal) vertex has
 * `w == NULL` and points at its two children through `f` and `n`.
 */
struct node {
    char* w;         /* symbol text, heap-allocated and released with free(); NULL on merged vertices */
    double freq;     /* frequency of the symbol, or the summed frequency of both children */
    double max;      /* largest single-symbol frequency found in this subtree */
    struct node* f;  /* child visited first; its codes get a 0 bit appended */
    struct node* n;  /* child visited second; its codes get a 1 bit appended */
};
typedef struct node H_NODE;
/**
 * Ordering predicate used with std::sort: a node with a larger `freq`
 * is placed before a node with a smaller one (descending order).
 * Nodes with equal `freq` compare as equivalent.
 */
bool compare(node *a,node *b){
    const double left = a->freq;
    const double right = b->freq;
    return right < left;
}
void print_node(node* n,int code,int deep,double sum){
if(n->w){
printf("%s\t|%.2f\t|%.5f\t|",n->w,n->freq,n->freq/sum);
for(int i=deep-1;i>=0;i--)
printf("%d", (code&(1<<i)) != 0);
printf("\n");
}
if(n->f)
print_node(n->f,(code<<1)|0,deep+1,sum);
if(n->n)
print_node(n->n,(code<<1)|1,deep+1,sum);
}
/**
 * Adds to `h_sum` the weighted code length of every leaf below `n`.
 *
 * Both children are visited first (`f`, then `n`) with the depth increased
 * by one; afterwards, if this node carries a symbol, `deep*freq/f_sum` is
 * added to `h_sum`.
 *
 * @param n      subtree root (must not be NULL)
 * @param deep   depth of `n`, i.e. the code length of a symbol stored here
 * @param f_sum  total frequency used as the divisor
 * @param h_sum  running total, updated in place
 */
void length_node(node* n,int deep,double f_sum,double &h_sum){
    node* const branches[2]={n->f,n->n};
    for(int k=0;k<2;k++){
        if(branches[k])
            length_node(branches[k],deep+1,f_sum,h_sum);
    }
    if(!n->w)
        return;
    h_sum+=deep*n->freq/f_sum;
}
/**
 * Releases the subtree rooted at `n`: the `n` child first, then the `f`
 * child, then the symbol text, and finally the node itself.
 * `n` must not be NULL.
 */
void free_node(node* n){
    node* const second=n->n;
    node* const first=n->f;
    if(second)
        free_node(second);
    if(first)
        free_node(first);
    free(n->w);   /* free(NULL) is a no-op, so merged nodes need no special case */
    free(n);
}
/**
* 将指定频率的字母表转化为转化为对应的编码
*/
void huffman_encode(const std::map<std::string,double> &char_freqs){
if(char_freqs.empty())
return;
//init
H_NODE* nodes[char_freqs.size()];
std::map<std::string,double>::const_iterator it;
int i=0;
double sum=0;
for(it=char_freqs.begin();it!=char_freqs.end();it++){
sum+=it->second;
}
double hx=0;
for(it=char_freqs.begin();it!=char_freqs.end();it++){
hx-=log(it->second/sum)*(it->second/sum);
}
hx=hx/log(2);
printf("source H(x)=%.5f bit\n",hx);
printf("--------------------------------------\n");
for(it=char_freqs.begin();it!=char_freqs.end();it++){
nodes[i]=(H_NODE*)malloc(sizeof(H_NODE));
nodes[i]->freq=it->second;
nodes[i]->max=it->second;
nodes[i]->w=strdup(it->first.c_str());
nodes[i]->f=nodes[i]->n=NULL;
i++;
}
//build
int size=char_freqs.size();
while(size>1){
std::sort(nodes,nodes+size, compare);
H_NODE* c=(H_NODE*)malloc(sizeof(H_NODE));
c->freq=nodes[size-1]->freq+nodes[size-2]->freq;
c->w=NULL;
if(nodes[size-2]->max>nodes[size-1]->max){
c->max=nodes[size-2]->max;
c->f=nodes[size-2];
c->n=nodes[size-1];
}else{
c->max=nodes[size-1]->max;
c->f=nodes[size-1];
c->n=nodes[size-2];
}
nodes[size-1]=NULL;
nodes[size-2]=c;
size--;
}
//printf
H_NODE* root=nodes[0];
print_node(root,0,0,sum);
printf("--------------------------------------\n");
double sum_l=0;
length_node(root,0,sum,sum_l);
printf("huffman avg code length=%.5f bit\n",sum_l);
//free
free_node(root);
}
/**
 * Trims leading and trailing whitespace from a C string in place.
 *
 * @param c_str  buffer to trim; a '\0' is written over the first character
 *               of the trailing whitespace run, if there is one
 * @param c_len  in: number of characters of `c_str` to examine;
 *               out: length of the trimmed text
 * @return pointer to the first non-whitespace character inside `c_str`
 *
 * Characters are converted to unsigned char before isspace(), because
 * passing a negative char value (any non-ASCII byte where char is signed)
 * is undefined behaviour.
 */
char* ctrim(char* c_str,size_t &c_len){
    /* skip leading whitespace */
    while(c_len>0&&isspace(static_cast<unsigned char>(*c_str))){
        c_str++;
        c_len--;
    }
    char* start=c_str;
    char* end=NULL;   /* start of the current whitespace run, NULL while inside text */
    /* find where the trailing whitespace begins */
    for(;c_len>0&&*c_str!='\0';c_str++,c_len--){
        if(isspace(static_cast<unsigned char>(*c_str))){
            if(!end)
                end=c_str;
        }else{
            end=NULL;   /* the run was followed by text, so it was not trailing */
        }
    }
    if(end){
        *end='\0';
        c_len=end-start;
    }else{
        c_len=c_str-start;
    }
    return start;
}
/**
 * Splits `input` at every match of the compiled pattern `reg` and appends
 * the non-empty pieces between matches to `out`.
 *
 * @param input  NUL-terminated text to split
 * @param reg    pattern compiled with regcomp(); each match is a separator
 * @param out    receives the tokens in order; existing content is kept
 *
 * A match of length zero is not treated as a separator: the search simply
 * moves one character forward, so a pattern that can match the empty string
 * cannot stall the loop.
 */
void split(const char* input, regex_t &reg, std::vector<std::string> &out) {
    const char* token = input;    /* start of the piece being collected */
    const char* cursor = input;   /* where the next search begins */
    regmatch_t pm;
    while (*cursor != '\0' && regexec(&reg, cursor, 1, &pm, 0) == 0) {
        if (pm.rm_eo == pm.rm_so) {
            /* empty match: nothing to cut here, step past one character */
            if (cursor[pm.rm_so] == '\0')
                break;
            cursor += pm.rm_so + 1;
            continue;
        }
        const char* sep_begin = cursor + pm.rm_so;
        if (sep_begin > token)
            out.push_back(std::string(token, static_cast<size_t>(sep_begin - token)));
        cursor += pm.rm_eo;
        token = cursor;
    }
    if (*token != '\0')
        out.push_back(token);
}
/**
 * Loads a symbol/frequency table from a text file.
 *
 * Every non-blank line must hold exactly two whitespace-separated fields:
 * the symbol text and its frequency. The frequency is parsed with strtod(),
 * so fractional values are kept, and it must be a positive finite number.
 * Lines that do not fit are reported on stdout as "bad input line" and
 * skipped. A symbol that appears more than once keeps its last frequency.
 *
 * @param file        path of the file to read
 * @param char_freqs  table to fill; existing entries are kept or overwritten
 * @return EXIT_SUCCESS when the file was read, EXIT_FAILURE when it could
 *         not be opened or the separator pattern could not be compiled
 */
int load_charc_seqs(const char* file,std::map<std::string,double> &char_freqs){
    FILE *fe=fopen(file,"r");
    if(!fe){
        printf("%s not exits or not read!\n",file);
        return EXIT_FAILURE;
    }
    printf("freq file:%s\n",file);
    printf("--------------------------------------\n");

    /* POSIX character class: "\s" is an extension that not every regcomp() accepts */
    regex_t split_reg;
    if(regcomp(&split_reg,"[[:space:]]+",REG_EXTENDED)!=0){
        printf("failed to compile the field separator pattern\n");
        fclose(fe);
        return EXIT_FAILURE;
    }

    char buffer[1024];
    std::vector<std::string> strs;
    while(fgets(buffer,sizeof(buffer),fe)){
        strs.clear();
        size_t clen=strlen(buffer);
        char* line=ctrim(buffer,clen);
        if(clen==0)
            continue;   /* blank line */
        split(line,split_reg,strs);
        if(strs.size()!=2){
            printf("bad input line:%s\n",line);
            continue;
        }
        const char* text=strs[1].c_str();
        char* stop=NULL;
        const double freq=strtod(text,&stop);
        /* reject: nothing parsed, trailing junk, zero/negative/NaN, overflow/infinity */
        if(stop==text||*stop!='\0'||!(freq>0)||freq>=HUGE_VAL){
            printf("bad input line:%s\n",line);
            continue;
        }
        char_freqs[strs[0]]=freq;
    }
    regfree(&split_reg);
    fclose(fe);
    return EXIT_SUCCESS;
}
/**
 * Entry point: reads the frequency file named by the first command-line
 * argument and prints the Huffman code table for it.
 *
 * @return EXIT_SUCCESS after the table was printed; EXIT_FAILURE when the
 *         argument is missing or the file could not be loaded
 */
int main(int argc, char **argv) {
    if(argc<2){
        /* the format has no conversion, so no extra argument is passed */
        printf("huffman_encode [charmap_freqs_file]\n");
        return EXIT_FAILURE;
    }
    std::map<std::string,double> char_freqs;
    if(load_charc_seqs(argv[1], char_freqs)!=EXIT_SUCCESS){
        return EXIT_FAILURE;
    }
    huffman_encode(char_freqs);
    return EXIT_SUCCESS;
}
/* 08-05 08-05 -- stray text that followed the code; it is not part of the program */