/*
 * A compressor written in C using Huffman coding.
 * The original author reports that it compiled successfully in VC6.
 *
 * This listing holds the contents of huf.h, main.c, statistic.c, hufcode.c,
 * hufcompress.c and decompress.c one after another.  It is laid out so that
 * it also builds as a single translation unit; when the sections are split
 * back into separate files, every .c file starts with  #include "huf.h".
 *
 * Compressed file layout: the raw hf_node table (MAX_NODE entries, written
 * with fwrite, so it is only readable on a machine with the same struct
 * layout), followed by the Huffman bit stream, most significant bit first.
 * The bit stream ends with the code of the end-of-stream symbol
 * (leaf MAX_LEAF_NODE-1) and is padded with zero bits to a whole byte.
 */

/* ============================== huf.h ============================== */
/* huf.h includes all the function prototypes. */
#ifndef __HUF__HH__
#define __HUF__HH__

/*#define DEBUG*/

/* some included files. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>

/* general function returned value. */
#define OK 1
#define FAILURE 0

/* from statistic.c */
/* 256 byte values plus one end-of-stream symbol. */
#define MAX_LEAF_NODE (256+1)
/* the max number of nodes in the huffman tree. */
#define MAX_NODE (MAX_LEAF_NODE*2-1)
/* the buffer size used when reading from a file. */
#define BUF_SIZE 512

void statistic(const unsigned char *filename, size_t occur[MAX_LEAF_NODE]);

/* from hufcode.c */
#define NULL_HF_POINTER ((hf_pointer)-1)
#define UNKNOWN_HF_POINTER INT_MAX

/* A "pointer" into the node table is simply an index. */
typedef int hf_pointer;

/* hf_tree is an array (table) of hf_node that stores a tree. */
typedef struct {
    /* the occurrence count of the byte value (sum of children for
       internal nodes); 0 means the node is not used. */
    size_t weight;
    /* index of the parent node, or NULL_HF_POINTER if there is none. */
    hf_pointer parent;
    /* indices of the left and right child; NULL_HF_POINTER when the
       corresponding child does not exist. */
    hf_pointer lchild, rchild;
} hf_node, *hf_tree;

int hufcode(size_t occur[MAX_LEAF_NODE], hf_tree ht,
            unsigned char* code[MAX_LEAF_NODE]);
int hufcompress(unsigned char *filename, hf_tree ht,
                unsigned char* code[MAX_LEAF_NODE]);
int min_hf_node(hf_tree ht, hf_pointer range1, hf_pointer range2,
                hf_pointer *min1, hf_pointer *min2);
int decompress(unsigned char *filename);

#endif

/* ============================== main.c ============================== */
/* (separate-file build: #include "huf.h" here) */

/* Main data structures. */

/* occurrence count of every byte value in the input file. */
size_t occur[MAX_LEAF_NODE];
/* the huffman tree.  It is used throughout the program, so it is
   allocated statically. */
hf_node huf_tree[MAX_NODE];
/* the huffman codes as strings of '0'/'1'; static for the same reason.
   Static storage guarantees every entry starts out as NULL. */
unsigned char* huf_code[MAX_LEAF_NODE];

/*
 * Entry point.
 *   huf filename      compress 'filename' into 'hf<filename>'
 *   huf -d filename   decompress 'filename' into 'u<filename>'
 *   huf -version      print version information
 */
int main(int argc, char **argv)
{
    static const char use_msg[] =
        "usage: huf filename\tcompress file.\n"
        "       huf -d filename\tdecompress file.\n"
        "       huf -version\tdisplay version infomation.\n\n"
        "param: filename\tname of file to be compressed or decompressed.\n";
    static const char ver_msg[] =
        "huf -- version 2.0 created by Smallsoft team.\n";
    unsigned char *filename;
    size_t i;

    /* determine whether the parameters given are properly formed. */
    if (argc < 2) {
        fputs(use_msg, stdout);
        exit(EXIT_FAILURE);
    }

    /* determine whether to print version information
       (exit status kept as in the original program). */
    if (0 == strcmp(argv[1], "-version")) {
        fputs(ver_msg, stdout);
        exit(EXIT_FAILURE);
    }

    /* determine whether or not to decompress a file. */
    if (0 == strcmp(argv[1], "-d")) {
        /* "-d" needs a file name after it. */
        if (argc < 3) {
            fputs(use_msg, stdout);
            exit(EXIT_FAILURE);
        }
        /* compressed files are recognised by their "hf" name prefix. */
        if (0 != strncmp(argv[2], "hf", 2)) {
            printf("This is not a hf(huffman compression) file.\n");
            exit(EXIT_FAILURE);
        }
        filename = (unsigned char *)argv[2];
        if (decompress(filename) != OK) {
            fputs("Decompressing failed.\n", stderr);
            exit(EXIT_FAILURE);
        }
        printf("Decompressing succeed.\nThe name of file decompressed is u%s\n",
               argv[2]);
        exit(EXIT_SUCCESS);
    }

    /* The name is used in place; no fixed-size copy that could overflow. */
    filename = (unsigned char *)argv[1];

    /* count the byte values in the file. */
    statistic(filename, occur);
    /* generate the huffman tree and string codes from the counts. */
    if (hufcode(occur, huf_tree, huf_code) != OK) {
        fputs("Generating huffman code failed.\n", stderr);
        exit(EXIT_FAILURE);
    }
    /* generate the compressed file from the tree and the codes. */
    if (hufcompress(filename, huf_tree, huf_code) != OK) {
        fputs("Compressing failed.\n", stderr);
        exit(EXIT_FAILURE);
    }

    /* free the code strings allocated by hufcode(). */
    for (i = 0; i < MAX_LEAF_NODE; ++i) {
#ifdef DEBUG
        if (huf_code[i] != NULL)
            printf("%lu:%s\n", (unsigned long)i, (const char *)huf_code[i]);
#endif
        free(huf_code[i]);
        huf_code[i] = NULL;
    }

    printf("Compressing success.\nThe name of file compressed is hf%s\n",
           argv[1]);
    exit(EXIT_SUCCESS);
}

/* ============================ statistic.c ============================ */
/* (separate-file build: #include "huf.h" here) */

/*
 * Created by Zhang Yun on 2010-12-26.
 *
 * Count how often every byte value occurs in a file.
 *
 * Parameters:
 *   filename  name of the file to analyse; it must exist.
 *   occur     caller-allocated array of MAX_LEAF_NODE counters.
 *
 * Post condition:
 *   occur[0..255] hold the number of occurrences of each byte value and
 *   occur[MAX_LEAF_NODE-1] (the end-of-stream symbol) is 1.
 *
 * The program is terminated with EXIT_FAILURE when the file cannot be
 * opened or read.
 */
void statistic(const unsigned char *filename, size_t occur[MAX_LEAF_NODE])
{
    FILE *fin;
    unsigned char buf[BUF_SIZE];
    size_t count;
    size_t i;

#ifdef DEBUG
    printf("\nIn statistic function:\n");
#endif

    fin = fopen((const char *)filename, "rb");
    if (fin == NULL) {
        perror("Can't open file to be compressed");
        exit(EXIT_FAILURE);
    }

    /* start from zero for every symbol ... */
    for (i = 0; i < MAX_LEAF_NODE; ++i)
        occur[i] = 0;
    /* ... except the end-of-stream symbol, which occurs exactly once. */
    occur[MAX_LEAF_NODE - 1] = 1;

    /* read the file block by block and count every byte. */
    while ((count = fread(buf, 1, BUF_SIZE, fin)) > 0) {
        for (i = 0; i < count; ++i)
            occur[buf[i]]++;
    }
    if (ferror(fin)) {
        perror("Can't read file to be compressed");
        fclose(fin);
        exit(EXIT_FAILURE);
    }
    fclose(fin);

#ifdef DEBUG
    printf("byte value statistic information:\n");
    for (i = 0; i < MAX_LEAF_NODE; i++) {
        printf("%c:%-3lu", (int)(unsigned char)i, (unsigned long)occur[i]);
        if (i % 16 == 0)
            putchar('\n');
    }
#endif
}

/* ============================= hufcode.c ============================= */
/* (separate-file build: #include "huf.h" here) */

/*
 * Return the index of the lightest node in [range1, range2) that is in use
 * (weight > 0) and has no parent yet, or NULL_HF_POINTER if there is none.
 * Among nodes of equal weight the lowest index wins.
 */
static hf_pointer lightest_root(hf_tree ht, hf_pointer range1,
                                hf_pointer range2)
{
    hf_pointer best = NULL_HF_POINTER;
    hf_pointer i;

    for (i = range1; i < range2; ++i) {
        if (ht[i].weight == 0 || ht[i].parent != NULL_HF_POINTER)
            continue;
        if (best == NULL_HF_POINTER || ht[i].weight < ht[best].weight)
            best = i;
    }
    return best;
}

/*
 * Retrieve the two lightest parentless nodes of a (partial) huffman tree.
 *
 * Parameters:
 *   ht              the node table.
 *   range1, range2  nodes range1 (inclusive) to range2 (exclusive) are
 *                   searched.
 *   min1, min2      receive the index of the lightest and the second
 *                   lightest parentless node.
 *
 * Post condition:
 *   If no parentless node exists, both *min1 and *min2 are
 *   NULL_HF_POINTER.  If exactly one exists, *min1 is that node and *min2
 *   is NULL_HF_POINTER.  Every node returned has its 'parent' field set to
 *   UNKNOWN_HF_POINTER; the caller is expected to overwrite it.
 *
 * Returns OK.
 */
int min_hf_node(hf_tree ht, hf_pointer range1, hf_pointer range2,
                hf_pointer *min1, hf_pointer *min2)
{
#ifdef DEBUG
    printf("\n------------------------------\n");
    printf("In min_hf_node function:\n");
#endif

    *min1 = lightest_root(ht, range1, range2);
    /* mark min1 as taken so that the second search skips it. */
    if (*min1 != NULL_HF_POINTER)
        ht[*min1].parent = UNKNOWN_HF_POINTER;

    *min2 = lightest_root(ht, range1, range2);
    if (*min2 != NULL_HF_POINTER)
        ht[*min2].parent = UNKNOWN_HF_POINTER;

#ifdef DEBUG
    printf("min 1: %d\n", *min1);
    printf("result of min2: %d\n", *min2);
#endif

    return OK;
}

/*
 * Created by Zhang Yun on 2010-12-26.
 *
 * Build the huffman tree and the huffman code of every symbol.
 *
 * The codes are represented as strings of '0' (left) and '1' (right),
 * because string handling is convenient with the C string library.
 *
 * Parameters:
 *   occur  occurrence count of every symbol (see statistic()).
 *   ht     caller-allocated table of MAX_NODE nodes; overwritten.
 *   code   array of MAX_LEAF_NODE pointers; overwritten.
 *
 * Post condition:
 *   'ht' holds the tree.  code[i] is a freshly allocated string for every
 *   symbol with a non-zero count and NULL for all others.  The caller is
 *   responsible for free()ing the strings.  When only one symbol has a
 *   non-zero count (an empty input file, which contains just the
 *   end-of-stream symbol) that symbol is the root and its code is "".
 *
 * Returns OK, or FAILURE if memory runs out (all strings allocated so far
 * are released again).  Terminates the program when every count is zero.
 */
int hufcode(size_t occur[MAX_LEAF_NODE], hf_tree ht,
            unsigned char* code[MAX_LEAF_NODE])
{
    hf_pointer min1, min2;
    hf_pointer next;            /* next free internal node */
    hf_pointer leaf, node, up;
    /* the deepest possible leaf is MAX_LEAF_NODE-1 edges below the root. */
    char path[MAX_LEAF_NODE];
    size_t start;
    size_t i;

#ifdef DEBUG
    printf("\nIn hufcode function:\n");
#endif

    /* Initialize the table: leaves carry the counts, the rest is empty. */
    for (i = 0; i < MAX_NODE; ++i) {
        ht[i].weight = (i < MAX_LEAF_NODE) ? occur[i] : 0;
        ht[i].parent = NULL_HF_POINTER;
        ht[i].lchild = NULL_HF_POINTER;
        ht[i].rchild = NULL_HF_POINTER;
    }
    for (i = 0; i < MAX_LEAF_NODE; ++i)
        code[i] = NULL;

    /* Repeatedly join the two lightest parentless nodes. */
    for (next = MAX_LEAF_NODE; next < MAX_NODE; ++next) {
        min_hf_node(ht, 0, next, &min1, &min2);
        if (min1 == NULL_HF_POINTER) {
            /* no node at all. */
            fputs("The file is empty.\n", stderr);
            exit(EXIT_FAILURE);
        }
        if (min2 == NULL_HF_POINTER) {
            /* only one node is left: it is the root.  Undo the
               "unknown parent" mark set by min_hf_node(). */
            ht[min1].parent = NULL_HF_POINTER;
            break;
        }
        ht[min1].parent = next;
        ht[min2].parent = next;
        ht[next].lchild = min1;
        ht[next].rchild = min2;
        ht[next].weight = ht[min1].weight + ht[min2].weight;
    }

#ifdef DEBUG
    printf("Huffman tree table:\n");
    for (i = 0; i < MAX_NODE; ++i) {
        printf("b:%8lu w:%8lu p:%8d l:%8d r:%8d\n",
               (unsigned long)i, (unsigned long)ht[i].weight,
               ht[i].parent, ht[i].lchild, ht[i].rchild);
    }
    printf("Huffman tree table has been generated!\n");
#endif

    /* Derive each code by walking from the leaf up to the root and
       filling 'path' from its end towards its beginning. */
    for (leaf = 0; leaf < MAX_LEAF_NODE; ++leaf) {
        if (ht[leaf].weight == 0)
            continue;

        start = MAX_LEAF_NODE - 1;
        path[start] = '\0';
        for (node = leaf; ht[node].parent != NULL_HF_POINTER; node = up) {
            up = ht[node].parent;
            path[--start] = (ht[up].lchild == node) ? '0' : '1';
        }

        code[leaf] = malloc(MAX_LEAF_NODE - start);
        if (code[leaf] == NULL) {
            for (i = 0; i < MAX_LEAF_NODE; ++i) {
                free(code[i]);
                code[i] = NULL;
            }
            return FAILURE;
        }
        memcpy(code[leaf], path + start, MAX_LEAF_NODE - start);

#ifdef DEBUG
        printf("pt: %d path:%send\n", leaf, (const char *)code[leaf]);
#endif
    }

    return OK;
}

/* =========================== hufcompress.c =========================== */
/* (separate-file build: #include "huf.h" here) */

/*
 * Append the bits of one code string ('0'/'1' characters) to the output.
 * Bits are collected in *acc, most significant bit first; *nbits is the
 * number of bits currently collected.  Every completed byte is written to
 * 'fout'.  Returns OK, or FAILURE on a write error.
 */
static int put_code(FILE *fout, const unsigned char *bits,
                    unsigned int *acc, int *nbits)
{
    for (; *bits != '\0'; ++bits) {
        *acc = (*acc << 1) | (unsigned int)(*bits == '1');
        if (++*nbits == 8) {
            if (fputc((int)(*acc & 0xFFu), fout) == EOF)
                return FAILURE;
            *acc = 0;
            *nbits = 0;
        }
    }
    return OK;
}

/*
 * Compress a file with previously generated huffman codes.
 *
 * Parameters:
 *   filename  name of the file to compress; it must exist.
 *   ht        the huffman tree; stored at the start of the output so that
 *             decompress() can rebuild the codes.
 *   code      the huffman codes as strings (see hufcode()).
 *
 * Post condition:
 *   A file named "hf" followed by 'filename' has been generated.
 *
 * Returns OK, or FAILURE when the name is too long, an I/O error occurs or
 * the file contains a byte that has no code (the file changed after
 * statistic() ran).  Terminates the program when a file cannot be opened.
 */
int hufcompress(unsigned char *filename, hf_tree ht,
                unsigned char *code[MAX_LEAF_NODE])
{
    FILE *fin, *fout;
    char ofname[FILENAME_MAX];
    unsigned char buf[BUF_SIZE];
    unsigned int acc = 0;   /* bits not yet written */
    int nbits = 0;          /* number of valid bits in 'acc' */
    size_t count;
    size_t i;
    int status = OK;

#ifdef DEBUG
    puts("\nIn hufcompress:\n");
    printf("file name: %s\n", (const char *)filename);
#endif

    /* "hf" + name + terminating NUL must fit into 'ofname'. */
    if (strlen((const char *)filename) + sizeof "hf" > sizeof ofname) {
        fputs("File name is too long.\n", stderr);
        return FAILURE;
    }
    sprintf(ofname, "hf%s", (const char *)filename);

    fin = fopen((const char *)filename, "rb");
    if (fin == NULL) {
        perror("Can't open file to be compressed ");
        exit(EXIT_FAILURE);
    }
    fout = fopen(ofname, "wb");
    if (fout == NULL) {
        perror("Can't open compressed file ");
        fclose(fin);
        exit(EXIT_FAILURE);
    }

    /* Write the huffman tree table to the compressed file. */
    if (fwrite(ht, sizeof(hf_node), MAX_NODE, fout) != MAX_NODE)
        status = FAILURE;

    /* Translate every input byte into its code. */
    while (status == OK && (count = fread(buf, 1, BUF_SIZE, fin)) > 0) {
        for (i = 0; i < count && status == OK; ++i) {
            if (code[buf[i]] == NULL)
                status = FAILURE;
            else
                status = put_code(fout, code[buf[i]], &acc, &nbits);
        }
    }
    if (ferror(fin))
        status = FAILURE;

    /* Terminate the stream with the end-of-stream code ... */
    if (status == OK) {
        if (code[MAX_LEAF_NODE - 1] == NULL)
            status = FAILURE;
        else
            status = put_code(fout, code[MAX_LEAF_NODE - 1], &acc, &nbits);
    }
    /* ... and pad the last byte with zero bits on the right, so the
       decoder, which reads from the most significant bit, sees the real
       bits first. */
    if (status == OK && nbits > 0) {
        acc <<= (8 - nbits);
        if (fputc((int)(acc & 0xFFu), fout) == EOF)
            status = FAILURE;
    }

    fclose(fin);
    if (fclose(fout) != 0)
        status = FAILURE;

#ifdef DEBUG
    if (status == OK)
        printf("The huffman binary code has generated.\n");
#endif

    return status;
}

/* ============================ decompress.c ============================ */
/* (separate-file build: #include "huf.h" here) */

/*
 * Decompress a file generated by hufcompress().
 *
 * Parameter:
 *   filename  name of the compressed file; it must exist.
 *
 * Post condition:
 *   A file named "u" followed by 'filename' holds the decoded data.
 *   Decoding stops at the end-of-stream symbol, or at the end of the
 *   input when the stream carries no such symbol.
 *
 * Returns OK, or FAILURE when the name is too long, memory runs out, the
 * stored tree is incomplete or inconsistent, or a write error occurs.
 * Terminates the program when a file cannot be opened.
 */
int decompress(unsigned char *filename)
{
    FILE *fin, *fout;
    char ofname[FILENAME_MAX];
    hf_tree ht;
    hf_pointer root, pt;
    unsigned int mask;
    int byte;
    int done = 0;
    int status = OK;
#ifdef DEBUG
    size_t i;
#endif

#ifdef DEBUG
    printf("In decompress function:\n");
#endif

    /* "u" + name + terminating NUL must fit into 'ofname'. */
    if (strlen((const char *)filename) + sizeof "u" > sizeof ofname) {
        fputs("File name is too long.\n", stderr);
        return FAILURE;
    }
    sprintf(ofname, "u%s", (const char *)filename);

    if ((fin = fopen((const char *)filename, "rb")) == NULL) {
        perror("Can't open compressed file");
        exit(EXIT_FAILURE);
    }
    if ((fout = fopen(ofname, "wb")) == NULL) {
        perror("Can't open decompressed file");
        fclose(fin);
        exit(EXIT_FAILURE);
    }

    /* read the huffman tree back. */
    ht = calloc(MAX_NODE, sizeof *ht);
    if (ht == NULL
        || fread(ht, sizeof(hf_node), MAX_NODE, fin) != MAX_NODE) {
        fputs("Can't read the huffman tree.\n", stderr);
        free(ht);
        fclose(fin);
        fclose(fout);
        return FAILURE;
    }

#ifdef DEBUG
    for (i = 0; i < MAX_NODE; ++i) {
        printf("byte: %lu w: %lu p: %d l: %d r: %d\n",
               (unsigned long)i, (unsigned long)ht[i].weight,
               ht[i].parent, ht[i].lchild, ht[i].rchild);
    }
#endif

    /* Find the root: it is the internal node with the largest weight.
       When no internal node is in use, the original file was empty and
       there is nothing to decode. */
    root = NULL_HF_POINTER;
    for (pt = MAX_LEAF_NODE; pt < MAX_NODE; pt++) {
        if (ht[pt].weight > 0
            && (root == NULL_HF_POINTER
                || ht[root].weight < ht[pt].weight)) {
            root = pt;
        }
    }

#ifdef DEBUG
    printf("root: %d\n", root);
#endif

    if (root != NULL_HF_POINTER) {
        pt = root;
        while (!done && status == OK && (byte = fgetc(fin)) != EOF) {
            /* take the bits from left to right: 10000000b first. */
            for (mask = 0x80u; mask != 0 && !done; mask >>= 1) {
                pt = ((unsigned int)byte & mask) ? ht[pt].rchild
                                                 : ht[pt].lchild;
                /* the indices come from the file; never trust them. */
                if (pt < 0 || pt >= MAX_NODE) {
                    status = FAILURE;
                    break;
                }
                if (ht[pt].lchild != NULL_HF_POINTER
                    || ht[pt].rchild != NULL_HF_POINTER)
                    continue;   /* internal node: need more bits */

                /* a leaf has been reached. */
                if (pt == MAX_LEAF_NODE - 1) {
                    /* end-of-stream: the remaining bits are padding. */
                    done = 1;
                } else if (pt < MAX_LEAF_NODE - 1) {
                    if (fputc(pt, fout) == EOF) {
                        status = FAILURE;
                        break;
                    }
                    pt = root;
                } else {
                    /* a childless internal node: corrupt tree. */
                    status = FAILURE;
                    break;
                }
            }
        }
        if (status != OK)
            fputs("The compressed file is damaged or can't be written.\n",
                  stderr);
    }

    free(ht);
    fclose(fin);
    if (fclose(fout) != 0)
        status = FAILURE;

    return status;
}