很难想象考完研已经大半个月了,挺怀念那段时光的:虽然一切都是未知数,但总有一个目标指引着自己前进。这段时间闲下来,就把以前学过的一些数据结构与算法知识回顾了一下,于是有了这篇文章,其余内容接下来几天会陆续发布出来。
正文开始
哈夫曼编码我就不介绍了,很多博客都有,这也是数据结构里很基础的树的应用。可以参考这篇博客,我就不重复造轮子了:《huffman编码——原理与实现》。
整个程序的执行流程是:统计文本中各字符的频次,构造哈夫曼树,生成哈夫曼编码,根据哈夫曼编码对文本进行压缩,然后利用压缩文件和内存中的哈夫曼树对压缩后的二进制文件进行解压还原。该程序只支持纯ASCII字符文件。
废话不多说,直接上代码,在Ubuntu 18.04下用gcc按C99标准编译。
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#define ASCII_LEN 128 // total number of ASCII codes
const char *SOURCE_FILE = "source.txt"; // source text file to be encoded
const char *HUFFMAN_RESULT_FILE = "Huffman.txt"; // statistics table, including each character's binary code string
const char *CODE_FILE = "code.dat"; // compressed output file
const char *RECODE_FILE = "recode.txt"; // file decoded back from the compressed data
// Per-character statistics together with the Huffman code assigned to the character.
typedef struct {
int n; // frequency (number of occurrences)
int bits; // number of bits the character takes after compression
char c; // the corresponding character
char *data; // the code as a string of binary digits
} Letter;
Letter word[ASCII_LEN]; // usage statistics for every character, indexed by ASCII code
// Character stack used to reverse the digits collected while walking from a leaf to the root.
typedef struct {
int n; // number of elements recorded once the stack has been filled
char *data; // element storage
int top; // index of the slot after the top element (also the current size)
} Stack; // a minimal stack implementation
// One node of the Huffman tree; nodes live in an array and refer to each other by index (-1 = none).
typedef struct Node {
int weight; // weight
int parent; // index of the parent node
int lChild, rChild; // indices of the left and right children
char c; // character stored in the node (set for leaves)
} HTNode, *HuffmanTree; // Huffman tree structure
// Huffman code of a single leaf.
typedef struct {
int n; // number of bits this code occupies (one unsigned char holds 8 bits)
char *data; // the code digits, one '0'/'1' character per bit
char c; // presumably the character the code belongs to -- TODO confirm against users of this field
} HuffmanCodeNode, *HuffmanCode; // stores the code of every node
/*
 * Among the first k entries of HT, pick the node that has no parent yet
 * (parent == -1) and carries the smallest weight. On equal weights the
 * lowest index wins.
 *
 * The chosen node is marked as taken by storing a placeholder parent value
 * (anything other than -1), so an immediately following call yields the next
 * smallest node. The caller is expected to overwrite parent with the real
 * parent index afterwards.
 *
 * Returns the index of the chosen node, or -1 when none of the first k
 * entries is parentless (HT is left untouched in that case).
 */
int min(HuffmanTree HT, int k) {
    int best = -1; /* -1 = no candidate seen yet */

    for (int i = 0; i < k; ++i) {
        if (HT[i].parent != -1) {
            continue;
        }
        /* strict '<' keeps the earliest index among equal weights */
        if (best == -1 || HT[i].weight < HT[best].weight) {
            best = i;
        }
    }

    if (best == -1) {
        return -1; /* nothing selectable: never index HT with a garbage value */
    }

    HT[best].parent = 1; /* placeholder: excludes the node from the next search */
    return best;
}
/*
 * Fetch the two lightest parentless nodes among the first k entries of HT.
 * *min1 receives the lightest, *min2 the second lightest. Both nodes are
 * left marked as taken by min().
 */
void selectMinium(HuffmanTree HT, int k, int *min1, int *min2) {
    int lightest = min(HT, k);
    *min1 = lightest;

    /* the first pick is already marked, so this call returns the runner-up */
    int runnerUp = min(HT, k);
    *min2 = runnerUp;
}
/*
 * Build a Huffman tree from a per-character frequency table.
 *
 * word : frequency table with ASCII_LEN entries, indexed by character code;
 *        entries whose n is 0 are skipped.
 * n    : number of entries in word with a non-zero frequency.
 *
 * Returns a malloc'ed array of 2*n - 1 nodes: indices 0..n-1 are the leaves
 * in ascending character order, indices n..2n-2 are the merged (internal)
 * nodes and the root is the last element. The caller owns the array and
 * releases it with free().
 * Returns NULL when n <= 0 or when the allocation fails.
 */
HuffmanTree createHuffmanTree(Letter *word, int n) {
    if (n <= 0) {
        return NULL; /* 2*n - 1 would be non-positive: nothing to build */
    }

    int total = 2 * n - 1;
    HuffmanTree HT = malloc(sizeof *HT * (size_t) total);
    if (HT == NULL) {
        return NULL;
    }

    /* Start every node detached (-1 = no link) with weight 0. */
    for (int i = 0; i < total; ++i) {
        HT[i].parent = HT[i].lChild = HT[i].rChild = -1;
        HT[i].weight = 0;
        HT[i].c = '\0';
    }

    /* Leaves: one per character that occurs; never write past the n leaf slots. */
    int leaf = 0;
    for (int j = 0; j < ASCII_LEN && leaf < n; ++j) {
        if (!word[j].n) {
            continue;
        }
        HT[leaf].weight = word[j].n;
        HT[leaf].c = (char) j;
        ++leaf;
    }

    /* Repeatedly merge the two lightest parentless nodes into a new node i. */
    for (int i = n; i < total; ++i) {
        int min1 = 0, min2 = 0;
        selectMinium(HT, i, &min1, &min2);
        HT[min1].parent = HT[min2].parent = i;
        HT[i].lChild = min1;
        HT[i].rChild = min2;
        HT[i].weight = HT[min1].weight + HT[min2].weight;
    }
    return HT;
}
/*
 * Derive the code of each of the n leaves (HT[0..n-1]) by walking from the
 * leaf up to the root: '0' is recorded when the node is its parent's left
 * child, '1' otherwise. The digits come out leaf-to-root, so a stack is used
 * to reverse them into root-to-leaf order.
 *
 * Returns a malloc'ed array of n entries. Entry i holds the code length (n),
 * the leaf's character (c) and the code digits as a NUL-terminated string
 * (data). The caller frees every data pointer and then the array itself.
 * Returns NULL when HT is NULL, n <= 0, or an allocation fails (nothing is
 * leaked in that case).
 */
HuffmanCode huffmanCoding(HuffmanTree HT, int n) {
    if (HT == NULL || n <= 0) {
        return NULL;
    }

    HuffmanCode HC = malloc((size_t) n * sizeof *HC);
    Stack s;
    /* a code for n leaves has at most n - 1 digits, so n slots are enough */
    s.data = malloc((size_t) n * sizeof(char));
    if (HC == NULL || s.data == NULL) {
        free(HC);
        free(s.data);
        return NULL;
    }

    for (int i = 0; i < n; ++i) {
        s.top = 0; /* top is the slot after the top element, i.e. the size */

        int current = i, father = HT[i].parent;
        while (father != -1) {
            s.data[s.top++] = (HT[father].lChild == current) ? '0' : '1';
            current = father;
            father = HT[father].parent;
        }
        s.n = s.top; /* remember the code length before popping */

        HC[i].n = s.n;
        HC[i].c = HT[i].c;
        /* +1 for the terminator: the code is later printed as a C string */
        HC[i].data = malloc((size_t) s.n + 1);
        if (HC[i].data == NULL) {
            while (i-- > 0) {
                free(HC[i].data);
            }
            free(HC);
            free(s.data);
            return NULL;
        }

        for (int j = 0; j < s.n; ++j) {
            HC[i].data[j] = s.data[--s.top];
        }
        HC[i].data[s.n] = '\0';
    }

    free(s.data);
    return HC;
}
//获取文本中字符的频次只支持ASCII字符
void getLetterFrequency() {
FILE *file = fopen(SOURCE_FILE, "r"); //只读方式打开文本文件
memset(word, 0, sizeof(word));
while (!feof(file)) {
++word[fgetc(file)].n; //按照其ascii码值存放频次
}
fclose(file);
}
/* Write one "<char>:<frequency> <code>" line for every character that occurs. */
static void writeCodeTable(FILE *out) {
    for (int i = 0; i < ASCII_LEN; ++i) {
        if (!word[i].n) {
            continue;
        }
        /* %.*s bounds the read by the code length, so no terminator is required */
        fprintf(out, "%c:%d %.*s\n", (char) i, word[i].n, word[i].bits,
                word[i].data ? word[i].data : "");
    }
}

/*
 * Replace every character read from in by its code from word[] and pack the
 * bits, most significant bit first, into bytes written to out. The last byte
 * is padded with zero bits on the right. Characters without a code are
 * skipped. Returns the number of characters encoded, or -1 on a write error.
 */
static long encodeStream(FILE *in, FILE *out) {
    unsigned char buf = 0x00;
    int used = 0; /* number of bits currently held in buf */
    long symbols = 0;
    int ch;

    while ((ch = fgetc(in)) != EOF) {
        if (ch < 0 || ch >= ASCII_LEN || word[ch].n == 0) {
            continue;
        }
        for (int i = 0; i < word[ch].bits; ++i) {
            buf = (unsigned char) (buf << 1); /* a 0 bit needs only the shift */
            if (word[ch].data[i] == '1') {
                buf |= 1;
            }
            if (++used == 8) {
                if (fwrite(&buf, sizeof buf, 1, out) != 1) {
                    return -1;
                }
                buf = 0x00;
                used = 0;
            }
        }
        ++symbols;
    }

    if (used > 0) { /* flush a partial byte only; no extra byte when aligned */
        buf = (unsigned char) (buf << (8 - used));
        if (fwrite(&buf, sizeof buf, 1, out) != 1) {
            return -1;
        }
    }
    return symbols;
}

/*
 * Decode exactly `symbols` characters from in by walking the tree from root:
 * a 1 bit goes to the right child, a 0 bit to the left child, and reaching a
 * leaf emits its character to out. Counting symbols keeps the padding bits of
 * the last byte from being decoded. Returns 0 on success, -1 when the input
 * ends early or a write fails.
 */
static int decodeStream(FILE *in, FILE *out, HuffmanTree ht, int root, long symbols) {
    unsigned char buf = 0x00;
    int avail = 0; /* unread bits left in buf */

    for (long s = 0; s < symbols; ++s) {
        int node = root;
        while (ht[node].lChild != -1 || ht[node].rChild != -1) {
            if (avail == 0) {
                if (fread(&buf, sizeof buf, 1, in) != 1) {
                    return -1;
                }
                avail = 8;
            }
            node = (buf & 0x80) ? ht[node].rChild : ht[node].lChild;
            buf = (unsigned char) (buf << 1);
            --avail;
        }
        if (fputc(ht[node].c, out) == EOF) {
            return -1;
        }
    }
    return 0;
}

/*
 * Program entry point: count character frequencies, build the Huffman tree
 * and codes, write the statistics table, compress SOURCE_FILE into CODE_FILE,
 * then decode CODE_FILE with the in-memory tree into RECODE_FILE.
 * Returns 0 on success and EXIT_FAILURE when a file or allocation fails.
 */
int main() {
    int status = EXIT_FAILURE;
    HuffmanTree ht = NULL;
    HuffmanCode hc = NULL;
    FILE *resultFile = NULL;
    FILE *sourceFile = NULL;
    FILE *encodeFile = NULL;
    FILE *recodeFile = NULL;
    long symbols = 0;

    getLetterFrequency();

    int noNull = 0; /* number of distinct characters that occur */
    for (int i = 0; i < ASCII_LEN; ++i) {
        if (word[i].n) {
            ++noNull;
        }
    }

    if (noNull > 0) { /* an empty input has no tree to build */
        ht = createHuffmanTree(word, noNull);
        hc = (ht != NULL) ? huffmanCoding(ht, noNull) : NULL;
        if (ht == NULL || hc == NULL) {
            fprintf(stderr, "failed to build the Huffman tree\n");
            goto cleanup;
        }
        /*
         * Copy the codes back into word[]. Leaves were created in ascending
         * character order, so the j-th code belongs to the j-th non-zero entry.
         * word[i].data only borrows the string; hc still owns it.
         */
        for (int i = 0, j = 0; i < ASCII_LEN && j < noNull; ++i) {
            if (word[i].n) {
                word[i].data = hc[j].data;
                word[i].bits = hc[j].n;
                ++j;
            }
        }
    }

    resultFile = fopen(HUFFMAN_RESULT_FILE, "w"); /* statistics table */
    if (resultFile == NULL) {
        perror(HUFFMAN_RESULT_FILE);
        goto cleanup;
    }
    writeCodeTable(resultFile);
    if (fclose(resultFile) != 0) {
        resultFile = NULL;
        perror(HUFFMAN_RESULT_FILE);
        goto cleanup;
    }
    resultFile = NULL;

    sourceFile = fopen(SOURCE_FILE, "r");
    if (sourceFile == NULL) {
        perror(SOURCE_FILE);
        goto cleanup;
    }
    encodeFile = fopen(CODE_FILE, "wb+"); /* binary, written then read back */
    if (encodeFile == NULL) {
        perror(CODE_FILE);
        goto cleanup;
    }
    symbols = encodeStream(sourceFile, encodeFile);
    if (symbols < 0) {
        perror(CODE_FILE);
        goto cleanup;
    }

    recodeFile = fopen(RECODE_FILE, "w");
    if (recodeFile == NULL) {
        perror(RECODE_FILE);
        goto cleanup;
    }
    /* a reposition is required when switching an update stream from writing to reading */
    if (fseek(encodeFile, 0, SEEK_SET) != 0) {
        perror(CODE_FILE);
        goto cleanup;
    }
    /* the root is the last node of the tree array */
    if (symbols > 0 && decodeStream(encodeFile, recodeFile, ht, 2 * noNull - 2, symbols) != 0) {
        fprintf(stderr, "failed to decode %s\n", CODE_FILE);
        goto cleanup;
    }

    status = 0;

cleanup:
    if (resultFile != NULL) {
        fclose(resultFile);
    }
    if (sourceFile != NULL) {
        fclose(sourceFile);
    }
    if (encodeFile != NULL && fclose(encodeFile) != 0) {
        status = EXIT_FAILURE;
    }
    if (recodeFile != NULL && fclose(recodeFile) != 0) {
        status = EXIT_FAILURE;
    }
    if (hc != NULL) {
        for (int k = 0; k < noNull; ++k) {
            free(hc[k].data);
        }
    }
    free(hc);
    free(ht);
    return status;
}
程序运行过程中没有终端输出,结果都记录在文件中。
code.dat相比源文件压缩了约一半的空间,具体压缩率取决于文本字符的统计分布。