前言:
在做这个项目之前,我有很大的抵触情绪,因为我认为这个任务已经完全超出了我的能力和认知范围,这个时候突然想到胡船长的一句话:当我们的生活遇到困难时,不应该总是抱怨生活有多么不公,而是应该想办法去改变这一现状。随着对问题的不断剖析,我逐渐地摸索出了一种解决方案.
Day1:
在做这个项目之前我参阅了很多有关中文编码的资料。当今计算机主要采用的还是UTF-8编码,虽然也有使用GBK编码的,但毕竟是少数。需要注意的是,在GBK编码下一个中文字符所占用的存储空间为两个字节(在UTF-8编码下常用汉字通常占用三个字节),这对于我们构建哈夫曼树的统计词频阶段造成了很大困难。而我们不难联想到short类型也是两个字节,也就是说在GBK编码下我们其实可以使用一个unsigned short类型的空间存储一个汉字信息。为此我还做了个小实验:
#include <stdio.h>
#include <stdlib.h>
/* Capacity, in 16-bit units, of the packing buffer used by the demo. */
#define MAX_N 256

/*
 * Demo: pack a byte string two bytes at a time into unsigned short units
 * (first byte in the high 8 bits, second byte in the low 8 bits), then
 * unpack the units and print the bytes back in their original order.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 */
int main(void) {
    unsigned char s[] = "我爱学习";
    unsigned short *ch = (unsigned short *)calloc(MAX_N, sizeof(unsigned short));
    if (ch == NULL) {
        return 1;
    }

    int cnt = 0;
    /*
     * Only consume complete byte pairs: checking s[i + 1] as well keeps the
     * index from stepping over the terminator when the byte count is odd,
     * and cnt < MAX_N keeps the writes inside the buffer.
     */
    for (int i = 0; cnt < MAX_N && s[i] && s[i + 1]; i += 2) {
        ch[cnt] = (unsigned short)((s[i] << 8) | s[i + 1]);
        cnt++;
    }

    for (int i = 0; i < cnt; ++i) {
        unsigned char a = (unsigned char)(ch[i] >> 8);   /* high byte */
        unsigned char b = (unsigned char)(ch[i] & 0xFF); /* low byte */
        printf("%c%c", a, b);
    }
    printf("\n");
    printf("\n");

    free(ch);
    return 0;
}
我采用了一个unsigned short类型的数组存储了一段中文字段。原理很简单:我们首先将一个汉字的第一个字节放入到unsigned short类型里面,然后立即左移8位,再将下一个字节的信息放到低字节。在打印的时候我也以这样的方式进行解码即可。
Day2:
首先,当我们输入多个模式串之后,我们将其放入到一个二维的unsigned short数组:
/* US: shorthand for unsigned short; convertToShortArr packs two input bytes into each US. */
typedef unsigned short US;
/* UC: shorthand for unsigned char, used for the raw bytes of the input strings. */
typedef unsigned char UC;
/*
 * Packs each of the str_cnt byte strings in str into a freshly allocated
 * array of US units: byte 2k goes into the high 8 bits of unit k and byte
 * 2k + 1 into its low 8 bits. A trailing unpaired byte is stored in the high
 * 8 bits with the low 8 bits left at zero.
 *
 * Each row is allocated with len + 1 zeroed units, so at least one zero unit
 * always follows the packed data.
 *
 * Assumes mystrlen returns the number of bytes before the terminating NUL
 * of a row -- TODO confirm against its definition.
 *
 * Returns an array of str_cnt row pointers, or NULL when str is NULL,
 * str_cnt is not positive, or an allocation fails (nothing is leaked in
 * that case). The caller owns the rows and the outer array and must free
 * them.
 */
US **convertToShortArr(UC (*str)[64], int str_cnt) {
    if (str == NULL || str_cnt <= 0) {
        return NULL;
    }

    US **short_arr = (US **)calloc((size_t)str_cnt, sizeof(US *));
    if (short_arr == NULL) {
        return NULL;
    }

    for (int i = 0; i < str_cnt; ++i) {
        int len = mystrlen(str[i]);
        if (len < 0) {
            len = 0;
        }

        short_arr[i] = (US *)calloc((size_t)len + 1, sizeof(US));
        if (short_arr[i] == NULL) {
            /* Roll back the rows allocated so far. */
            while (i-- > 0) {
                free(short_arr[i]);
            }
            free(short_arr);
            return NULL;
        }

        /*
         * Bounding by len (not only by the NUL byte) keeps an odd-length
         * row from stepping over its terminator and reading stale bytes.
         */
        for (int j = 0; j < len && str[i][j]; j += 2) {
            US high = str[i][j];
            US low = (j + 1 < len) ? str[i][j + 1] : 0;
            short_arr[i][j / 2] = (US)((high << 8) | low);
        }
    }
    return short_arr;
}
接下来我们统计词频就十分简单了,我想大家都是可以理解的。统计完词频之后我们就可以轻松加愉快地构建一棵哈夫曼树了。
/*
 * Exchanges the values of two lvalues of the same type.
 *
 * Wrapped in do { ... } while (0) so that `swap(x, y);` behaves as a single
 * statement everywhere, including an unbraced `if (...) swap(x, y); else ...`.
 * Arguments are parenthesized, and the temporary has an unlikely name so it
 * cannot shadow a caller variable called `temp`.
 * Each argument is evaluated more than once: do not pass expressions with
 * side effects. Relies on the GNU __typeof__ extension.
 */
#define swap(a, b) do { \
    __typeof__(a) swap_tmp_ = (a); \
    (a) = (b); \
    (b) = swap_tmp_; \
} while (0)
/* Node of the Huffman tree. */
typedef struct HFNode {
    /* Packed 16-bit symbol; extract() uses it as the row index of the code table. */
    unsigned short ch;
    /* Weight of the node; build() sets a merged node's freq to the sum of its two children. */
    int freq;
    /* Children; a node with both pointers NULL is treated as a leaf by extract(). */
    struct HFNode *lchild, *rchild;
} HFNode;
/*
 * Allocates a Huffman tree node with ch and freq set to 0 and both child
 * pointers set to NULL.
 *
 * Returns the new node, or NULL if the allocation fails. The caller owns
 * the node and is responsible for freeing it.
 */
HFNode *getNode() {
    HFNode *p = (HFNode *)malloc(sizeof(*p));
    if (p == NULL) {
        /* Do not touch the fields of a failed allocation. */
        return NULL;
    }
    p->ch = 0;
    p->freq = 0;
    p->lchild = NULL;
    p->rchild = NULL;
    return p;
}
/*
 * Returns the index of the first node with the smallest freq among
 * arr[0 .. count - 1]. Ties keep the lowest index.
 */
static int lightest_index(HFNode *arr[], int count) {
    int best = 0;
    for (int k = 1; k < count; k++) {
        if (arr[k]->freq < arr[best]->freq) {
            best = k;
        }
    }
    return best;
}

/*
 * Builds a Huffman tree in place from the n nodes in arr.
 *
 * Each round moves the lightest remaining node to the end of the live
 * range, moves the next lightest just before it, and replaces that pair
 * with a new parent whose freq is their sum. After n - 1 rounds the root
 * is left in arr[0]. Nothing happens when n < 2.
 */
void build(int n, HFNode *arr[]) {
    for (int live = n; live >= 2; live--) {
        const int last = live - 1;
        const int prev = live - 2;
        HFNode *held;

        /* Lightest node of the live range goes to the last slot. */
        int pick = lightest_index(arr, live);
        held = arr[pick];
        arr[pick] = arr[last];
        arr[last] = held;

        /* Lightest of the rest goes to the slot just before it. */
        pick = lightest_index(arr, last);
        held = arr[pick];
        arr[pick] = arr[prev];
        arr[prev] = held;

        /* Merge the pair; the parent takes over the second-to-last slot. */
        HFNode *parent = getNode();
        parent->lchild = arr[last];
        parent->rchild = arr[prev];
        parent->freq = arr[last]->freq + arr[prev]->freq;
        arr[prev] = parent;
    }
}
/*
 * Walks the Huffman tree and writes the code of every leaf into
 * huffman_code[leaf->ch] as a NUL-terminated string of '0' (left branch)
 * and '1' (right branch) characters.
 *
 * root         - subtree to walk; NULL is ignored.
 * buff         - scratch buffer holding the path from the tree root; it
 *                must have room for the longest code plus the terminator.
 * huffman_code - code table with one 100-byte row per symbol value.
 * n            - depth of root, i.e. number of path characters already in
 *                buff (pass 0 for the tree root).
 *
 * A tree that consists of a single leaf gets the one-bit code "0" instead
 * of an empty string, so its symbol can still be encoded. Codes that would
 * not fit in a table row (more than 99 characters) are not written.
 */
void extract(HFNode *root, char *buff, char (*huffman_code)[100], int n) {
    const int max_len = (int)sizeof(huffman_code[0]) - 1;

    if (root == NULL || n < 0 || n > max_len) {
        return ;
    }
    if (root->lchild == NULL && root->rchild == NULL) {
        if (n == 0) {
            /* Lone leaf at the root: give it a non-empty code. */
            buff[n++] = '0';
        }
        buff[n] = '\0';
        strcpy(huffman_code[root->ch], buff);
        return ;
    }
    if (n >= max_len) {
        /* Any code below this node would overflow a table row. */
        return ;
    }
    buff[n] = '0';
    extract(root->lchild, buff, huffman_code, n + 1);
    buff[n] = '1';
    extract(root->rchild, buff, huffman_code, n + 1);
    return ;
}
最终我得到了哈夫曼编码(演示程序):
我们通过得到的哈夫曼编码将原先的中文模式串编码为一个01串,然后插入到一棵二叉字典树之中。下面是二叉字典树的封装代码:
/* Number of children per trie node: the trie is keyed by '0'/'1' characters. */
#define BASE 2
/* Character that maps to child index 0. */
#define BEGIN_LETTER '0'

/*
 * Node of the binary trie that also serves as an Aho-Corasick automaton
 * state. Trie is a pointer to the root node.
 */
typedef struct Node {
    /* Set to 1 by insert() on the node where a pattern's code ends. */
    int flag;
    /* Buffer allocated by insert() holding the original pattern bytes; printed by match(). */
    unsigned char *str;
    /* Failure link, filled in by build_ac(). */
    struct Node *fail;
    /* Children, indexed by (character - BEGIN_LETTER). */
    struct Node *next[BASE];
} Node, *Trie;
/*
 * Allocates a zero-initialized trie node (flag 0, all pointer members
 * zeroed by calloc).
 *
 * Returns the new node, or NULL if the allocation fails. The caller owns
 * the node.
 */
Node *get_new_node() {
    /* calloc already zeroes every member, so no field store is needed
     * (and none is attempted on a failed allocation). */
    return (Node *)calloc(1, sizeof(Node));
}
/*
 * Recursively frees the trie rooted at root: every child subtree, the
 * pattern buffer attached to each node, and the node itself.
 * A NULL root is ignored.
 */
void clear(Trie root) {
    if (root == NULL) return ;
    for (int i = 0; i < BASE; i++) {
        clear(root->next[i]);
    }
    /* str is allocated by insert() on terminal nodes; free(NULL) is a no-op elsewhere. */
    free(root->str);
    free(root);
    return ;
}
/*
 * Inserts the '0'/'1' code string str into the trie and attaches a copy of
 * the original pattern bytes org to the node where the code ends.
 *
 * root - trie root; when NULL a new root is allocated.
 * str  - NUL-terminated code made of '0' and '1' characters.
 * org  - NUL-terminated original pattern bytes; at most 511 bytes are
 *        kept. NULL is stored as an empty string.
 *
 * Returns the (possibly newly created) root, or NULL if the root could not
 * be allocated. If str contains a character other than '0'/'1', or a node
 * or buffer allocation fails, the pattern is not registered and the root
 * is returned as is.
 */
Node *insert(Trie root, const char *str, UC *org) {
    enum { ORG_CAPACITY = 512 };

    if (root == NULL) root = get_new_node();
    if (root == NULL) return NULL;

    Node *p = root;
    for (int i = 0; str[i]; i++) {
        int ind = str[i] - BEGIN_LETTER;
        if (ind < 0 || ind >= BASE) return root; /* would index outside next[] */
        if (p->next[ind] == NULL) p->next[ind] = get_new_node();
        if (p->next[ind] == NULL) return root;
        p = p->next[ind];
    }

    /* Reuse the buffer when the same code is inserted again instead of leaking it. */
    if (p->str == NULL) {
        p->str = (unsigned char *)calloc(ORG_CAPACITY, sizeof(unsigned char));
        if (p->str == NULL) return root;
    }

    /*
     * Copy the pattern by its real length. sizeof(org) would only be the
     * size of a pointer, not the length of the pattern.
     */
    size_t k = 0;
    if (org != NULL) {
        while (k < ORG_CAPACITY - 1 && org[k] != 0) {
            p->str[k] = org[k];
            k++;
        }
    }
    p->str[k] = 0;

    p->flag = 1;
    return root;
}
/*
 * Reports whether the '0'/'1' code string str was inserted into the trie.
 *
 * Returns 1 when the path spelled by str exists and ends on a node whose
 * flag is set, 0 otherwise. A string containing any character other than
 * '0'/'1' cannot be in the trie and yields 0 instead of indexing outside
 * next[].
 */
int search(Trie root, const char *str) {
    Node *p = root;
    for (int i = 0; p != NULL && str[i] != '\0'; i++) {
        int ind = str[i] - BEGIN_LETTER;
        if (ind < 0 || ind >= BASE) return 0;
        p = p->next[ind];
    }
    return (p != NULL && p->flag);
}
/*
 * Fills str with a random string of 0 to 9 characters drawn from the BASE
 * characters starting at BEGIN_LETTER, followed by a NUL terminator.
 * str must have room for at least 10 bytes. Uses rand().
 */
void get_random_string(char *str) {
    const int length = rand() % 10;
    char *out = str;
    while (out < str + length) {
        *out++ = (char)(BEGIN_LETTER + rand() % BASE);
    }
    *out = 0;
}
/*
 * Depth-first dump of the trie: prints the path string of every node whose
 * flag is set.
 *
 * root  - subtree to print; NULL is ignored.
 * str   - scratch buffer holding the path from the trie root; it must be
 *         large enough for the deepest path plus the terminator.
 * level - depth of root, i.e. number of path characters already in str.
 */
void output(Trie root, char *str, int level) {
    if (!root) {
        return ;
    }

    /* Terminate the path at this depth before it is (possibly) printed. */
    str[level] = 0;
    if (root->flag != 0) {
        printf("find word : %s\n", str);
    }

    int branch = 0;
    while (branch < BASE) {
        Node *child = root->next[branch];
        if (child != NULL) {
            str[level] = branch + BEGIN_LETTER;
            output(child, str, level + 1);
        }
        branch++;
    }
}
void build_ac(Node* root) {
Node **queue = (Node**)calloc(10000, sizeof(Node*));
int head = 0, tail = 0;
queue[tail++] = root;
while (head < tail) {
Node* now = queue[head++];
for (int i = 0; i < 2; ++i) {
if (now->next[i]) {
Node *p = now->fail;
while (p && p->next[i] == NULL) p = p->fail;
if (p == NULL) now->next[i]->fail = root;
else now->next[i]->fail = p->next[i];
queue[tail++] = now->next[i];
}
}
}
}
/*
 * Scans the '0'/'1' string text with the automaton rooted at root and
 * prints "Find Word : <pattern>" for every flagged node found on the
 * failure chain of the state reached after each character.
 *
 * Failure links must have been built with build_ac() beforehand.
 * A character other than '0'/'1' cannot be part of any pattern: it resets
 * the automaton to root instead of indexing outside next[].
 */
void match(Node* root, const char* text) {
    Node *p = root;
    for (int i = 0; text[i]; ++i) {
        int ind = text[i] - BEGIN_LETTER;
        if (ind < 0 || ind >= BASE) {
            p = root;
            continue;
        }
        /* Fall back along failure links until a state can consume this character. */
        while (p && p->next[ind] == NULL) p = p->fail;
        if (p == NULL) p = root;
        else p = p->next[ind];
        /* Every flagged state on the failure chain is a pattern ending here. */
        Node *q = p;
        while (q) {
            if (q->flag) printf("Find Word : %s\n", q->str);
            q = q->fail;
        }
    }
}
在构建完字典树之后,我们通过输入一个母串就可以快速查询出母串中出现了多少子串。在这里我采用了AC自动机算法实现多模匹配:首先在得到一棵字典树之后,我通过层次遍历,为当前节点的每个子节点分配fail指针。在构建完fail指针之后,我们就可以对母串进行查询操作了,这个算法十分高效。
最终效果图:
完整的项目代码,我已经放到github上了(欢迎感兴趣的小朋友follow~)
github地址:https://github.com/RugerMcCarthy/ChineseHuffman_Trie