数据规模
模式串:127w
目标文本:750M(没统计有多少字符)
AC自动机
其实就是trie树模仿KMP算法,构建了一个fail指针,实现匹配失败后不用回溯直接转移。对于结点cur,构建其孩子结点child的fail指针的算法:
- 如果cur为根节点,设置child.fail为cur,否则下一步;
- 如果cur.fail存在与child字符相同的孩子节点fchild,设置child.fail为fchild,否则下一步;
- cur = cur.fail;
- 跳回第一步,重复以上过程,直到child.fail被设置为止;
问题记录
- 百万模式串,且字符不局限于英文字母(汉字占2字节,用2个char存储),所以用孩子兄弟法存储trie树,不预留孩子空间。数据结构:
// One trie node, stored in first-child / next-sibling form so that no
// per-node child array has to be reserved.
typedef struct _NODE {
    char c;               // byte carried by this node
    unsigned int isWord;  // 0 while no pattern ends on this node
    struct _NODE *child;  // first child
    struct _NODE *bro;    // next sibling
    struct _NODE *fail;   // fail pointer of the AC automaton
} node;
- AC自动机成功匹配一个单词以后!
当前结点匹配成功的话,它的fail结点对应的字符串也一定出现在文本中,即fail结点结尾的模式串(如果该结点是某个模式串的结尾)是当前结点结尾的模式串的后缀。
- 孩子兄弟表示法的树的广搜
原想偷个懒找个现成的实现,结果一时没找到。一开始想写成非递归的:按照传统的队列辅助方式,如果要在遍历时记录从根到当前结点的路径,就要记录每层压入的结点数。所以最后还是用递归实现了(注意:下面的函数虽然叫bfs,实际是递归的深度优先遍历)…
// Writes every pattern stored at or below `cur` to `fp`, one line per
// pattern in the form "<pattern> <isWord>".
//
// Despite the name this is a recursive depth-first (pre-order) walk of the
// first-child / next-sibling tree. The bytes on the path from the tree root
// to the current node are kept in a static buffer, so the function is not
// reentrant and is meant to be started on the tree root.
//
// NOTE(review): insert() stores 1 in isWord and queryInFile() adds one per
// match, so the number written is one more than the match count -- confirm
// that this is the intended output.
//
// cur: node to start from; NULL is ignored.
// fp:  output stream opened for writing; NULL is ignored.
void bfs(struct _NODE *cur, FILE *fp)
{
    static char stack[100];  // bytes on the path from the root to `cur`
    static int top = 0;      // number of bytes currently held in `stack`
    int pushed = 0;          // whether this call added a byte to `stack`

    if (cur == NULL || fp == NULL)
        return;

    if (cur->c != '\0') {
        // One slot must stay free for the terminator. A deeper path cannot
        // be represented, so its subtree is skipped instead of writing past
        // the end of the buffer.
        if (top >= (int)sizeof stack - 1)
            return;
        stack[top++] = cur->c;
        pushed = 1;
        if (cur->isWord > 0) {
            stack[top] = '\0';
            // isWord is unsigned int, so the matching conversion is %u.
            fprintf(fp, "%s %u\n", stack, cur->isWord);
        }
    }

    for (struct _NODE *kid = cur->child; kid != NULL; kid = kid->bro)
        bfs(kid, fp);

    // Undo this node's byte before returning to the parent.
    if (pushed)
        --top;
}
- 百分比进度显示
其实这个和AC自动机没什么关系。我以前没有写过数据集这么大的程序,匹配目标文本的时候我还以为是卡住了…所以写了个显示百分比进度的功能,大概思路就是利用文件指针的索引位置计算百分比,程序每循环一定次数显示(刷新)一次。
但是发现,加了这个功能之后,程序的运行速度明显慢了大概五分之一,应该是比较语句和printf的速度慢于文件读取速度的问题。
网上也没有找到完美的方法,我猜测应该是用多线程写,但是多线程想要知道主线程的进度就要用临界区变量,感觉这个开销更大啊。
待考!
源码
因为要求用C语言写,所以自己写了个queue,就是一个常规的queue,包含push、pop、empty啥的,懒得贴代码了。
// One trie node, stored in first-child / next-sibling form so that no
// per-node child array has to be reserved.
typedef struct _NODE {
    char c;               // byte carried by this node; the root keeps '\0'
    unsigned int isWord;  // 0 while no pattern ends here; insert() sets it to 1
    struct _NODE *child;  // first child
    struct _NODE *bro;    // next sibling
    struct _NODE *fail;   // fail pointer of the AC automaton
} node;
// Handle for one trie.
typedef struct _TREE {
// Root node; allocated and zero-filled by init().
struct _NODE *root;
// Set to 0 by init(); not updated anywhere else in the visible code.
int size;
}tree;
// Global flag: cleared by init() and set by insertFromFile() once the patterns
// are loaded and the fail pointers are built; queryInFile() refuses to run
// while it is false.
bool isReady;
// Initialises `tree` as an empty trie: allocates a zero-filled root node,
// resets the size to 0 and clears the global isReady flag.
//
// tree: handle to initialise; NULL is ignored.
//
// If the root cannot be allocated, tree->root is left NULL (the previous code
// passed the NULL pointer straight to memset). Callers that need to detect
// this should check tree->root afterwards.
void init(struct _TREE *tree)
{
    if (tree == NULL)
        return;

    // calloc hands back zeroed memory, which is what malloc + memset produced
    // before: c == '\0', isWord == 0 and all links cleared.
    tree->root = calloc(1, sizeof *tree->root);
    tree->size = 0;
    isReady = false;
}
// Looks through the children of `cur` for the one carrying byte `c`.
//
// cur: parent node; may be NULL.
// c:   byte to look for.
//
// Returns the matching child, or NULL when `cur` is NULL, has no children,
// or none of its children carries `c`.
struct _NODE *findNext(struct _NODE *cur, char c)
{
    struct _NODE *sibling;

    if (cur == NULL)
        return NULL;

    // Children form a singly linked list: first child, then its `bro` chain.
    for (sibling = cur->child; sibling != NULL; sibling = sibling->bro) {
        if (sibling->c == c)
            return sibling;
    }
    return NULL;
}
// Appends a new child carrying byte `c` to `cur` and returns it.
//
// The new node is zero-filled apart from `c` and is linked at the end of the
// sibling list, so children keep their insertion order. No check is made for
// an existing child with the same byte; insert() calls findNext() first.
//
// cur: parent node; may be NULL.
// c:   byte for the new child.
//
// Returns the new node, or NULL when `cur` is NULL or the allocation fails.
// On failure the tree is left unchanged.
struct _NODE *putNext(struct _NODE *cur, char c)
{
    if (cur == NULL)
        return NULL;

    // Allocate before touching any link so that a failed allocation cannot
    // leave a NULL hooked into the tree or be written through.
    struct _NODE *fresh = calloc(1, sizeof *fresh);
    if (fresh == NULL)
        return NULL;
    fresh->c = c;

    if (cur->child == NULL) {
        cur->child = fresh;
    } else {
        struct _NODE *last = cur->child;
        while (last->bro != NULL)
            last = last->bro;
        last->bro = fresh;
    }
    return fresh;
}
// Inserts the string data + index into the trie rooted at `cur`, creating
// missing nodes on the way, and sets isWord to 1 on the node where the
// string ends.
//
// cur:   node to start from (normally the tree root); NULL is ignored.
// data:  NUL-terminated pattern; NULL is ignored.
// index: offset in `data` of the first byte to insert.
//
// If a node cannot be created the function returns early: the part of the
// path built so far stays in the tree but nothing is marked as a pattern.
void insert(struct _NODE *cur, const char *data, int index)
{
    if (cur == NULL || data == NULL)
        return;

    // Iterative form of the original tail recursion: one step per byte.
    for (; data[index] != '\0'; ++index) {
        struct _NODE *next = findNext(cur, data[index]);
        if (next == NULL)
            next = putNext(cur, data[index]);
        if (next == NULL)
            return;
        cur = next;
    }
    cur->isWord = 1;
}
// Meant to release the memory held by the trie; the author left it
// unimplemented because it was too much trouble.
// As written it is a no-op: `tree` is not touched and no node is freed.
void clear(struct _TREE *tree)
{
}
//更新fail指针
void updateFail(struct _NODE *root)
{
if (root == NULL)
return;
myQueue queue;
queueInit(&queue);
if (root->c == '\0') {//root为树的根节点
struct _NODE *tmp = root->child;
while (tmp != NULL) {
tmp->fail = root;
push(tmp, &queue);
tmp = tmp->bro;
}
}
while (!empty(&queue)) {
root = (node *)pop(&queue);
if (root->child == NULL)
continue;
struct _NODE *ffail, *tmp = root->child, *res;
while (tmp != NULL) {
ffail = root->fail;
while ((res = findNext(ffail, tmp->c)) == NULL && ffail->fail != NULL)
ffail = ffail->fail;
if (res != NULL)//上面的while循环条件的与运算符会先判断左边的表达式,所以这里要先判断while是否是因为左边的表达式为假退出的
tmp->fail = res;
else//此情况下ffail->fail == NULL,表明ffail为root
tmp->fail = ffail;
push(tmp, &queue);
tmp = tmp->bro;
}
}
queueClear(&queue);
isUpdateFail = true;
}
// Reads one pattern per line from `fileName`, inserts each into `tree`, then
// builds the fail pointers and sets the global isReady flag.
//
// tree:     trie prepared with init().
// fileName: path of the pattern file.
//
// Returns 0 on success and -1 when `tree` has no root or the file cannot be
// opened. (The function used to fall off the end, or execute a bare
// `return;`, without producing a value.)
//
// Each line is read into a 100-byte buffer; a longer line is seen as several
// separate chunks and each chunk is inserted as its own pattern.
// NOTE(review): the file is opened in text mode, where ftell() offsets are
// only loosely defined by the C standard; the progress figure is best-effort.
int insertFromFile(struct _TREE *tree, const char *fileName)
{
    if (tree == NULL || tree->root == NULL)
        return -1;

    clear(tree);
    FILE *fp = fopen(fileName, "r");
    if (fp == NULL) {
        printf("error when opening file \"%s\"\n", fileName);
        return -1;
    }
    printf("已打开\"%s\",开始读取模式串...\n", fileName);

    fseek(fp, 0, SEEK_END);
    long int total = ftell(fp);  // file length, used for the progress figure
    rewind(fp);

    int i = 0;
    char buffer[100];
    // Loop on the result of fgets itself. Testing feof() first runs the body
    // once more after the last line, with the previous line still in the
    // buffer, which used to chop a real byte off it and insert the remainder.
    while (fgets(buffer, sizeof buffer, fp) != NULL) {
        // Cut at the line ending only if there is one, so a final line
        // without a newline keeps its last byte.
        buffer[strcspn(buffer, "\r\n")] = '\0';
        if (buffer[0] != '\0')
            insert(tree->root, buffer, 0);
        if (++i > 10000) {
            i = 0;
            if (total > 0)
                printf("%5.2f\r", (float)ftell(fp) / total);
        }
    }
    fclose(fp);
    printf("模式串读取完毕! \n");
    printf("开始构建fail指针...\r");
    updateFail(tree->root);
    printf("构建fail指针完毕! \n");
    isReady = true;
    return 0;
}
// Scans the text in `fileName` with the automaton in `tree` and increments
// isWord on every pattern node that matches at the current position.
//
// tree:     trie whose patterns and fail pointers have been built.
// fileName: path of the text to scan.
//
// Does nothing except print a message when isReady is false or the file
// cannot be opened. A newline in the text resets matching to the root.
//
// Differences from the earlier version:
//  - the byte is read into an int and compared with EOF, so the end-of-file
//    marker is no longer fed to the automaton as a character;
//  - on a mismatch the fail chain is followed as far as needed, instead of
//    one step followed by a reset to the root;
//  - after a transition the whole fail chain is walked, so a shorter pattern
//    is still counted when a non-pattern node sits between it and the
//    current node;
//  - the progress counter counts every byte read, not only matched ones.
//
// NOTE(review): the file is opened in text mode, where ftell() offsets are
// only loosely defined by the C standard; the progress figure is best-effort.
void queryInFile(struct _TREE *tree, const char *fileName)
{
    if (!isReady) {
        printf("请先读取模式串!\n");
        return;
    }
    FILE *fp = fopen(fileName, "r");
    if (fp == NULL) {
        printf("error when opening file \"%s\"\n", fileName);
        return;
    }
    printf("已打开\"%s\",开始匹配...\n", fileName);

    fseek(fp, 0, SEEK_END);
    long int total = ftell(fp);  // file length, used for the progress figure
    rewind(fp);

    struct _NODE *const root = tree->root;
    struct _NODE *cur = root;
    int i = 0;
    int ch;
    while ((ch = fgetc(fp)) != EOF) {
        if (++i > 1000000) {
            i = 0;
            if (total > 0)
                printf("%5.2f\r", (float)ftell(fp) / total);
        }

        if (ch == '\n') {
            cur = root;
            continue;
        }

        const char c = (char)ch;
        struct _NODE *next = findNext(cur, c);
        // Mismatch: fall back along the fail chain until some state accepts
        // the byte or the root has been tried.
        while (next == NULL && cur != root) {
            cur = (cur->fail != NULL) ? cur->fail : root;
            next = findNext(cur, c);
        }
        if (next == NULL)
            continue;  // not even the root accepts this byte; cur is the root
        cur = next;

        // Every node on the fail chain spells a suffix of the text read so
        // far; count each one that ends a pattern.
        for (struct _NODE *hit = cur; hit != NULL && hit != root; hit = hit->fail) {
            if (hit->isWord != 0)
                ++hit->isWord;
        }
    }
    fclose(fp);
    printf("目标文本匹配完毕!\n");
}
// Writes every pattern stored at or below `cur` to `fp`, one line per
// pattern in the form "<pattern> <isWord>".
//
// Despite the name this is a recursive depth-first (pre-order) walk of the
// first-child / next-sibling tree. The bytes on the path from the tree root
// to the current node are kept in a static buffer, so the function is not
// reentrant and is meant to be started on the tree root.
//
// NOTE(review): insert() stores 1 in isWord and queryInFile() adds one per
// match, so the number written is one more than the match count -- confirm
// that this is the intended output.
//
// cur: node to start from; NULL is ignored.
// fp:  output stream opened for writing; NULL is ignored.
void bfs(struct _NODE *cur, FILE *fp)
{
    static char stack[100];  // bytes on the path from the root to `cur`
    static int top = 0;      // number of bytes currently held in `stack`
    int pushed = 0;          // whether this call added a byte to `stack`

    if (cur == NULL || fp == NULL)
        return;

    if (cur->c != '\0') {
        // One slot must stay free for the terminator. A deeper path cannot
        // be represented, so its subtree is skipped instead of writing past
        // the end of the buffer.
        if (top >= (int)sizeof stack - 1)
            return;
        stack[top++] = cur->c;
        pushed = 1;
        if (cur->isWord > 0) {
            stack[top] = '\0';
            // isWord is unsigned int, so the matching conversion is %u.
            fprintf(fp, "%s %u\n", stack, cur->isWord);
        }
    }

    for (struct _NODE *kid = cur->child; kid != NULL; kid = kid->bro)
        bfs(kid, fp);

    // Undo this node's byte before returning to the parent.
    if (pushed)
        --top;
}
// Writes all patterns of `tree` with their isWord counters to `fileName`,
// using bfs() to walk the trie.
//
// tree:     trie to dump; nothing is written when it is NULL, has no root,
//           or the root has no children.
// fileName: path of the output file; it is created or truncated.
//
// A message is printed when the file cannot be opened. A second message is
// printed when writing or closing fails, so that a short write (for example
// a full disk) is no longer reported as a successful save.
void printToFile(struct _TREE *tree, const char *fileName)
{
    if (tree == NULL || tree->root == NULL || tree->root->child == NULL)
        return;

    printf("正在保存匹配结果...\r");
    FILE *fp = fopen(fileName, "w");
    if (fp == NULL) {
        printf("error when opening file \"%s\"\n", fileName);
        return;
    }

    bfs(tree->root, fp);

    // Buffered output may only fail when it is flushed, so both the stream's
    // error flag and the result of fclose have to be checked.
    int failed = ferror(fp);
    if (fclose(fp) != 0)
        failed = 1;
    if (failed) {
        printf("error when writing file \"%s\"\n", fileName);
        return;
    }
    printf("匹配结果已保存到\"%s\"!\n", fileName);
}
入口main函数:
// Entry point: builds the automaton from the pattern file, scans the target
// text, writes the counters to the result file, then runs the shell command
// "pause" before exiting. The command-line arguments are not used.
int main(int argc, char **argv)
{
    const char *const patternFile = "patterns-127w.txt";
    const char *const textFile = "content.txt";
    const char *const resultFile = "result_unsorted.txt";
    tree trie;

    (void)argc;
    (void)argv;

    init(&trie);
    insertFromFile(&trie, patternFile);
    queryInFile(&trie, textFile);
    printToFile(&trie, resultFile);

    system("pause");
    return 0;
}