字符串匹配之AC自动机

最新推荐文章于 2024-06-04 08:36:20 发布

jinnlxl

最新推荐文章于 2024-06-04 08:36:20 发布

阅读量783

点赞数

分类专栏： algorithms

本文链接：https://blog.csdn.net/yejing_utopia/article/details/39485739

版权

algorithms 专栏收录该内容

39 篇文章 1 订阅

订阅专栏

/*
file:Aho-Corasick automaton.c
brief:该算法于1975年产生于贝尔实验室，是著名的多模式匹配算法之一。
一个常见的使用场景是：给出n个单词，再给出一段包含m个字符的文章，找出有多少个单词在文章里出现过。
这个Aho就是Alfred V. Aho，龙书的第一作者。
author:yejing
date:2014.08.26
ver:1(create the file 8/26)
test pc:ubuntu 12.14
*/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/* Number of child slots per node; slots are indexed by (character - 'a'). */
#define ALPHABET_NUM 26

/*
 * One node of the pattern trie / automaton.
 *
 * The self-referential members must name the tag declared here (NODE).
 * Spelling them "struct node" would silently introduce a second, incomplete
 * type and make every dereference through these members ill-formed.
 */
typedef struct NODE{
	struct NODE* fail;                 /* failure link (fallback state on a mismatch) */
	struct NODE* next;                 /* intrusive link used to chain nodes into a list/queue */
	struct NODE* child[ALPHABET_NUM];  /* outgoing edges, NULL where no edge exists */
	int    words_num;                  /* number of patterns that end at this node */
}node_t;

/*
 * FIFO of nodes. The queue does not store nodes by value: it only keeps
 * pointers to the first and last element, and the elements themselves are
 * chained through node_t.next.
 */
typedef struct {
	node_t* front;  /* oldest queued node, NULL when the queue is empty */
	node_t* rear;   /* most recently queued node, NULL when the queue is empty */
}queue_t;

/*
 * Allocate an empty queue.
 *
 * Returns the new queue, or NULL when the allocation fails. The caller owns
 * the returned object.
 */
static queue_t* create_queue(void){
	queue_t* queue = malloc(sizeof *queue);

	/* Do not touch the members when malloc failed; report it instead. */
	if(!queue)
		return NULL;

	queue->front = NULL;
	queue->rear = NULL;

	return queue;
}

/*
 * Destroy a queue object. A NULL argument is a no-op.
 *
 * Only the queue container is freed. Nodes that are still linked in are
 * merely detached: the nodes placed on the queue in this file are trie nodes,
 * which the trie teardown frees, so freeing them here as well would be a
 * double free.
 */
static void des_queue(queue_t* queue){
	if(!queue)
		return;

	/* Unlink whatever is still queued so no stale chain is left behind. */
	while(queue->front){
		node_t* tmp = queue->front;
		queue->front = tmp->next;
		tmp->next = NULL;
	}
	queue->rear = NULL;

	free(queue);
}

/*
 * Evaluates to 1 when the queue pointed to by `queue` holds no element and to
 * 0 otherwise. Emptiness is "front is NULL"; comparing front with rear would
 * misreport a queue that holds exactly one element. The argument is
 * parenthesized and evaluated once.
 */
#define is_queue_empty(queue) (((queue)->front == NULL) ? 1 : 0)

/*
 * Remove and return the oldest node of the queue.
 *
 * Returns NULL when `queue` is NULL or holds no element. The returned node is
 * detached (its next link is cleared).
 */
static node_t* deque(queue_t* queue){
	if(!queue || !queue->front)
		return NULL;

	node_t* head = queue->front;
	queue->front = head->next;

	/* The last element was taken: rear must not keep pointing at it. */
	if(!queue->front)
		queue->rear = NULL;

	head->next = NULL;

	return head;
}

/*
 * Append `node` at the back of the queue. Does nothing when either argument
 * is NULL. The node is linked in through its own next member, so a node can
 * be on at most one queue at a time.
 */
static void enque(queue_t* queue, node_t* node){
	if(!queue || !node)
		return;

	node->next = NULL;

	if(queue->rear)
		queue->rear->next = node;   /* non-empty: chain behind the current tail */
	else
		queue->front = node;        /* empty: the node is also the new front */

	queue->rear = node;
}

/*
 * Compute the failure link (node_t.fail) of every node in the trie rooted at
 * `trie_root`, visiting the nodes breadth-first.
 *
 * - The root gets fail == NULL.
 * - A child of the root falls back to the root.
 * - Any other node reached from `parent` over letter i falls back to the
 *   i-child of the first node on parent's failure chain that has such a
 *   child, or to the root when there is none.
 *
 * The breadth-first work-list is threaded through node_t.next, so nothing is
 * allocated here; on return the next members still chain all nodes in
 * visiting order. Does nothing when `trie_root` is NULL.
 */
void make_fail_by_bfs(node_t* trie_root){
	if(!trie_root)
		return;

	node_t* head = trie_root;   /* node currently being expanded */
	node_t* tail = trie_root;   /* last node appended to the work-list */

	trie_root->fail = NULL;
	trie_root->next = NULL;

	while(head){
		int i;

		for(i = 0; i < ALPHABET_NUM; ++i){
			node_t* child = head->child[i];
			node_t* fallback;

			if(!child)
				continue;

			/* Walk the parent's failure chain until a node has an i-edge. */
			fallback = head->fail;
			while(fallback && !fallback->child[i])
				fallback = fallback->fail;

			child->fail = fallback ? fallback->child[i] : trie_root;

			/* Append the child so its own children are handled later. */
			child->next = NULL;
			tail->next = child;
			tail = child;
		}

		head = head->next;
	}
}


/*
 * Zero-fill the node that `p_node` points to, so its link members, child
 * slots and word counter all start out as zero.
 */
static void init_single_node(node_t *p_node){
	memset(p_node, 0, sizeof *p_node);
}

/*
 * Insert the first `len` characters of `buf` into the trie as one pattern.
 *
 * buf       pattern text; only the letters 'a'..'z' are supported
 * len       number of characters to insert
 * trie_root root of the trie to extend
 *
 * The call is ignored when an argument is NULL, when `len` is not positive,
 * or when the pattern contains a character outside 'a'..'z' (such a character
 * would index outside the child array). Missing nodes are created on the way
 * down and the counter of the node where the pattern ends is incremented.
 * If a node cannot be allocated the insertion is abandoned; nodes created so
 * far stay in the trie and are released with it.
 */
static void trie_insert(char* buf, int len, node_t* trie_root){
	if(!buf || !trie_root || len <= 0)
		return;

	int i;

	/* Validate first so a bad pattern never leaves a partial path behind. */
	for(i = 0; i < len; ++i){
		if(buf[i] < 'a' || buf[i] > 'z')
			return;
	}

	node_t* tmp = trie_root;

	for(i = 0; i < len; ++i){
		int index = buf[i] - 'a';

		if(!tmp->child[index]){
			node_t* fresh = malloc(sizeof *fresh);
			if(!fresh)
				return;
			init_single_node(fresh);
			tmp->child[index] = fresh;
		}
		tmp = tmp->child[index];
	}

	/* The pattern ends at the last node reached, not at the root. */
	tmp->words_num++;
}

/*
 * Release a whole (sub)trie: everything reachable through the child slots is
 * freed first, then the node itself. A NULL argument is a no-op.
 */
static void des_trie(node_t* trie_root){
	int slot = 0;

	if(trie_root == NULL)
		return;

	/* Empty slots need no test here: the callee returns at once on NULL. */
	while(slot < ALPHABET_NUM){
		des_trie(trie_root->child[slot]);
		++slot;
	}

	free(trie_root);
}

/*
 * Interactively build a trie: read a pattern count from standard input, then
 * read that many whitespace-delimited patterns and insert each one.
 *
 * Returns the root of the new trie, or NULL when the root cannot be
 * allocated. The caller owns the trie. An unreadable or non-positive count
 * yields an empty trie; reading stops early when input runs out.
 */
static node_t* build_trie(void){
	/* Longest pattern accepted in one read; must match the %255s width below. */
	enum { PATTERN_MAX = 255 };

	int pattern_num = 0;
	char tmp[PATTERN_MAX + 1];

	memset(tmp, 0, sizeof tmp);

	printf("please input the total pattern number \n");
	if(scanf("%d", &pattern_num) != 1)
		pattern_num = 0;
	getchar();   /* consume the character that follows the number */

	node_t* trie_root = malloc(sizeof *trie_root);
	if(!trie_root)
		return NULL;
	init_single_node(trie_root);

	/* "> 0" keeps a negative count from looping almost forever. */
	while(pattern_num-- > 0){
		printf("please input a pattern \n");
		/* The field width bounds the write so tmp cannot overflow. */
		if(scanf("%255s", tmp) != 1)
			break;
		getchar();
		trie_insert(tmp, (int)strlen(tmp), trie_root);
	}

	return trie_root;
}

/*
 * Scan the first `len` characters of `buf` with the automaton rooted at
 * `trie_root` and count the patterns that occur in it.
 *
 * buf       text to scan
 * len       number of characters of `buf` to scan
 * trie_root root of a trie whose failure links (node_t.fail) are already set
 *
 * Returns the sum of the word counters of all nodes matched during the scan,
 * or 0 when `buf` or `trie_root` is NULL. Each counter is cleared once it has
 * been added, so a pattern is counted on its first occurrence only and the
 * trie is modified by the call. A character outside 'a'..'z' cannot be part
 * of any pattern and restarts matching from the root.
 */
static int aho_corasick_main(char* buf, int len, node_t* trie_root){
	if(!buf || !trie_root)
		return 0;

	int i, match_num = 0;
	node_t* state = trie_root;

	for(i = 0; i < len; ++i){
		int idx = buf[i] - 'a';
		node_t* hit;

		if(idx < 0 || idx >= ALPHABET_NUM){
			state = trie_root;
			continue;
		}

		/* Fall back along failure links until some state has an idx-edge. */
		while(state && state != trie_root && !state->child[idx])
			state = state->fail;
		if(!state)
			state = trie_root;   /* missing failure link: restart at the root */

		if(state->child[idx])
			state = state->child[idx];

		/* Every node on the failure chain is a suffix that also ends here. */
		for(hit = state; hit && hit != trie_root; hit = hit->fail){
			match_num += hit->words_num;
			hit->words_num = 0;
		}
	}

	return match_num;
}

/*
 * Program entry: read the patterns, build the automaton, read one line of
 * text from standard input and print how many patterns occur in it.
 *
 * Returns 0 on success and 1 when memory cannot be allocated.
 */
int main(int argc, char* argv[])
{
	enum { TEXT_MAX = 9527 };   /* size of the text buffer, terminator included */

	node_t* trie_node;
	char *tmp_buf;
	int match_num;

	(void)argc;
	(void)argv;

	tmp_buf = malloc(TEXT_MAX);
	if(!tmp_buf)
		return 1;

	trie_node = build_trie();
	if(!trie_node){
		free(tmp_buf);
		return 1;
	}
	make_fail_by_bfs(trie_node);

	printf("please input the text \n");
	if(!fgets(tmp_buf, TEXT_MAX, stdin))
		tmp_buf[0] = '\0';
	tmp_buf[strcspn(tmp_buf, "\n")] = '\0';   /* drop the newline fgets keeps */

	match_num = aho_corasick_main(tmp_buf, (int)strlen(tmp_buf), trie_node);
	printf("%d pattern(s) matched \n", match_num);

	des_trie(trie_node);
	free(tmp_buf);

	return 0;
}

jinnlxl

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
字符串匹配之AC自动机

/*file:Aho-Corasick automaton.cbrief:该算法在1975年产生于贝尔实验室，是著名的多模匹配算法之一,一个常见的使用场景就是给出n个单词，再给出一段包含m个字符的文章，让你找出有多少个单词在文章里出现过。这个Aho就是Alfred V.Aho，龙书的第一作者。。。。 auther:yejingdata:2014.08.26ver:1(create t
复制链接

扫一扫

专栏目录