在利用字典树判断文章相似度的基础上，能否使用ac自动机来优化时间

wang_xiaoniao

于 2020-07-01 11:24:31 发布

阅读量281

点赞数

分类专栏：算法 # 字典树文章标签： AC自动机 trie

本文链接：https://blog.csdn.net/wang_xiaoniao/article/details/107057423

版权

算法同时被 2 个专栏收录

2 篇文章 0 订阅

订阅专栏

字典树

2 篇文章 0 订阅

订阅专栏

利用字典树判断两篇文章相似度时，大致思路是这样的。文章中的某个单词（注意，它是一个整体。这个单词中的任意一连续的部分如果在字典树中存在，都不计数）如果在字典树中出现，就令该单词在这篇文章中的出现次数+1。这样可得到每篇文章里各个单词出现的数量，然后判断高频单词是否类似即可。

ac自动机是什么

那么如果使用ac自动机的话，首先什么是ac自动机。ac自动机的本质就是，在字典树的基础上，对其每个节点都添加fail指针。那么当在某个节点之后查询不到待匹配的字母时，就跳到fail指针指向的节点继续匹配，这样这棵字典树只要匹配一遍字符串文本就可以得到答案，消除了原本只使用字典树时不可避免的回溯。

如果不了解字典树和ac自动机，可以看下面这个b站视频，讲的非常好，听一遍就能学肥 ac自动机—b站

使用了ac自动机后会如何匹配

举个例子，P={“is” ,“this”,“his”}，T=“this is me”。对P建立字典树并添加fail指针，如下图所示。
在这里插入图片描述

希望的结果是is出现1次，this出现1次，但如果用ac自动机来判断的话，is会出现两次，his也出现了一次。因为T中的this里也包含了is和his，就导致结果出现偏差。

因此，我觉得，给字典树加上fail指针后，优化不了时间，并且会增加时间消耗。原因有两点：

字典树的节点添加fail指针和vector向量后，变得更复杂。
本来只使用字典树时，带匹配的文本可以被认为是由一个个单词组成，此时单词是作为一个整体去字典树里匹配，单词不可拆散，某个单词的字母匹配失败时，就认为该单词不存在，匹配下一个单词。而加上ac自动机的话，单词没法作为一个整体去匹配，带匹配的文本相当于是由一个个字符组成，如果某个单词的字母匹配失败，就会跳到fail指针指向的节点继续匹配，这样的话单词已被拆散

添加了ac自动机的代码

如果硬要加上ac自动机查看最后运行效果的话，代码如下

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <queue>
#include <vector>

using namespace std;

#define MAX 26
#define swapnum(a,b) {int temp=a;a=b;b=temp;}

#define TIMEA
#define _CRT_SECURE_NO_WARNINGS；
#pragma warning(disable:4996)；
//字典树的结构体
typedef struct _TrieNode
{
	int cnt1;//以该节点为底的单词在article1中出现的次数
	int cnt2;//以该节点为底的单词在article2中出现的次数
	vector<int> exist;//记录以该节点为底的单词的长度
	struct _TrieNode *fail;//失配指针
	struct _TrieNode *next[MAX];//指针数组
	_TrieNode()//构造函数
	{
		cnt1 = 0;
		cnt2 = 0;
		fail = NULL;
		memset(next, NULL, MAX * sizeof(struct _TrieNode *));
	}
}TrieNode, *TrieNodeptr;

//保存单词的结构体
typedef struct Node
{
	int flag;//该单词在另一篇文章中是否出现
	int num;
	char word[50];
}Node, *Nodeptr;

Node words1[400000];//结构体数组 存所有词
Node words2[400000];
TrieNodeptr createTrieNode();
void insertTrie(TrieNodeptr pRoot, char *str);
int searchTrie1(TrieNodeptr root, char *str);
int searchTrie2(TrieNodeptr root, char *str);
void deleteTrie(TrieNodeptr root, char *str);
int getword1();
int getword2();
void countSort1(int exp);
void countSort2(int exp);
void radixSort1();
void radixSort2();
void swapchar(char *s1, char *s2);
void TraverseTrie1(TrieNodeptr root);
void TraverseTrie2(TrieNodeptr root);
void TraverseTrie(TrieNodeptr root);//用于检查字典树的遍历函数
void switchstru(Nodeptr a, Nodeptr b);
int ReadFile(char *path, char data[]);
void ac_build(TrieNodeptr root);
void ac_query1(TrieNodeptr root, const char *T);
void ac_query2(TrieNodeptr root, const char *T);

int sum1;//article1里所有不重复单词的数量
int sum2;
Node output1[400000];
Node output2[400000];
char word[50];//某个单词
FILE *indic, *instp, *in1, *in2;
FILE *out;
char dic[7000000];//保存字典里所有的单词
char stp[7000000];//
char art1[7000000];//文章1
char art2[7000000];//文章2

clock_t s4, e4, s5, e5;
float ms4 = 0.0;
float ms5 = 0.0;

int main()
{
	int n;
	int i = 0;
	int j = 0;
	int flag;

#ifdef TIMEA
	clock_t start, end;
	start = clock();
	end = clock();
	printf("time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	//读入字典单词
	indic = fopen("dictionary.txt", "rb");
	fseek(indic, 0, SEEK_END);//移动文件流的读写位置
	int lengthdic = ftell(indic);//记录当前的读写位置，指向最后一个字符的下一个位置。在文本文件中，lendthdic直接输出可能没有实际意义
	fseek(indic, 0, SEEK_SET);//？
	fread(dic, lengthdic, 1, indic);//把indic文本中的字符全部读如dic数组中
	dic[lengthdic++] = 0;// \0的ascii码是0
	//printf("%s",dic);
#ifdef TIMEA
	end = clock();
	printf("111 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//读入停止单词
	instp = fopen("stopwords.txt", "rb");
	fseek(instp, 0, SEEK_END);
	int lengthstp = ftell(instp);
	fseek(instp, 0, SEEK_SET);
	fread(stp, lengthstp, 1, instp);
	stp[lengthstp++] = 0;
	//printf("%d",lengthstp);
#ifdef TIMEA
	end = clock();
	printf("222 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//读入第一篇文章
	in1 = fopen("article1.txt", "rb");
	fseek(in1, 0, SEEK_END);
	int length1 = ftell(in1);
	fseek(in1, 0, SEEK_SET);
	fread(art1, length1, 1, in1);
	art1[length1++] = 0;
	//printf("%s",art1);
	//printf("%d",length1);
#ifdef TIMEA
	end = clock();
	printf("333 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//读入第二篇文章
	in2 = fopen("article2.txt", "rb");
	fseek(in2, 0, SEEK_END);
	int length2 = ftell(in2);
	fseek(in2, 0, SEEK_SET);
	fread(art2, length2, 1, in2);
	art2[length2++] = 0;
	//printf("%d",length2);
#ifdef TIMEA
	end = clock();
	printf("444 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
	clock_t start1, end1, start2, end2, start3, end3;
	//m1：记录跳过所有dic中的换行符的时间
	//m2：记录把所有dic中的字母依次写到word单词中的时间
	//m3：记录把所有word中的单词依次写入字典树中的时间
	float ms1, ms2, ms3;
	ms1 = ms2 = ms3 = 0.0;
#endif // TIMEA
	// 一 建立关于dictionary的字典树 分三步
	out = fopen("results.txt", "w");
	TrieNodeptr rootdic = new TrieNode();
	for (i = 0; i < lengthdic; i++)
	{
#ifdef TIMEA
		start1 = clock();
#endif // TIMEA
		flag = 0;
		// 1 如果dic数组中的字符不是字母，就跳过；如果是\0，则结束for循环
		while ((dic[i] | 32) > 'z' || 'a' > (dic[i] | 32))//dic[i]|32表示把字母都转化成小写的
		{
			//当dic到了末尾，就记flag为1
			if (dic[i] == '\0')
			{
				flag = 1;
				break;
			}
			i++;
		}
#ifdef TIMEA
		end1 = clock();
		ms1 += (double)(end1 - start1) * 1000 / CLK_TCK;
#endif // TIMEA
		if (flag == 1) break;
		j = 0;
#ifdef TIMEA
		start2 = clock();
#endif // TIMEA
		// 2 如果字符在a到z之间，就逐记录整个单词到word中
		while ('a' <= (dic[i] | 32) && (dic[i] | 32) <= 'z')
		{
			word[j++] = dic[i] | 32;
			i++;
		}
		word[j] = '\0';
#ifdef TIMEA
		end2 = clock();
		ms2 += (double)(end2 - start2) * 1000 / CLK_TCK;
#endif // TIMEA

#ifdef TIMEA
		start3 = clock();
#endif // TIMEA
		//printf("%s\n",word);
		// 3 把刚找到的单词word，计入dictionary字典树中
		insertTrie(rootdic, word);
#ifdef TIMEA
		end3 = clock();
		ms3 += (double)(end3 - start3) * 1000 / CLK_TCK;
#endif // TIMEA
		//if(dic[i]=='\0') break;
		//printf("%s\n",word);
	}
	//检查dic字典树
	//TraverseTrie(rootdic);
#ifdef TIMEA
	//建立关于dictionary的字典树的每步的时间
	printf("444---inner time=%f,%f,%f\n", ms1, ms2, ms3);

#endif // TIMEA

#ifdef TIMEA
	end = clock();
	//构建dic字典树的整体时间
	printf("555 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	//根据stopwords文本，删除字典树rootdic中对应的单词
	for (i = 0; i < lengthstp; i++)
	{
		flag = 0;
		while ((stp[i] | 32) > 'z' || 'a' > (stp[i] | 32))
		{
			if (stp[i] == '\0')
			{
				flag = 1;
				break;
			}
			i++;
		}
		if (flag == 1) break;
		j = 0;
		while ('a' <= (stp[i] | 32) && (stp[i] | 32) <= 'z')
		{
			word[j++] = stp[i] | 32;
			i++;
		}
		word[j] = '\0';
		deleteTrie(rootdic, word);
		//if(stp[i]=='\0') break;
	}
	//检查stp字典树
	//TraverseTrie(rootdic);

	/* 二  对rootdic字典树的每个节点构建fail指针 */
	ac_build(rootdic);
	//检查
	//printf("\n%d\n",rootdic->next[1]->fail!=NULL?1:0);
#ifdef TIMEA
	end = clock();
	//删除字典树rootdic中stopwords单词，以及构建fail指针的时间【占主要部分】
	printf("666---构建fail指针 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	//rootdic字典树与article1匹配
	ac_query1(rootdic, art1);

#ifdef TIMEA
	end = clock();
	//rootdic字典树与article1匹配的时间
	printf("777---rootdic与article1匹配 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	//rootdic字典树与article2匹配
	ac_query2(rootdic, art2);
	//检查
	//TraverseTrie(rootdic);
#ifdef TIMEA
	end = clock();
	//rootdic字典树与article2匹配的时间
	printf("777---rootdic与article2匹配 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	
	//把article1、article2出现的所有单词和出现次数保存到words1[]、words2[]中的时间
	TraverseTrie1(rootdic);

	//for (i = 0; i < sum1; i++) printf("%s %d %d\n", words1[i].word, words1[i].num, words1[i].flag);
#ifdef TIMEA
	end = clock();
	//把article1、article2出现的所有单词和出现次数保存到words1[]、words2[]中的时间
	printf("777---把article1、article1出现的所有单词和出现次数保存到words1[]、words1[]中 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	
	//排words1.num
	radixSort1();

#ifdef TIMEA
	end = clock();
	//word1按单词出现频率排序的时间
	printf("888 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	//排words2.num
	radixSort2();

#ifdef TIMEA
	end = clock();
	printf("999 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//检查
	//for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
	scanf("%d", &n);
#ifdef TIMEA
	start = clock();
#endif // TIMEA
	//这里进行比较操作 得出m个值

	//root1 第一篇文章中的前n个高频词的字典树
	TrieNodeptr root1 = new TrieNode();
	for (i = 0; i < n; i++)
	{
		insertTrie(root1, words1[i].word);
	}
	//TraverseTrie(root1);
	//puts("");

#ifdef TIMEA
	end = clock();
	printf("aaa time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//root2 第二篇文章中的前n个高频词的字典树
	TrieNodeptr root2 = new TrieNode();
	for (i = 0; i < n; i++)
	{
		insertTrie(root2, words2[i].word);
	}
	//TraverseTrie(root2);
	//puts("");

#ifdef TIMEA
	end = clock();
	printf("bbb time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	/* 判断单词的cnt1和cnt2是否同时大于0即可 */
	//判断文章一的前n个高频单词在文章二的字典树中是否出现
	for (i = 0; i < n; i++)
	{
		if (searchTrie1(root2, words1[i].word))
		{
			words1[i].flag = 1;
		}
	}

#ifdef TIMEA
	end = clock();
	printf("ccc time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	//判断文章二的前n个高频单词在文章一的字典树中是否出现
	for (i = 0; i < n; i++)
	{
		if (searchTrie2(root1, words2[i].word))
		{
			words2[i].flag = 1;
		}
	}

#ifdef TIMEA
	end = clock();
	printf("ddd time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	//检查words1数组
	//for(i=0;i<n;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
	//for(i=0;i<n;i++) printf("%s %d %d\n",words2[i].word,words2[i].num,words2[i].flag);

	//计算pro1和pro2
	int freqm1 = 0;//目标出现次数
	int freqn1 = 0;
	int freqm2 = 0;
	int freqn2 = 0;
	//检查
	//for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
	for (i = 0; i < n; i++)
	{
		if (words1[i].flag == 1) freqm1 += words1[i].num;
		if (words2[i].flag == 1) freqm2 += words2[i].num;
		freqn1 += words1[i].num;
		freqn2 += words2[i].num;
	}
	double pro1 = (1.0)*freqm1 / freqn1;
	double pro2 = (1.0)*freqm2 / freqn2;
	double sim = (pro1 > pro2) ? (pro2 / pro1) : (pro1 / pro2);
	printf("%.5lf", sim);
	fprintf(out, "%.5lf\n\n", sim);

#ifdef TIMEA
	end = clock();
	//计算pro的时间
	printf("\neee time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	fputc('\n', out);
	for (i = 0; i < n; i++)
	{
		fprintf(out, "%s %d\n", words1[i].word, words1[i].num);
	}

#ifdef TIMEA
	end = clock();
	//文章一的高频词写入result文本的时间
	printf("fff time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	for (i = 0; i < n; i++)
	{
		fprintf(out, "%s %d\n", words2[i].word, words2[i].num);
	}

#ifdef TIMEA
	end = clock();
	//文章二的高频词写入result文本的时间
	printf("ggg time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	fclose(in1);
	fclose(in2);
	//fclose(indic);
	//fclose(instp);
	fclose(out);
}

/* 创建字典树的节点 */
//TrieNodeptr createTrieNode()
//{
//	TrieNodeptr tmp = (TrieNodeptr)malloc(sizeof(TrieNode));
//	tmp->nCount = 0;
//	int i;
//	for (i = 0; i < MAX; i++)
//	{
//		tmp->next[i] = NULL;
//	}
//	return tmp;
//}

/* 把word单词计入rootdic字典树中 */
void insertTrie(TrieNodeptr pRoot, char *str)
{
	TrieNodeptr tmp = pRoot;
	int i = 0, k;
	while (str[i] && str[i] != '\n'&&str[i] != '\r')
	{
		k = str[i] - 'a';
		if (tmp->next[k] == NULL)
		{
			tmp->next[k] = new TrieNode();
		}
		tmp = tmp->next[k];
		i++;
	}
	tmp->exist.push_back(strlen(word));//在单词的最后一个字母上，记录该单词的长度
}

/* 通过宽度遍历，构建fail指针 */
void ac_build(TrieNodeptr root)
{
	//构建一个存放字典树节点地址的队列
	queue<TrieNodeptr> q;
	//先把第一层节点的都指向root
	for (int i = 0; i < MAX; i++)
	{
		if (root->next[i])
		{
			root->next[i]->fail = root;
			q.push(root->next[i]);
		}
	}

	//!!!构建第二层及以下的节点的fail指针
	while (!q.empty())
	{
		TrieNodeptr x = q.front(); q.pop();
		for (int i = 0; i < MAX; i++)
		{
			//如果某个节点存在，表示有字母i+'a'
			if (x->next[i])
			{
				/* 寻找y的fail指针 */
				TrieNodeptr y = x->next[i], fafail = x->fail;

				//当节点y的父亲节点x的fail指针指向的【节点】存在
				//且【该节点】不存在和节点y存放的字母相同的节点
				//就不断重复地去寻找【该节点】的fail指针
				while (fafail && NULL == fafail->next[i])
					fafail = fafail->fail;
				//如果最终找到的【节点】为空，则令y节点指向root
				if (NULL == fafail)
					y->fail = root;
				else//否则，令y的fail指针指向父亲节点x的fail指针指向的节点的内容为i+'a'的儿子节点
					y->fail = fafail->next[i];

				//如果那个节点是某个【或某些】单词的末尾，把它们全都计入y的exist[]中
				if (y->fail->exist.size())
					for (int j = 0; j < y->fail->exist.size(); j++)
						y->exist.push_back(y->fail->exist[j]);
				q.push(y);
			}
		}
	}
}

/* 字典树与字符串匹配 */
void ac_query1(TrieNodeptr root, const char *T)
{
	TrieNodeptr tmp = root;
	for (int i = 0; i < strlen(T); i++)
	{
		int c = T[i] | 32;
		if (c > 'z' || 'a' > c) continue;//如果不是字母，就跳过
		c = c - 'a';
		//如果儿子节点不存在该字母，但该节点存在fail指针，则【不断地】去匹配fail指向的节点【找最长后缀】
		//直到找到某个节点它的儿子节点具有该字母，或tmp指向了root节点（root的fail为NULL），就不再继续
		while (NULL == tmp->next[c] && tmp->fail) {
			tmp = tmp->fail;
			//如果该节点可以构成单词，以该节点为底的单词数++
			if (tmp->exist.size())
				tmp->cnt1++;
		}

		//如果此时tmp指向的节点有【儿子节点】保存着这个字母，进入这个儿子节点
		//否则，匹配下一个字符
		if (tmp->next[c])
			tmp = tmp->next[c];
		else continue;
		//如果该节点可以构成单词，以该节点为底的单词数++
		if (tmp->exist.size())
			tmp->cnt1++;

		//这段代码得按以上的逻辑来写，虽然看起来有些别扭，但这样写，代码会简单很多

	}
}

void ac_query2(TrieNodeptr root, const char *T)
{
	TrieNodeptr tmp = root;
	for (int i = 0; i < strlen(T); i++)
	{
		int c = T[i] | 32;
		if (c > 'z' || 'a' > c) continue;//如果不是字母，就跳过
		c = c - 'a';

		//如果儿子节点不存在该字母，但该节点存在fail指针，则【不断地】去匹配fail指向的节点【找最长后缀】
		//直到找到某个节点它的儿子节点具有该字母，或tmp指向了root节点（root的fail为NULL），就不再继续
		while (NULL == tmp->next[c] && tmp->fail) {
			tmp = tmp->fail;
			if (tmp->exist.size())
				tmp->cnt2++;
		}
			

		//如果此时tmp指向的节点有【儿子节点】保存着这个字母，进入这个儿子节点
		//否则，匹配下一个字符
		if (tmp->next[c])
			tmp = tmp->next[c];
		else continue;
		//如果该节点可以构成单词，以该节点为底的单词数++
		if (tmp->exist.size())
			tmp->cnt2++;

		//这段代码得按以上的逻辑来写，虽然看起来有些别扭，但这样写，代码会简单很多

	}
}

/* 返回某个单词word在某棵字典树中的出现次数 */
int searchTrie1(TrieNode *root, char *str)
{
	if (root == NULL) return 0;
	TrieNode *tmp = root;
	int i = 0, k;
	while (str[i])//'\0'的ASCII码为0
	{
		k = str[i] - 'a';
		if (tmp->next[k]) tmp = tmp->next[k];
		else return 0;
		i++;
	}
	return tmp->cnt2;
}

/* 返回某个单词word在某棵字典树中的出现次数 */
int searchTrie2(TrieNode *root, char *str)
{
	if (root == NULL) return 0;
	TrieNode *tmp = root;
	int i = 0, k;
	while (str[i])//'\0'的ASCII码为0
	{
		k = str[i] - 'a';
		if (tmp->next[k]) tmp = tmp->next[k];
		else return 0;
		i++;
	}
	return tmp->cnt1;
}
//根据stopwords文本，删除字典树rootdic中对应的单词
void deleteTrie(TrieNodeptr root, char *str)
{
	int i, index;
	TrieNodeptr temp = root;
	for (i = 0; i < strlen(str); i++)
	{
		index = str[i] - 'a';
		if (temp->next[index] == NULL) return;
		temp = temp->next[index];
	}
	//temp->exist.pop_back();//？为什么是--，不是=0
	temp->exist.clear();
	return;
}
void swapchar(char *s1, char *s2)
{
	char *temp;
	temp = (char *)malloc(sizeof(char) * 50);
	strcpy(temp, s1);
	strcpy(s1, s2);
	strcpy(s2, temp);
	free(temp);
}
//把article1、article1出现的所有单词和出现次数保存到words1[]、words1[]中
void TraverseTrie1(TrieNodeptr root)
{
	int i;
	static char word[40] = { 0 };//全部置为'\0'
	static int j = 0;//因为是递归，所以这里设为静态
	if (root == NULL) return;
	//for循环里面递归,先j++,后j--  ————>回溯
	for (i = 0; i < 26; i++)
	{
		if (root->next[i] == NULL) continue;
		word[j++] = i + 'a';//这里j++
		if (root->next[i]->cnt1 > 0)//如果它已经构成了一个单词
		{
			word[j] = '\0';
			strcpy(words1[sum1].word, word);//把这个单词写入words1[]中
			words1[sum1++].num = root->next[i]->cnt1;//words1[]里的这个单词的数量也更新
		}
		if (root->next[i]->cnt2 > 0)
		{
			word[j] = '\0';
			strcpy(words2[sum2].word, word);
			words2[sum2++].num = root->next[i]->cnt2;
		}
		TraverseTrie1(root->next[i]);
		j--;//撤销操作
	}
}
//把article2出现的所有单词和出现次数保存到words2[]中
/*void TraverseTrie2(TrieNodeptr root)
{
	int i;
	static char word2[40] = { 0 };
	static int j2 = 0;
	if (root == NULL) return;
	for (i = 0; i < 26; i++)
	{
		if (root->next[i] == NULL) continue;
		word2[j2++] = i + 'a';
		if (root->next[i]->cnt2 > 0)
		{
			word2[j2] = '\0';
			strcpy(words2[sum2].word, word2);
			words2[sum2++].num = root->next[i]->cnt2;
		}
		TraverseTrie2(root->next[i]);
		j2--;
	}
}*/
//用于检查字典树的遍历函数
void TraverseTrie(TrieNodeptr root)
{
	int i;
	static char word[40] = { 0 };
	static int j = 0;
	if (root == NULL) return;
	for (i = 0; i < 26; i++)
	{
		if (root->next[i] == NULL) continue;
		word[j++] = i + 'a';
		if (root->next[i]->exist.size() > 0)
		{
			word[j] = '\0';
			printf("%s %d %d %d\n", word, root->next[i]->exist.size(),root->next[i]->cnt1, root->next[i]->cnt2);
		}
		TraverseTrie(root->next[i]);
		j--;
	}
}

void countSort1(int exp)
{
	int i, buckets[10] = { 0 };
	for (i = 0; i < sum1; i++)
	{
		buckets[(words1[i].num / exp) % 10]++;
	}//每个桶的的值
	swapnum(buckets[0], buckets[9]);
	swapnum(buckets[1], buckets[8]);
	swapnum(buckets[2], buckets[7]);
	swapnum(buckets[3], buckets[6]);
	swapnum(buckets[4], buckets[5]);
	for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序
	for (i = sum1 - 1; i >= 0; i--)
	{
		switchstru(&output1[buckets[9 - (words1[i].num / exp) % 10] - 1], &words1[i]);
		buckets[9 - (words1[i].num / exp) % 10]--;
	}
	//检查
	//for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num);
	for (i = 0; i < sum1; i++)
	{
		switchstru(&words1[i], &output1[i]);
	}
}
void countSort2(int exp)
{
	int i, buckets[10] = { 0 };
	for (i = 0; i < sum2; i++)
	{
		buckets[(words2[i].num / exp) % 10]++;
	}//每个桶的的值
	swapnum(buckets[0], buckets[9]);
	swapnum(buckets[1], buckets[8]);
	swapnum(buckets[2], buckets[7]);
	swapnum(buckets[3], buckets[6]);
	swapnum(buckets[4], buckets[5]);
	for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序
	for (i = sum2 - 1; i >= 0; i--)
	{
		switchstru(&output2[buckets[9 - (words2[i].num / exp) % 10] - 1], &words2[i]);
		buckets[9 - (words2[i].num / exp) % 10]--;
	}
	//检查
	//for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num);
	for (i = 0; i < sum2; i++)
	{
		switchstru(&words2[i], &output2[i]);
	}
}
void radixSort1()
{
	int exp;
	int max = 50000;
	for (exp = 1; max / exp > 0; exp *= 10) countSort1(exp);
}
void radixSort2()
{
	int exp;
	int max = 50000;
	for (exp = 1; max / exp > 0; exp *= 10) countSort2(exp);
}
void switchstru(Nodeptr a, Nodeptr b)
{
	*a = *b;
}

对比时间

由于我本人比较菜，写出的代码质量不高，改了以后导致花费时间上升了好几个数量级。最后的运行时间对比如下：第一张是加上ac自动机后的运行时间；第二张是原本只是用字典树的运行时间

只使用字典树

wang_xiaoniao

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
在利用字典树判断文章相似度的基础上，能否使用ac自动机来优化时间

目录字典树是怎么判断文章相似度的ac自动机是什么使用了ac自动机后会如何匹配添加了ac自动机的代码对比时间先说结论，我认为不行。字典树是怎么判断文章相似度的利用字典树判断两篇文章相似度时，大致思路是这样的。文章中的某个单词（注意，它是一个整体。这个单词中的任意一连续的部分如果在字典树中存在，都不计数）如果在字典树中出现，就令该单词在这篇文章中的出现次数+1。这样可得到每篇文章里各个单词出现的数量，然后判断高频单词是否类似即可。ac自动机是什么那么如果使用ac自动机的话，首先什么是ac自动机。ac自
复制链接

扫一扫