如何用字典树来判断两篇文章的相似性

问题描述

简单讲就是,给你两篇文章,问你这两篇文章是不是同一个人写的。1
具体内容如下图:
图一
图2

解题思路

把dictionary文本里的所有单词读入dic[]中,输出读入时间111
把stop words文本里的所有单词读入stp[]中,输出读入时间222
把artical1文本里的所有单词读入art1[]中,输出读入时间333
把artical2文本里的所有单词读入art2[]中,输出读入时间444

利用dic[],建立关于dictionary的字典树rootdic,(分三步)输出建立时间55
m1:记录跳过所有dic[]中的换行符的时间,并输出
m2:记录把所有dic[]中的字母依次写到word[]中的时间,并输出
m3:记录把所有word[]中的单词依次写入字典树rootdic中的时间,并输出
利用stp[],删除dictionary的字典树rootdic中对应的单词

构建第一篇文章的字典树rootwords1,并把其中所有的单词写入结构体数组words1[]中。输出删除、构建、写入的时间666
【在构建字典树的同时,记录单词出现的次数】
构建第二篇文章的字典树rootwords2,并把其中所有的单词写入结构体数组words2[]中。输出构建、写入的时间777

给words1中所有的单词按出现频率排序,并输出排序时间888
给words2中所有的单词按出现频率排序,并输出排序时间999

构建article1的前n个高频词的字典树root1,并输出构建时间aaa
构建article2的前n个高频词的字典树root2,并输出构建时间bbb

判断article1的前n个高频单词在article2的字典树中是否出现,并输出判断时间ccc
判断article2的前n个高频单词在article1的字典树中是否出现,并输出判断时间ddd

计算两篇文章的pro,并输出计算时间eee
article1的前n个高频词写入result文本的时间,并输出写入时间fff
article2的前n个高频词写入result文本的时间,并输出写入时间ggg

源代码

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>


#define MAX 26
#define swapnum(a,b) {int temp=a;a=b;b=temp;}


#define TIMEA


typedef struct TrieNode
{
	int nCount;
	struct TrieNode *next[MAX];
}TrieNode, *TrieNodeptr;//字典树的结构体


typedef struct Node
{
	int flag;//该单词在另一篇文章中是否出现
	int num;
	char word[50];
}Node, *Nodeptr;


Node words1[400000];//结构体数组 存article1的所有词
Node words2[400000];
TrieNodeptr createTrieNode();
void insertTrie(TrieNodeptr pRoot, char *str);
int searchTrie(TrieNodeptr root, char *str);
void deleteTrie(TrieNodeptr root, char *str);
int getword1();
int getword2();
void countSort1(int exp);
void countSort2(int exp);
void radixSort1();
void radixSort2();
void swapchar(char *s1, char *s2);
void TraverseTrie1(TrieNodeptr root);
void TraverseTrie2(TrieNodeptr root);
void TraverseTrie(TrieNodeptr root);//用于检查字典树的遍历函数
void switchstru(Nodeptr a, Nodeptr b);
int ReadFile(char *path, char data[]);


int sum1;//article1里所有不重复单词的数量
int sum2;
Node output1[400000];
Node output2[400000];
char word[50];//某个单词
FILE *indic, *instp, *in1, *in2;
FILE *out;
char dic[7000000];//保存字典里所有的单词
char stp[7000000];//
char art1[7000000];//文章1
char art2[7000000];//文章2

clock_t s4, e4, s5, e5;
float ms4 = 0.0;
float ms5 = 0.0;


int main()
{
	int n;
	int i = 0;
	int j = 0;
	int flag;


#ifdef TIMEA
	clock_t start, end;
	start = clock();
	end = clock();
	printf("time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
#endif // TIMEA
	//读入字典单词
	indic = fopen("dictionary.txt", "rb");
	fseek(indic, 0, SEEK_END);//移动文件流的读写位置
	int lengthdic = ftell(indic);//记录当前的读写位置,指向最后一个字符的下一个位置。在文本文件中,lendthdic直接输出可能没有实际意义
	fseek(indic, 0, SEEK_SET);//?
	fread(dic, lengthdic, 1, indic);//把indic文本中的字符全部读如dic数组中
	dic[lengthdic++] = 0;// \0的ascii码是0
	//printf("%s",dic);
#ifdef TIMEA
	end = clock();
	printf("111 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//读入停止单词
	instp = fopen("stopwords.txt", "rb");
	fseek(instp, 0, SEEK_END);
	int lengthstp = ftell(instp);
	fseek(instp, 0, SEEK_SET);
	fread(stp, lengthstp, 1, instp);
	stp[lengthstp++] = 0;
	//printf("%d",lengthstp);
#ifdef TIMEA
	end = clock();
	printf("222 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//读入第一篇文章
	in1 = fopen("article1.txt", "rb");
	fseek(in1, 0, SEEK_END);
	int length1 = ftell(in1);
	fseek(in1, 0, SEEK_SET);
	fread(art1, length1, 1, in1);
	art1[length1++] = 0;
	//printf("%s",art1);
	//printf("%d",length1);
#ifdef TIMEA
	end = clock();
	printf("333 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//读入第二篇文章
	in2 = fopen("article2.txt", "rb");
	fseek(in2, 0, SEEK_END);
	int length2 = ftell(in2);
	fseek(in2, 0, SEEK_SET);
	fread(art2, length2, 1, in2);
	art2[length2++] = 0;
	//printf("%d",length2);
#ifdef TIMEA
	end = clock();
	printf("444 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
	clock_t start1, end1, start2, end2, start3, end3;
	//m1:记录跳过所有dic中的换行符的时间
	//m2:记录把所有dic中的字母依次写到word单词中的时间
	//m3:记录把所有word中的单词依次写入字典树中的时间
	float ms1, ms2, ms3;
	ms1 = ms2 = ms3 = 0.0;
#endif // TIMEA
	//建立关于dictionary的字典树 分三步
	out = fopen("results.txt", "w");
	TrieNodeptr rootdic = createTrieNode();
	for (i = 0; i < lengthdic; i++)
	{
#ifdef TIMEA
		start1 = clock();
#endif // TIMEA
		flag = 0;
		// 1 如果dic[]中的字符不是字母,就跳过;如果是\0,则结束for循环
		while ((dic[i] | 32) > 'z' || 'a' > (dic[i] | 32))//dic[i]|32表示把字母都转化成小写的
		{
			if (dic[i] == '\0')
			{
				flag = 1;
				break;
			}
			i++;
		}
#ifdef TIMEA
		end1 = clock();
		ms1 += (double)(end1 - start1) * 1000 / CLK_TCK;
#endif // TIMEA
		if (flag == 1) break;
		j = 0;
#ifdef TIMEA
		start2 = clock();
#endif // TIMEA
		// 2 如果字符在a到z之间,就逐记录整个单词到word中
		while ('a' <= (dic[i] | 32) && (dic[i] | 32) <= 'z')
		{
			word[j++] = dic[i] | 32;
			i++;
		}
		word[j] = '\0';
#ifdef TIMEA
		end2 = clock();
		ms2 += (double)(end2 - start2) * 1000 / CLK_TCK;
#endif // TIMEA


#ifdef TIMEA
		start3 = clock();
#endif // TIMEA
		//printf("%s\n",word);
		// 3 把刚找到的单词word,计入dictionary字典树中
		insertTrie(rootdic, word);
#ifdef TIMEA
		end3 = clock();
		ms3 += (double)(end3 - start3) * 1000 / CLK_TCK;
#endif // TIMEA
		//if(dic[i]=='\0') break;
		//printf("%s\n",word);
	}
	//检查dic字典树
	//TraverseTrie(rootdic);
#ifdef TIMEA
	//建立关于dictionary的字典树的每步的时间
	printf("444---inner time=%f,%f,%f,%f,%f\n", ms1, ms2, ms3, ms4, ms5);


#endif // TIMEA


#ifdef TIMEA
	end = clock();
	//构建dic字典树的整体时间
	printf("555 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	//根据stopwords文本,删除字典树rootdic中对应的单词
	for (i = 0; i < lengthstp; i++)
	{
		flag = 0;
		while ((stp[i] | 32) > 'z' || 'a' > (stp[i] | 32))
		{
			if (stp[i] == '\0')
			{
				flag = 1;
				break;
			}
			i++;
		}
		if (flag == 1) break;
		j = 0;
		while ('a' <= (stp[i] | 32) && (stp[i] | 32) <= 'z')
		{
			word[j++] = stp[i] | 32;
			i++;
		}
		word[j] = '\0';
		deleteTrie(rootdic, word);
		//if(stp[i]=='\0') break;
	}
	//检查stp字典树
	//TraverseTrie(rootstp);


	TrieNodeptr rootwords1 = createTrieNode();//建立第一篇文章的字典树
	TrieNodeptr rootwords2 = createTrieNode();//建立第二篇文章的字典树

	//通过j来判断是否为单词末尾,是(j!=-1),不是(j==-1),如   这样。
	j = -1;//(这行我添加的,不然第一个单词无法正确读入word中)
	//构建第一篇文章的字典树的过程
	for (i = 0; i <= length1; i++)
	{
		//如果是小写字母,j++
		if (art1[i] >= 'a'&&art1[i] <= 'z') j++;
		//如果是大写字母,转化成小写字母
		else if (art1[i] >= 'A'&&art1[i] <= 'Z')
		{
			art1[i] |= 32;
			j++;
		}
		//如果不是字母,且为单词末尾,就把该位置的字符转化成\0,然后判断这个单词
		else if (j != -1)
		{
			j++;
			art1[i] = 0;
			strcpy(word, art1 + i - j);//strcpy当碰到str2的\0标识时就会停止复制
			if (searchTrie(rootdic, word))//判断单词word是否在rootdic中存在
			{
				insertTrie(rootwords1, word);
			}
			j = -1;
			//printf("%s\n",word);
		}
		//printf("%s\n",word);
	}
	//TraverseTrie(rootwords1);
	TraverseTrie1(rootwords1);


#ifdef TIMEA
	end = clock();
	//删除rootdic中stopwords出现的单词和建立第一篇文章字典树的整体时间
	printf("666 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA

	//构建第二篇文章的字典树的过程
	for (i = 0; i <= length2; i++)
	{
		if (art2[i] >= 'a'&&art2[i] <= 'z') j++;
		else if (art2[i] >= 'A'&&art2[i] <= 'Z')
		{
			art2[i] |= 32;
			j++;
		}
		else if (j != -1)
		{
			j++;
			art2[i] = 0;
			strcpy(word, art2 + i - j);
			if (searchTrie(rootdic, word))//在dic中出现 在stp中不出现
			{
				insertTrie(rootwords2, word);
			}
			j = -1;
			//printf("%s\n",word);
		}
		//printf("%s\n",word);
	}
	//TraverseTrie(rootwords2);
	TraverseTrie2(rootwords2);


#ifdef TIMEA
	end = clock();
	//建立第二篇文章的字典树的时间
	printf("777 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	//排words1.num
	radixSort1();


#ifdef TIMEA
	end = clock();
	//word1按单词出现频率排序的时间
	printf("888 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	//排words2.num
	radixSort2();


#ifdef TIMEA
	end = clock();
	printf("999 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//检查
	//for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
	scanf("%d", &n);
#ifdef TIMEA
	start = clock();
#endif // TIMEA
	//这里进行比较操作 得出m个值

	//root1 第一篇文章中的前n个高频词的字典树
	TrieNodeptr root1 = createTrieNode();
	for (i = 0; i < n; i++)
	{
		insertTrie(root1, words1[i].word);
	}
	//TraverseTrie(root1);
	//puts("");


#ifdef TIMEA
	end = clock();
	printf("aaa time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA
	//root2 第二篇文章中的前n个高频词的字典树
	TrieNodeptr root2 = createTrieNode();
	for (i = 0; i < n; i++)
	{
		insertTrie(root2, words2[i].word);
	}
	//TraverseTrie(root2);
	//puts("");


#ifdef TIMEA
	end = clock();
	printf("bbb time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	//判断文章一的前n个高频单词在文章二的字典树中是否出现
	for (i = 0; i < n; i++)
	{
		if (searchTrie(root2, words1[i].word))
		{
			words1[i].flag = 1;
		}
	}


#ifdef TIMEA
	end = clock();
	printf("ccc time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	//判断文章二的前n个高频单词在文章一的字典树中是否出现
	for (i = 0; i < n; i++)
	{
		if (searchTrie(root1, words2[i].word))
		{
			words2[i].flag = 1;
		}
	}


#ifdef TIMEA
	end = clock();
	printf("ddd time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	//检查words1数组
	//for(i=0;i<n;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
	//for(i=0;i<n;i++) printf("%s %d %d %d\n",words2[i].word,words2[i].num,words2[i].flag,i);

	//计算pro1和pro2
	int freqm1 = 0;//目标出现次数
	int freqn1 = 0;
	int freqm2 = 0;
	int freqn2 = 0;
	//检查
	//for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
	for (i = 0; i < n; i++)
	{
		if (words1[i].flag == 1) freqm1 += words1[i].num;
		if (words2[i].flag == 1) freqm2 += words2[i].num;
		freqn1 += words1[i].num;
		freqn2 += words2[i].num;
	}
	double pro1 = (1.0)*freqm1 / freqn1;
	double pro2 = (1.0)*freqm2 / freqn2;
	double sim = (pro1 > pro2) ? (pro2 / pro1) : (pro1 / pro2);
	printf("%.5lf", sim);
	fprintf(out, "%.5lf\n\n", sim);


#ifdef TIMEA
	end = clock();
	//计算pro的时间
	printf("\neee time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	fputc('\n', out);
	for (i = 0; i < n; i++)
	{
		fprintf(out, "%s %d\n", words1[i].word, words1[i].num);
	}


#ifdef TIMEA
	end = clock();
	//文章一的高频词写入result文本的时间
	printf("fff time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	for (i = 0; i < n; i++)
	{
		fprintf(out, "%s %d\n", words2[i].word, words2[i].num);
	}


#ifdef TIMEA
	end = clock();
	//文章二的高频词写入result文本的时间
	printf("ggg time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
	start = clock();
#endif // TIMEA


	fclose(in1);
	fclose(in2);
	//fclose(indic);
	//fclose(instp);
	fclose(out);
}


/* 创建字典树的节点 */
TrieNodeptr createTrieNode()
{
	TrieNodeptr tmp = (TrieNodeptr)malloc(sizeof(TrieNode));
	tmp->nCount = 0;
	int i;
	for (i = 0; i < MAX; i++)
	{
		tmp->next[i] = NULL;
	}
	return tmp;
}


/* 把word单词计入rootdic字典树中 */
void insertTrie(TrieNodeptr pRoot, char *str)
{
	TrieNodeptr tmp = pRoot;
	int i = 0, k;
	while (str[i] && str[i] != '\n'&&str[i] != '\r')
	{
		k = str[i] - 'a';
		if (tmp->next[k] == NULL)
		{
#ifdef TIMEA
			s4 = clock();
#endif // TIMEA
			tmp->next[k] = createTrieNode();
#ifdef TIMEA
			e4 = clock();
			ms4 += (double)(e4 - s4) * 1000 / CLK_TCK;
#endif // TIMEA
		}
		tmp = tmp->next[k];
		i++;
	}
#ifdef TIMEA
	s5 = clock();
#endif // TIMEA
	tmp->nCount++;//nCount 在单词的最后一个字母上+1,记录该单词出现的次数
#ifdef TIMEA
	e5 = clock();
	ms5 += (double)(e5 - s5) * 1000 / CLK_TCK;
#endif // TIMEA
}


/* 返回某个单词word在某棵字典树中的出现次数 */
int searchTrie(TrieNode *root, char *str)
{
	if (root == NULL) return 0;
	TrieNode *tmp = root;
	int i = 0, k;
	while (str[i])//'\0'的ASCII码为0
	{
		k = str[i] - 'a';
		if (tmp->next[k]) tmp = tmp->next[k];
		else return 0;
		i++;
	}
	return tmp->nCount;
}
void deleteTrie(TrieNodeptr root, char *str)
{
	int i, index;
	TrieNodeptr temp = root;
	for (i = 0; i < strlen(str); i++)
	{
		index = str[i] - 'a';
		if (temp->next[index] == NULL) return;
		temp = temp->next[index];
	}
	temp->nCount--;//?为什么是--,不是=0
	return;
}
void swapchar(char *s1, char *s2)
{
	char *temp;
	temp = (char *)malloc(sizeof(char) * 50);
	strcpy(temp, s1);
	strcpy(s1, s2);
	strcpy(s2, temp);
	free(temp);
}
//把article1、article1出现的所有单词和出现次数保存到words1[]、words1[]中
void TraverseTrie1(TrieNodeptr root)
{
	int i;
	static char word[40] = { 0 };//全部置为'\0'
	static int j = 0;//因为是递归,所以这里设为静态
	if (root == NULL) return;
	//for循环里面递归,先j++,后j--  ————>回溯
	for (i = 0; i < 26; i++)
	{
		if (root->next[i] == NULL) continue;
		word[j++] = i + 'a';//这里j++
		if (root->next[i]->cnt1 > 0)//如果它已经构成了一个单词
		{
			word[j] = '\0';
			strcpy(words1[sum1].word, word);//把这个单词写入words1[]中
			words1[sum1++].num = root->next[i]->cnt1;//words1[]里的这个单词的数量也更新
		}
		if (root->next[i]->cnt2 > 0)
		{
			word[j] = '\0';
			strcpy(words2[sum2].word, word);
			words2[sum2++].num = root->next[i]->cnt2;
		}
		TraverseTrie1(root->next[i]);
		j--;//撤销操作
	}
}
void TraverseTrie2(TrieNodeptr root)
{
	int i;
	static char word[40] = { 0 };
	static int j = 0;
	if (root == NULL) return;
	for (i = 0; i < 26; i++)
	{
		if (root->next[i] == NULL) continue;
		word[j++] = i + 'a';
		if (root->next[i]->nCount > 0)
		{
			word[j] = '\0';
			strcpy(words2[sum2].word, word);
			words2[sum2++].num = root->next[i]->nCount;
		}
		TraverseTrie2(root->next[i]);
		j--;
	}
}
//用于检查字典树的遍历函数
void TraverseTrie(TrieNodeptr root)
{
	int i;
	static char word[40] = { 0 };
	static int j = 0;
	if (root == NULL) return;
	for (i = 0; i < 26; i++)
	{
		if (root->next[i] == NULL) continue;
		word[j++] = i + 'a';
		if (root->next[i]->nCount > 0)
		{
			word[j] = '\0';
			printf("%s %d\n", word, root->next[i]->nCount);
		}
		TraverseTrie(root->next[i]);
		j--;
	}
}


void countSort1(int exp)
{
	int i, buckets[10] = { 0 };
	for (i = 0; i < sum1; i++)
	{
		buckets[(words1[i].num / exp) % 10]++;
	}//每个桶的的值
	swapnum(buckets[0], buckets[9]);
	swapnum(buckets[1], buckets[8]);
	swapnum(buckets[2], buckets[7]);
	swapnum(buckets[3], buckets[6]);
	swapnum(buckets[4], buckets[5]);
	for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序
	for (i = sum1 - 1; i >= 0; i--)
	{
		switchstru(&output1[buckets[9 - (words1[i].num / exp) % 10] - 1], &words1[i]);
		buckets[9 - (words1[i].num / exp) % 10]--;
	}
	//检查
	//for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num);
	for (i = 0; i < sum1; i++)
	{
		switchstru(&words1[i], &output1[i]);
	}
}
void countSort2(int exp)
{
	int i, buckets[10] = { 0 };
	for (i = 0; i < sum2; i++)
	{
		buckets[(words2[i].num / exp) % 10]++;
	}//每个桶的的值
	swapnum(buckets[0], buckets[9]);
	swapnum(buckets[1], buckets[8]);
	swapnum(buckets[2], buckets[7]);
	swapnum(buckets[3], buckets[6]);
	swapnum(buckets[4], buckets[5]);
	for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序
	for (i = sum2 - 1; i >= 0; i--)
	{
		switchstru(&output2[buckets[9 - (words2[i].num / exp) % 10] - 1], &words2[i]);
		buckets[9 - (words2[i].num / exp) % 10]--;
	}
	//检查
	//for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num);
	for (i = 0; i < sum2; i++)
	{
		switchstru(&words2[i], &output2[i]);
	}
}
void radixSort1()
{
	int exp;
	int max = 50000;
	for (exp = 1; max / exp > 0; exp *= 10) countSort1(exp);
}
void radixSort2()
{
	int exp;
	int max = 50000;
	for (exp = 1; max / exp > 0; exp *= 10) countSort2(exp);
}
void switchstru(Nodeptr a, Nodeptr b)
{
	*a = *b;
}


  1. 这是我研究生阶段导师布置的第一个任务,所以写的比较认真 😃。其实当时老师给了源代码,她的本意让我在字典树的基础上能否用ac自动机的方法做优化,我思忖了一番觉得不太行,于是认认真真对源码进行了解读。 ↩︎

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值