问题描述
简单讲就是,给你两篇文章,问你这两篇文章是不是同一个人写的。1
具体内容如下图:
解题思路
把dictionary文本里的所有单词读入dic[]中,输出读入时间111
把stop words文本里的所有单词读入stp[]中,输出读入时间222
把artical1文本里的所有单词读入art1[]中,输出读入时间333
把artical2文本里的所有单词读入art2[]中,输出读入时间444
利用dic[],建立关于dictionary的字典树rootdic,(分三步)输出建立时间55
m1:记录跳过所有dic[]中的换行符的时间,并输出
m2:记录把所有dic[]中的字母依次写到word[]中的时间,并输出
m3:记录把所有word[]中的单词依次写入字典树rootdic中的时间,并输出
利用stp[],删除dictionary的字典树rootdic中对应的单词
构建第一篇文章的字典树rootwords1,并把其中所有的单词写入结构体数组words1[]中。输出删除、构建、写入的时间666
【在构建字典树的同时,记录单词出现的次数】
构建第二篇文章的字典树rootwords2,并把其中所有的单词写入结构体数组words2[]中。输出构建、写入的时间777
给words1中所有的单词按出现频率排序,并输出排序时间888
给words2中所有的单词按出现频率排序,并输出排序时间999
构建article1的前n个高频词的字典树root1,并输出构建时间aaa
构建article2的前n个高频词的字典树root2,并输出构建时间bbb
判断article1的前n个高频单词在article2的字典树中是否出现,并输出判断时间ccc
判断article2的前n个高频单词在article1的字典树中是否出现,并输出判断时间ddd
计算两篇文章的pro,并输出计算时间eee
article1的前n个高频词写入result文本的时间,并输出写入时间fff
article2的前n个高频词写入result文本的时间,并输出写入时间ggg
源代码
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MAX 26
#define swapnum(a,b) {int temp=a;a=b;b=temp;}
#define TIMEA
typedef struct TrieNode
{
int nCount;
struct TrieNode *next[MAX];
}TrieNode, *TrieNodeptr;//字典树的结构体
typedef struct Node
{
int flag;//该单词在另一篇文章中是否出现
int num;
char word[50];
}Node, *Nodeptr;
Node words1[400000];//结构体数组 存article1的所有词
Node words2[400000];
TrieNodeptr createTrieNode();
void insertTrie(TrieNodeptr pRoot, char *str);
int searchTrie(TrieNodeptr root, char *str);
void deleteTrie(TrieNodeptr root, char *str);
int getword1();
int getword2();
void countSort1(int exp);
void countSort2(int exp);
void radixSort1();
void radixSort2();
void swapchar(char *s1, char *s2);
void TraverseTrie1(TrieNodeptr root);
void TraverseTrie2(TrieNodeptr root);
void TraverseTrie(TrieNodeptr root);//用于检查字典树的遍历函数
void switchstru(Nodeptr a, Nodeptr b);
int ReadFile(char *path, char data[]);
int sum1;//article1里所有不重复单词的数量
int sum2;
Node output1[400000];
Node output2[400000];
char word[50];//某个单词
FILE *indic, *instp, *in1, *in2;
FILE *out;
char dic[7000000];//保存字典里所有的单词
char stp[7000000];//
char art1[7000000];//文章1
char art2[7000000];//文章2
clock_t s4, e4, s5, e5;
float ms4 = 0.0;
float ms5 = 0.0;
int main()
{
int n;
int i = 0;
int j = 0;
int flag;
#ifdef TIMEA
clock_t start, end;
start = clock();
end = clock();
printf("time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
#endif // TIMEA
//读入字典单词
indic = fopen("dictionary.txt", "rb");
fseek(indic, 0, SEEK_END);//移动文件流的读写位置
int lengthdic = ftell(indic);//记录当前的读写位置,指向最后一个字符的下一个位置。在文本文件中,lendthdic直接输出可能没有实际意义
fseek(indic, 0, SEEK_SET);//?
fread(dic, lengthdic, 1, indic);//把indic文本中的字符全部读如dic数组中
dic[lengthdic++] = 0;// \0的ascii码是0
//printf("%s",dic);
#ifdef TIMEA
end = clock();
printf("111 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//读入停止单词
instp = fopen("stopwords.txt", "rb");
fseek(instp, 0, SEEK_END);
int lengthstp = ftell(instp);
fseek(instp, 0, SEEK_SET);
fread(stp, lengthstp, 1, instp);
stp[lengthstp++] = 0;
//printf("%d",lengthstp);
#ifdef TIMEA
end = clock();
printf("222 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//读入第一篇文章
in1 = fopen("article1.txt", "rb");
fseek(in1, 0, SEEK_END);
int length1 = ftell(in1);
fseek(in1, 0, SEEK_SET);
fread(art1, length1, 1, in1);
art1[length1++] = 0;
//printf("%s",art1);
//printf("%d",length1);
#ifdef TIMEA
end = clock();
printf("333 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//读入第二篇文章
in2 = fopen("article2.txt", "rb");
fseek(in2, 0, SEEK_END);
int length2 = ftell(in2);
fseek(in2, 0, SEEK_SET);
fread(art2, length2, 1, in2);
art2[length2++] = 0;
//printf("%d",length2);
#ifdef TIMEA
end = clock();
printf("444 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
clock_t start1, end1, start2, end2, start3, end3;
//m1:记录跳过所有dic中的换行符的时间
//m2:记录把所有dic中的字母依次写到word单词中的时间
//m3:记录把所有word中的单词依次写入字典树中的时间
float ms1, ms2, ms3;
ms1 = ms2 = ms3 = 0.0;
#endif // TIMEA
//建立关于dictionary的字典树 分三步
out = fopen("results.txt", "w");
TrieNodeptr rootdic = createTrieNode();
for (i = 0; i < lengthdic; i++)
{
#ifdef TIMEA
start1 = clock();
#endif // TIMEA
flag = 0;
// 1 如果dic[]中的字符不是字母,就跳过;如果是\0,则结束for循环
while ((dic[i] | 32) > 'z' || 'a' > (dic[i] | 32))//dic[i]|32表示把字母都转化成小写的
{
if (dic[i] == '\0')
{
flag = 1;
break;
}
i++;
}
#ifdef TIMEA
end1 = clock();
ms1 += (double)(end1 - start1) * 1000 / CLK_TCK;
#endif // TIMEA
if (flag == 1) break;
j = 0;
#ifdef TIMEA
start2 = clock();
#endif // TIMEA
// 2 如果字符在a到z之间,就逐记录整个单词到word中
while ('a' <= (dic[i] | 32) && (dic[i] | 32) <= 'z')
{
word[j++] = dic[i] | 32;
i++;
}
word[j] = '\0';
#ifdef TIMEA
end2 = clock();
ms2 += (double)(end2 - start2) * 1000 / CLK_TCK;
#endif // TIMEA
#ifdef TIMEA
start3 = clock();
#endif // TIMEA
//printf("%s\n",word);
// 3 把刚找到的单词word,计入dictionary字典树中
insertTrie(rootdic, word);
#ifdef TIMEA
end3 = clock();
ms3 += (double)(end3 - start3) * 1000 / CLK_TCK;
#endif // TIMEA
//if(dic[i]=='\0') break;
//printf("%s\n",word);
}
//检查dic字典树
//TraverseTrie(rootdic);
#ifdef TIMEA
//建立关于dictionary的字典树的每步的时间
printf("444---inner time=%f,%f,%f,%f,%f\n", ms1, ms2, ms3, ms4, ms5);
#endif // TIMEA
#ifdef TIMEA
end = clock();
//构建dic字典树的整体时间
printf("555 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//根据stopwords文本,删除字典树rootdic中对应的单词
for (i = 0; i < lengthstp; i++)
{
flag = 0;
while ((stp[i] | 32) > 'z' || 'a' > (stp[i] | 32))
{
if (stp[i] == '\0')
{
flag = 1;
break;
}
i++;
}
if (flag == 1) break;
j = 0;
while ('a' <= (stp[i] | 32) && (stp[i] | 32) <= 'z')
{
word[j++] = stp[i] | 32;
i++;
}
word[j] = '\0';
deleteTrie(rootdic, word);
//if(stp[i]=='\0') break;
}
//检查stp字典树
//TraverseTrie(rootstp);
TrieNodeptr rootwords1 = createTrieNode();//建立第一篇文章的字典树
TrieNodeptr rootwords2 = createTrieNode();//建立第二篇文章的字典树
//通过j来判断是否为单词末尾,是(j!=-1),不是(j==-1),如 这样。
j = -1;//(这行我添加的,不然第一个单词无法正确读入word中)
//构建第一篇文章的字典树的过程
for (i = 0; i <= length1; i++)
{
//如果是小写字母,j++
if (art1[i] >= 'a'&&art1[i] <= 'z') j++;
//如果是大写字母,转化成小写字母
else if (art1[i] >= 'A'&&art1[i] <= 'Z')
{
art1[i] |= 32;
j++;
}
//如果不是字母,且为单词末尾,就把该位置的字符转化成\0,然后判断这个单词
else if (j != -1)
{
j++;
art1[i] = 0;
strcpy(word, art1 + i - j);//strcpy当碰到str2的\0标识时就会停止复制
if (searchTrie(rootdic, word))//判断单词word是否在rootdic中存在
{
insertTrie(rootwords1, word);
}
j = -1;
//printf("%s\n",word);
}
//printf("%s\n",word);
}
//TraverseTrie(rootwords1);
TraverseTrie1(rootwords1);
#ifdef TIMEA
end = clock();
//删除rootdic中stopwords出现的单词和建立第一篇文章字典树的整体时间
printf("666 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//构建第二篇文章的字典树的过程
for (i = 0; i <= length2; i++)
{
if (art2[i] >= 'a'&&art2[i] <= 'z') j++;
else if (art2[i] >= 'A'&&art2[i] <= 'Z')
{
art2[i] |= 32;
j++;
}
else if (j != -1)
{
j++;
art2[i] = 0;
strcpy(word, art2 + i - j);
if (searchTrie(rootdic, word))//在dic中出现 在stp中不出现
{
insertTrie(rootwords2, word);
}
j = -1;
//printf("%s\n",word);
}
//printf("%s\n",word);
}
//TraverseTrie(rootwords2);
TraverseTrie2(rootwords2);
#ifdef TIMEA
end = clock();
//建立第二篇文章的字典树的时间
printf("777 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//排words1.num
radixSort1();
#ifdef TIMEA
end = clock();
//word1按单词出现频率排序的时间
printf("888 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//排words2.num
radixSort2();
#ifdef TIMEA
end = clock();
printf("999 time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//检查
//for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
scanf("%d", &n);
#ifdef TIMEA
start = clock();
#endif // TIMEA
//这里进行比较操作 得出m个值
//root1 第一篇文章中的前n个高频词的字典树
TrieNodeptr root1 = createTrieNode();
for (i = 0; i < n; i++)
{
insertTrie(root1, words1[i].word);
}
//TraverseTrie(root1);
//puts("");
#ifdef TIMEA
end = clock();
printf("aaa time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//root2 第二篇文章中的前n个高频词的字典树
TrieNodeptr root2 = createTrieNode();
for (i = 0; i < n; i++)
{
insertTrie(root2, words2[i].word);
}
//TraverseTrie(root2);
//puts("");
#ifdef TIMEA
end = clock();
printf("bbb time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//判断文章一的前n个高频单词在文章二的字典树中是否出现
for (i = 0; i < n; i++)
{
if (searchTrie(root2, words1[i].word))
{
words1[i].flag = 1;
}
}
#ifdef TIMEA
end = clock();
printf("ccc time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//判断文章二的前n个高频单词在文章一的字典树中是否出现
for (i = 0; i < n; i++)
{
if (searchTrie(root1, words2[i].word))
{
words2[i].flag = 1;
}
}
#ifdef TIMEA
end = clock();
printf("ddd time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
//检查words1数组
//for(i=0;i<n;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
//for(i=0;i<n;i++) printf("%s %d %d %d\n",words2[i].word,words2[i].num,words2[i].flag,i);
//计算pro1和pro2
int freqm1 = 0;//目标出现次数
int freqn1 = 0;
int freqm2 = 0;
int freqn2 = 0;
//检查
//for(i=0;i<sum1;i++) printf("%s %d %d\n",words1[i].word,words1[i].num,words1[i].flag);
for (i = 0; i < n; i++)
{
if (words1[i].flag == 1) freqm1 += words1[i].num;
if (words2[i].flag == 1) freqm2 += words2[i].num;
freqn1 += words1[i].num;
freqn2 += words2[i].num;
}
double pro1 = (1.0)*freqm1 / freqn1;
double pro2 = (1.0)*freqm2 / freqn2;
double sim = (pro1 > pro2) ? (pro2 / pro1) : (pro1 / pro2);
printf("%.5lf", sim);
fprintf(out, "%.5lf\n\n", sim);
#ifdef TIMEA
end = clock();
//计算pro的时间
printf("\neee time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
fputc('\n', out);
for (i = 0; i < n; i++)
{
fprintf(out, "%s %d\n", words1[i].word, words1[i].num);
}
#ifdef TIMEA
end = clock();
//文章一的高频词写入result文本的时间
printf("fff time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
for (i = 0; i < n; i++)
{
fprintf(out, "%s %d\n", words2[i].word, words2[i].num);
}
#ifdef TIMEA
end = clock();
//文章二的高频词写入result文本的时间
printf("ggg time=%f\n", (double)(end - start) * 1000 / CLK_TCK);
start = clock();
#endif // TIMEA
fclose(in1);
fclose(in2);
//fclose(indic);
//fclose(instp);
fclose(out);
}
/* 创建字典树的节点 */
TrieNodeptr createTrieNode()
{
TrieNodeptr tmp = (TrieNodeptr)malloc(sizeof(TrieNode));
tmp->nCount = 0;
int i;
for (i = 0; i < MAX; i++)
{
tmp->next[i] = NULL;
}
return tmp;
}
/* 把word单词计入rootdic字典树中 */
void insertTrie(TrieNodeptr pRoot, char *str)
{
TrieNodeptr tmp = pRoot;
int i = 0, k;
while (str[i] && str[i] != '\n'&&str[i] != '\r')
{
k = str[i] - 'a';
if (tmp->next[k] == NULL)
{
#ifdef TIMEA
s4 = clock();
#endif // TIMEA
tmp->next[k] = createTrieNode();
#ifdef TIMEA
e4 = clock();
ms4 += (double)(e4 - s4) * 1000 / CLK_TCK;
#endif // TIMEA
}
tmp = tmp->next[k];
i++;
}
#ifdef TIMEA
s5 = clock();
#endif // TIMEA
tmp->nCount++;//nCount 在单词的最后一个字母上+1,记录该单词出现的次数
#ifdef TIMEA
e5 = clock();
ms5 += (double)(e5 - s5) * 1000 / CLK_TCK;
#endif // TIMEA
}
/* 返回某个单词word在某棵字典树中的出现次数 */
int searchTrie(TrieNode *root, char *str)
{
if (root == NULL) return 0;
TrieNode *tmp = root;
int i = 0, k;
while (str[i])//'\0'的ASCII码为0
{
k = str[i] - 'a';
if (tmp->next[k]) tmp = tmp->next[k];
else return 0;
i++;
}
return tmp->nCount;
}
void deleteTrie(TrieNodeptr root, char *str)
{
int i, index;
TrieNodeptr temp = root;
for (i = 0; i < strlen(str); i++)
{
index = str[i] - 'a';
if (temp->next[index] == NULL) return;
temp = temp->next[index];
}
temp->nCount--;//?为什么是--,不是=0
return;
}
void swapchar(char *s1, char *s2)
{
char *temp;
temp = (char *)malloc(sizeof(char) * 50);
strcpy(temp, s1);
strcpy(s1, s2);
strcpy(s2, temp);
free(temp);
}
//把article1、article1出现的所有单词和出现次数保存到words1[]、words1[]中
void TraverseTrie1(TrieNodeptr root)
{
int i;
static char word[40] = { 0 };//全部置为'\0'
static int j = 0;//因为是递归,所以这里设为静态
if (root == NULL) return;
//for循环里面递归,先j++,后j-- ————>回溯
for (i = 0; i < 26; i++)
{
if (root->next[i] == NULL) continue;
word[j++] = i + 'a';//这里j++
if (root->next[i]->cnt1 > 0)//如果它已经构成了一个单词
{
word[j] = '\0';
strcpy(words1[sum1].word, word);//把这个单词写入words1[]中
words1[sum1++].num = root->next[i]->cnt1;//words1[]里的这个单词的数量也更新
}
if (root->next[i]->cnt2 > 0)
{
word[j] = '\0';
strcpy(words2[sum2].word, word);
words2[sum2++].num = root->next[i]->cnt2;
}
TraverseTrie1(root->next[i]);
j--;//撤销操作
}
}
void TraverseTrie2(TrieNodeptr root)
{
int i;
static char word[40] = { 0 };
static int j = 0;
if (root == NULL) return;
for (i = 0; i < 26; i++)
{
if (root->next[i] == NULL) continue;
word[j++] = i + 'a';
if (root->next[i]->nCount > 0)
{
word[j] = '\0';
strcpy(words2[sum2].word, word);
words2[sum2++].num = root->next[i]->nCount;
}
TraverseTrie2(root->next[i]);
j--;
}
}
//用于检查字典树的遍历函数
void TraverseTrie(TrieNodeptr root)
{
int i;
static char word[40] = { 0 };
static int j = 0;
if (root == NULL) return;
for (i = 0; i < 26; i++)
{
if (root->next[i] == NULL) continue;
word[j++] = i + 'a';
if (root->next[i]->nCount > 0)
{
word[j] = '\0';
printf("%s %d\n", word, root->next[i]->nCount);
}
TraverseTrie(root->next[i]);
j--;
}
}
void countSort1(int exp)
{
int i, buckets[10] = { 0 };
for (i = 0; i < sum1; i++)
{
buckets[(words1[i].num / exp) % 10]++;
}//每个桶的的值
swapnum(buckets[0], buckets[9]);
swapnum(buckets[1], buckets[8]);
swapnum(buckets[2], buckets[7]);
swapnum(buckets[3], buckets[6]);
swapnum(buckets[4], buckets[5]);
for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序
for (i = sum1 - 1; i >= 0; i--)
{
switchstru(&output1[buckets[9 - (words1[i].num / exp) % 10] - 1], &words1[i]);
buckets[9 - (words1[i].num / exp) % 10]--;
}
//检查
//for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num);
for (i = 0; i < sum1; i++)
{
switchstru(&words1[i], &output1[i]);
}
}
void countSort2(int exp)
{
int i, buckets[10] = { 0 };
for (i = 0; i < sum2; i++)
{
buckets[(words2[i].num / exp) % 10]++;
}//每个桶的的值
swapnum(buckets[0], buckets[9]);
swapnum(buckets[1], buckets[8]);
swapnum(buckets[2], buckets[7]);
swapnum(buckets[3], buckets[6]);
swapnum(buckets[4], buckets[5]);
for (i = 1; i < 10; i++) buckets[i] += buckets[i - 1];//每个数的排序
for (i = sum2 - 1; i >= 0; i--)
{
switchstru(&output2[buckets[9 - (words2[i].num / exp) % 10] - 1], &words2[i]);
buckets[9 - (words2[i].num / exp) % 10]--;
}
//检查
//for(i=0;i<sum1;i++) printf("%s %d\n",output1[i].word,output1[i].num);
for (i = 0; i < sum2; i++)
{
switchstru(&words2[i], &output2[i]);
}
}
void radixSort1()
{
int exp;
int max = 50000;
for (exp = 1; max / exp > 0; exp *= 10) countSort1(exp);
}
void radixSort2()
{
int exp;
int max = 50000;
for (exp = 1; max / exp > 0; exp *= 10) countSort2(exp);
}
void switchstru(Nodeptr a, Nodeptr b)
{
*a = *b;
}
这是我研究生阶段导师布置的第一个任务,所以写的比较认真 😃。其实当时老师给了源代码,她的本意让我在字典树的基础上能否用ac自动机的方法做优化,我思忖了一番觉得不太行,于是认认真真对源码进行了解读。 ↩︎