进阶实验5-3.3 基于词频的文件相似度

最新推荐文章于 2024-04-29 17:06:11 发布

嗯嗯p_q

最新推荐文章于 2024-04-29 17:06:11 发布

阅读量566

点赞数

本文链接：https://blog.csdn.net/weixin_51442864/article/details/114101915

版权

PTA 专栏收录该内容

18 篇文章

订阅专栏

博客介绍词频统计思路，选择将单词存到哈希表，文件存单词地址，对比时通过指针访问。还将每个文件指向单词的指针按字典序排序，两文件比对时按顺序依次比较，节省时间，另有细节在注释体现。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

题目链接
思路：既然要统计词频，那么可以把单词存入哈希表，在每个文件里只保存指向单词的指针，需要对比时直接通过指针访问该单词。此外，我把每个文件中指向单词的指针按单词的字典序排好序（代码中为降序），这样两个文件比对时可以按顺序依次比较，节省了一定的时间。其他需要处理的细节见代码注释。

#include<iostream>
#include<cstdio>
#include<cstring>
#include<vector>
#include<algorithm>
using namespace std;
// True when `a` is an ASCII letter ('a'..'z' or 'A'..'Z').
// Every use of the parameter is parenthesized so that an expression argument
// (for example a conditional expression) keeps its intended meaning.
// NOTE: the argument is still evaluated several times, so do not pass an
// expression with side effects (e.g. isok(*p++)).
#define isok(a) (((a)>='a'&&(a)<='z')||((a)>='A'&&(a)<='Z'))
// Number of buckets in the word hash table `e`; GetHash reduces hashes modulo this value.
constexpr int MAX = 500009;
// Size of the `Files` array. Files are stored at indices 1..N, so index 0 stays unused.
constexpr int max1 = 101;
// Number of files; set by init() from the first integer on stdin.
int N;
// One distinct word kept in the hash table.
struct Hash
{
	// Word text as a NUL-terminated C string: room for 10 characters plus the terminator.
	char words[11];
};
// One input file, represented by the words it contains.
struct file
{
	// Pointers to the hash-table entries of this file's words (init() stores each
	// word at most once and sorts the vector with cmp).
	vector<Hash*> W;
};
// Hash table of all known words: e[h] is the chain of entries whose GetHash value is h.
vector<Hash*> e[MAX];
// Per-file word lists; init() fills indices 1..N.
file Files[max1];
// Ordering predicate used to sort a file's word pointers by their text.
bool cmp(Hash* A, Hash* B);

// Lowercases the word in place and returns its bucket index in the hash table.
int GetHash(char* A);

// Prints the similarity of files p1 and p2 as a percentage.
void compare(int p1, int p2);

// Returns the hash-table entry for the word, creating it if necessary.
Hash* find_the_pointer(char* A);

// Tells whether file n does not yet contain the given word entry.
bool not_existed(int n, Hash* tmp);

// Reads N and the contents of all files from stdin.
void init();
// Program entry point: loads all files via init(), then reads the number of
// queries M followed by M pairs of file indices and prints the similarity of
// each pair with compare().
int main() {
	init();
	int M = 0;
	// Stop cleanly when the query count is missing or malformed instead of
	// looping on an uninitialized value.
	if (scanf("%d", &M) != 1)
		return 0;
	for (int i = 0; i < M; i++) {
		int t1, t2;
		// A truncated query list ends processing; never pass indeterminate
		// indices to compare().
		if (scanf("%d%d", &t1, &t2) != 2)
			break;
		compare(t1, t2);
	}
	return 0;
}
// Sort predicate for a file's word pointers: returns true when A's word compares
// greater than B's under strcmp, so sorting with it yields descending
// dictionary order.
bool cmp(Hash* A, Hash* B) {
	const int order = strcmp(A->words, B->words);
	return order > 0;
}
// Computes the hash-table bucket index of the NUL-terminated word A.
// Side effect: every uppercase ASCII letter in A is converted to lowercase in
// place, which makes later comparisons and ordering case-insensitive.
// Returns a value in [0, MAX).
int GetHash(char* A) {
	const unsigned int multiplier = 31;
	unsigned int acc = 0;
	for (char* p = A; *p != '\0'; ++p) {
		// Normalize the character first, then fold it into the polynomial hash.
		if (*p >= 'A' && *p <= 'Z')
			*p = *p - 'A' + 'a';
		acc = acc * multiplier + (*p) - 'a';
	}
	// Clear the top bit before reducing modulo the table size.
	return (acc & 0x7fffffff) % MAX;
}
void compare(int p1, int p2) {
	file* f1 = &Files[p1];
	file* f2 = &Files[p2];
	int cnt = 0, c;
	int s1 = f1->W.size(), s2 = f2->W.size();
	int i = 0, j = 0;
	while (i < s1 && j < s2) {//文件中单词的指针已按字典序排好,故可如此比较:相同则计数,两指针后移一位,否则字典序大者后移一位
		if ((c = strcmp(f1->W[i]->words, f2->W[j]->words)) == 0) {
			cnt++;
			i++;
			j++;
		}
		else if (c < 0)
			j++;
		else
			i++;
	}
	printf("%.1f%%\n", ((cnt * 1.0) / (s1 + s2 - cnt)) * 100);
}
// Reads the file count N and then the text of N files from stdin, filling
// Files[1..N]. Each file's text ends at a whitespace-delimited token that
// starts with '#'.
// A word is a maximal run of ASCII letters; runs shorter than 3 letters are
// ignored and only the first 10 letters of a longer run are kept. Each distinct
// word is stored once per file, and every file's list is finally sorted with cmp.
void init() {
	// Without a valid file count there is nothing to read.
	if (scanf("%d", &N) != 1)
		return;
	for (int i = 1; i <= N; i++) {
		char tmp[100];
		// "%99s" keeps a long token from overflowing tmp, and "== 1" ends the
		// loop at end of input instead of spinning on EOF. The leading space
		// skips whitespace, including the preceding newline.
		// NOTE: a token longer than 99 characters is consumed in several
		// pieces, so a word crossing a piece boundary is split.
		while (scanf(" %99s", tmp) == 1 && tmp[0] != '#') {
			int k = 0;
			while (tmp[k]) {// keep extracting words until the end of this token
				int j = 0;
				char Tmp[11];// stack buffer: nothing to free
				while (j != 10 && isok(tmp[k]))// copy at most 10 letters
					Tmp[j++] = tmp[k++];
				if (j > 2) {
					Tmp[j] = 0;
					Hash* w = find_the_pointer(Tmp);// shared hash-table entry for this word
					if (not_existed(i, w))
						Files[i].W.push_back(w);// record each word only once per file
				}
				if (j == 10) {// discard the letters beyond the first 10 of a long word
					while (tmp[k] != '\0' && isok(tmp[k]))
						k++;
				}
				while (tmp[k] != '\0' && !isok(tmp[k]))// skip separators up to the next letter
					k++;
			}
		}
		sort(Files[i].W.begin(), Files[i].W.end(), cmp);// order the file's words for compare()
	}
}
// Returns the hash-table entry holding word A, creating and registering a new
// entry when the word has not been seen before.
// A is lowercased in place by GetHash before it is compared or stored. It is
// copied with strcpy into Hash::words (11 bytes), so it must be at most 10
// characters long.
Hash* find_the_pointer(char* A) {
	vector<Hash*>& bucket = e[GetHash(A)];
	// Look for an existing entry with the same text in this bucket's chain.
	for (Hash* entry : bucket) {
		if (strcmp(entry->words, A) == 0)
			return entry;
	}
	// Not present yet: create the entry and append it to the chain.
	Hash* created = new Hash;
	strcpy(created->words, A);
	bucket.push_back(created);
	return created;
}
bool not_existed(int n, Hash* tmp) {
	int t = Files[n].W.size();
	for (int i = 0; i < t; i++) 
		if (Files[n].W[i] == tmp)
			return false;
	return true;
}