2020秋数据结构实验第三题:文本相似度比较 C++实现

实验作者:

本次实验代码由其他两位助教提供,有疑问请咨询周恒森助教和夏良伟助教

实验描述

实验代码

/*相似度定义为段落2中每一句与段落1中所有句子对比,取最长公共子串长度,求和后除以段落2总长度*/
#include <stdio.h>
#include <string.h>

#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
using namespace std;

// Capacity of the fixed-size arrays in `result`: the largest number of
// sentences one text can hold. A typed constant instead of a macro so it
// obeys scoping rules and shows up in the debugger.
constexpr int MAX = 100;

// Sentence table built by read() for one input text.
struct result {
	std::string file[MAX];  // file[i] holds the text of the i-th sentence
	int length = 0;         // number of sentences stored in file[]
	int a[MAX] = {};        // a[i] is the word counter kept for sentence i while reading
};

/*
 * Returns the longest common substring (contiguous run of characters) of
 * str1 and str2.
 *
 * The result is always taken from str1. When several common substrings share
 * the maximal length, the one whose end is reached first while scanning str1
 * from left to right is returned. An empty string is returned when either
 * input is empty or the two strings share no character.
 */
std::string getLCS(const std::string& str1, const std::string& str2) {
	// Guard: the DP below needs at least one character on each side, and an
	// empty str1 has no valid substring position other than 0.
	if (str1.empty() || str2.empty())
		return std::string();

	const std::size_t len1 = str1.length();
	const std::size_t len2 = str2.length();

	// prev[j] / cur[j] = length of the common suffix of str1[0..i-1) /
	// str1[0..i) and str2[0..j). Only the previous row is ever consulted, so
	// two rows replace the full len1 x len2 table. Index 0 is a permanent
	// zero sentinel that removes the i == 0 / j == 0 special cases.
	std::vector<std::size_t> prev(len2 + 1, 0);
	std::vector<std::size_t> cur(len2 + 1, 0);

	std::size_t maxLen = 0;
	std::size_t maxEnd = 0;  // one past the last matched index in str1

	for (std::size_t i = 1; i <= len1; ++i) {
		for (std::size_t j = 1; j <= len2; ++j) {
			if (str1[i - 1] == str2[j - 1]) {
				cur[j] = prev[j - 1] + 1;
				if (cur[j] > maxLen) {
					maxLen = cur[j];
					maxEnd = i;  // track the position in str1 so the result is a substring of str1
				}
			}
			else {
				cur[j] = 0;
			}
		}
		prev.swap(cur);
	}
	// maxEnd >= maxLen always holds, so the start position never underflows
	// (with no match this is substr(0, 0), i.e. the empty string).
	return str1.substr(maxEnd - maxLen, maxLen);
}

result * read(string filename)
{
	result *test = new result;
	fstream f(filename);
	cin.unsetf(ios::skipws);
	char c;
	vector<char>character;
	vector<string> words;
	vector<vector<string>>paragraph;
	int begin = 0, end = 0, ai = 0;
	string word;
	while (!f.eof())
	{
		f.get(c);
		if (f.eof()) {
			words.push_back(word);
			word = "";
			break;
		}
		character.push_back(c);
		word += c;
		if (c == ' ') {
			words.push_back(word);
			word = "";
			test->a[ai]++;
		}
		if (c == '.')
		{
			test->a[ai]++;
			ai++;
			test->a[ai]--;
		}
		if (c == '\n') {
			words.push_back(word);
			word = "";
			end = words.size();
			vector<string>cmp;
			for (int i = begin; i < end; i++)
				cmp.push_back(words[i]);
			paragraph.push_back(cmp);
			begin = end;
		}
	}
	end = words.size();
	vector<string>cmp;
	for (int i = begin; i < end; i++)
		cmp.push_back(words[i]);
	paragraph.push_back(cmp);
	//for (int i = 0; i < words.size(); i++)
	//{
	//	cout << words[i] << endl;
	//}
	//for (int i = 0; i < paragraph.size(); i++)
	//	for (int j = 0; j < paragraph[i].size(); j++)
	//		cout << paragraph[i][j];
	//cout << endl;
	//cout << "words:" << words.size() << endl;
	//cout << "paragraphs:" << paragraph.size() << endl;
	int i = 0, j = 0, sum = 0;
	for (j = 0; sum < words.size(); j++)
	{
		for (i = 0; i < test->a[j]; i++) {
			test->file[j] = test->file[j] + words[i + sum];
		}
		sum = sum + test->a[j];
	}
	test->length = j;
	//	cout << j << endl;
	return test;
}

int main()
{
	result * t1;
	result * t2;
	string frist = "file1.txt";
	string second = "file2.txt";
	t1 = new result;
	t1 = read(frist);
	t2 = new result;
	t2 = read(second);
	int sum = 0, temp = 0, sum1 = 0;
	string tt;
	for (int i = 0; i < t2->length; i++)
	{
		for (int j = 0; j < t1->length; j++)
		{
			tt = getLCS(t2->file[i], t1->file[j]);
			if (temp < tt.length())
				temp = tt.length();
		}
		//cout << tt << tt.length() << temp << endl;
		sum = sum + t2->file[i].length();
		sum1 = sum1 + temp;
		temp = 0;
	}
	float sim;
	sim = (float)sum1 / sum;
	cout << "文本总长度" << sum << endl;
	cout << "重复文本总长度" << sum1 << endl;
	cout << "相似度:" << sim << endl;
	getchar();
	return 0;
}

实验结果

  • 2
    点赞
  • 30
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 9
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 9
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

USTC暖暖

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值