实验作者:
本次实验代码由其他两位助教编写,有疑问请咨询周恒森助教和夏良伟助教。
实验描述
实验代码
/*相似度定义为段落2中每一句与段落1中所有句子对比,取最长公共子串长度,求和后除以段落2总长度*/
#include <stdio.h>
#include <string.h>

#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
using namespace std;
// Number of slots in each fixed-size array of `result`.
#define MAX 100

// Output of parsing one text file with read().
struct result {
    // Text fragments assembled by read(); slot k holds the k-th fragment.
    std::string file[MAX];
    // Number of leading slots of `file` that read() filled in.
    int length{0};
    // Per-slot counters maintained by read() while it scans the file.
    int a[MAX]{};
};
/**
 * Returns the longest common substring (a contiguous run of characters)
 * shared by str1 and str2.
 *
 * The result is always taken from str1. When several common substrings have
 * the same maximal length, the one whose end comes first in str1 wins.
 *
 * @param str1 String the result is cut from.
 * @param str2 String to compare against.
 * @return The longest common substring, or an empty string when the inputs
 *         share no character or either input is empty.
 */
std::string getLCS(std::string str1, std::string str2) {
    // Nothing can match an empty string. Without this guard the function
    // would end up evaluating str1.substr(1, 0), which throws
    // std::out_of_range when str1 is empty.
    if (str1.empty() || str2.empty()) {
        return std::string();
    }

    // previous[j] / current[j] hold the length of the common suffix of
    // str1[0..i) and str2[0..j). Only the previous row is ever consulted, so
    // two rows are enough; index 0 is a permanent zero border that removes
    // the special cases for the first row and column.
    std::vector<std::size_t> previous(str2.size() + 1, 0);
    std::vector<std::size_t> current(str2.size() + 1, 0);

    std::size_t bestLength = 0;
    std::size_t bestEnd = 0;  // one past the last matched index in str1

    for (std::size_t i = 1; i <= str1.size(); ++i) {
        for (std::size_t j = 1; j <= str2.size(); ++j) {
            if (str1[i - 1] == str2[j - 1]) {
                current[j] = previous[j - 1] + 1;
                // Strict '>' keeps the earliest match among equal lengths.
                if (current[j] > bestLength) {
                    bestLength = current[j];
                    // Recording the position in str1 means the result is cut
                    // from str1 rather than from str2.
                    bestEnd = i;
                }
            } else {
                current[j] = 0;
            }
        }
        previous.swap(current);
    }

    return str1.substr(bestEnd - bestLength, bestLength);
}
/**
 * Reads a text file and splits its contents into sentences.
 *
 * The file is scanned one character at a time. A "word" is a run of
 * characters up to and including the next ' ' or '\n' (the delimiter stays
 * attached to the word). A sentence ends with the first word that contains a
 * '.'. Text remaining after the last such word becomes one final sentence,
 * unless it consists of whitespace only.
 *
 * On return, for every k in [0, length):
 *   - file[k] is the text of sentence k,
 *   - a[k]    is the number of words in sentence k.
 *
 * Behaviour notes:
 *   - If the file cannot be opened, an error is written to std::cerr and an
 *     empty result (length == 0) is returned.
 *   - Files containing newlines, files without any '.', and files with text
 *     after the last '.' are handled; the earlier word-count bookkeeping ran
 *     past the end of the arrays on such input.
 *   - A word containing several dots (e.g. "e.g.") ends one sentence; it does
 *     not produce empty sentences.
 *   - At most as many sentences as the arrays in `result` can hold are
 *     stored; anything beyond that is dropped with a warning on std::cerr.
 *
 * @param filename Path of the file to read.
 * @return A heap-allocated result that the caller owns and must delete.
 *         Never null.
 */
result * read(std::string filename)
{
    result *test = new result;

    // Take the capacity from the struct's own storage so the bound used below
    // always matches the arrays being written.
    const int capacity = static_cast<int>(sizeof(test->a) / sizeof(test->a[0]));

    // Kept for compatibility: this clears skipws on std::cin, not on the file
    // stream. The file is read with unformatted get(), so parsing does not
    // depend on it; it only preserves the std::cin state callers saw before.
    std::cin.unsetf(std::ios::skipws);

    std::ifstream f(filename);
    if (!f) {
        std::cerr << "read: cannot open file '" << filename << "'" << std::endl;
        return test;
    }

    std::string sentence;      // text of the sentence being collected
    int wordsInSentence = 0;   // completed words in `sentence`
    bool wordOpen = false;     // characters seen since the last delimiter
    bool wordHasDot = false;   // the word being collected contains a '.'

    // Moves the collected sentence into the next free slot.
    // Returns false when there is no free slot left.
    const auto storeSentence = [&]() -> bool {
        if (test->length >= capacity) {
            return false;
        }
        test->file[test->length] = sentence;
        test->a[test->length] = wordsInSentence;
        ++test->length;
        sentence.clear();
        wordsInSentence = 0;
        return true;
    };

    bool stored = true;
    char c;
    // Looping on get() itself ends cleanly on EOF and on read errors alike.
    while (stored && f.get(c)) {
        sentence += c;
        if (c == '.') {
            wordHasDot = true;
        }
        if (c == ' ' || c == '\n') {
            // The delimiter closes the current word.
            ++wordsInSentence;
            wordOpen = false;
            if (wordHasDot) {
                stored = storeSentence();
                wordHasDot = false;
            }
        } else {
            wordOpen = true;
        }
    }

    // Text after the last sentence-ending word (including a last word that
    // ends at EOF without a delimiter) forms the final sentence.
    if (stored && sentence.find_first_not_of(" \t\r\n") != std::string::npos) {
        if (wordOpen) {
            ++wordsInSentence;
        }
        stored = storeSentence();
    }

    if (!stored) {
        std::cerr << "read: '" << filename << "' has more than " << capacity
                  << " sentences; the rest is ignored" << std::endl;
    }
    return test;
}
int main()
{
result * t1;
result * t2;
string frist = "file1.txt";
string second = "file2.txt";
t1 = new result;
t1 = read(frist);
t2 = new result;
t2 = read(second);
int sum = 0, temp = 0, sum1 = 0;
string tt;
for (int i = 0; i < t2->length; i++)
{
for (int j = 0; j < t1->length; j++)
{
tt = getLCS(t2->file[i], t1->file[j]);
if (temp < tt.length())
temp = tt.length();
}
//cout << tt << tt.length() << temp << endl;
sum = sum + t2->file[i].length();
sum1 = sum1 + temp;
temp = 0;
}
float sim;
sim = (float)sum1 / sum;
cout << "文本总长度" << sum << endl;
cout << "重复文本总长度" << sum1 << endl;
cout << "相似度:" << sim << endl;
getchar();
return 0;
}