本文给出了基于余弦距离的文本相似度计算的C++实现,主要要点如下:
1、对1998年1月的人民日报所有文章进行预处理(语料文件已经做好分词),然后进行去噪声、去停用词等操作。
2、对处理好的数据进行余弦计算,并存储为相应的数据结构。
3、输出前N篇最相似的文章
下面分别介绍各个模块的实现。
Statistics.cpp:对预处理后的文件统计词频。
#pragma once
#include "TextSimilarity.h"
#include <string>     // std::getline(istream&, string&)
#include <windows.h>
// Tokenize one article, drop stop words, count term frequencies, and store
// the (word -> count) pairs plus the vector length sqrt(sum(tf^2)) into
// SigleArtile for the later cosine computation.
//
// Tokenizing rule (from the pre-segmented corpus): words are runs of
// non-ASCII bytes (multi-byte Chinese characters); plain ASCII bytes act
// as separators/annotations and are skipped.
void ContentStatistics(string& ArticleContent,Article& SigleArtile)
{
	//1. Split the content into a word list.
	list<string> Wordlist;
	size_t offset = 0;
	const size_t over = ArticleContent.size();
	// FIX: use '<' instead of '!=' so an overshoot can never loop forever.
	while(offset < over)
	{
		if(ArticleContent[offset] > 0 && ArticleContent[offset] < 127)
		{
			offset++; // ASCII byte: separator, skip it
		}
		else
		{
			const size_t begin = offset;
			// FIX: bounds check added — the original inner loop could
			// index past the end of the string when a word reached the
			// final byte of the content.
			while(offset < over
				&& !(ArticleContent[offset] > 0 && ArticleContent[offset] < 127)
				&& ArticleContent[offset] != ' ')
			{
				offset++;
			}
			Wordlist.push_back(ArticleContent.substr(begin, offset - begin));
		}
	}

	//2. Remove stop words.
	// FIX: loop on getline() success instead of testing eof() first (the
	// old pattern could push a stale last entry), and read into a string
	// instead of a 20-byte new[] buffer that was never deleted (leak) and
	// truncated longer stop words.
	fstream FStopWord;
	FStopWord = FOpen("stopword.txt");
	string StopLine;
	while(getline(FStopWord, StopLine))
	{
		if(!StopLine.empty())
		{
			// list::remove erases every occurrence in one pass.
			Wordlist.remove(StopLine);
		}
	}
	FStopWord.close();

	//3. Count term frequencies.
	// FIX: O(n log n) map counting replaces the original O(n^2)
	// find/erase scan; the resulting (word, count) pairs are identical.
	map<string, size_t> Counts;
	for(list<string>::iterator WordIter = Wordlist.begin();
		WordIter != Wordlist.end(); ++WordIter)
	{
		++Counts[*WordIter];
	}

	double SumCount = 0.0;
	for(map<string, size_t>::iterator CountIter = Counts.begin();
		CountIter != Counts.end(); ++CountIter)
	{
		// Accumulate sum of squared frequencies for the vector length
		// (the denominator of the cosine similarity).
		SumCount += static_cast<double>(CountIter->second) * CountIter->second;
		SigleArtile.SetWordList(CountIter->first, CountIter->second);
	}
	SumCount = sqrt(SumCount);
	SigleArtile.SetVectorLength(SumCount);
}
ProProcessing.cpp:按文章切分语料并统计词频,其中bb.txt为中间结果文件。
void ProProcessing(list<Article>& ArticleList)
{
fstream ArticleFile;
const size_t MaxLength = 10240;
string ArticleHead = "";
string ArticleContent = "";
char* C_temp = new char[MaxLength];
string S_temp;
map<string,size_t> Word;
bool FirstHead = true;
fstream save;
save = FOpen("bb.txt");
Article SigleArtile;
ArticleFile = FOpen("199801.txt");
while(!ArticleFile.eof())
{
//每一篇文章
ArticleFile.getline(C_temp,MaxLength,'\n');
S_temp = C_temp;
//文章内
if(S_temp.size() != 0)
{
if(FirstHead)
{
//得到文章标题
ArticleHead = S_temp.substr(0,15);
ArticleHead.append(1,'\n');
SigleArtile.SetArticleDate(ArticleHead);
FirstHead = false;
}
//正文开始
S_temp.replace(0,23,1,' ');
//正文链接
ArticleContent.append(S_temp);
}
else//有空格,一篇读完
{
ArticleContent.append("\n");
//内容统计
SigleArtile.DelWordList();
ContentStatistics(ArticleContent, SigleArtile);
map<string,size_t>::iterator mapiter;
for( mapiter = SigleArtile.GetWordList().begin() ;mapiter != SigleArtile.GetWordList().end();mapiter++)
{
save << (*mapiter).first;
save << ":";
save << (*mapiter).second;
save << "|";
}
save << '\n';
//存储文章
ArticleList.push_back(SigleArtile);
//下一篇文章
FirstHead = true;
ArticleContent.clear();
}
}
//销毁字符
save.close();
delete[] C_temp;
ArticleFile.close();
}
数据以文件的形式存储,最后基于预处理好的文件进行余弦距离运算。计算时,若某词只出现在两篇文档中的一篇里,则跳过该词,保证只对两篇文档共有的词进行运算。另外,预先计算好每篇文章的向量长度,以提高运算速度。
TextComputing.cpp
https://blog.csdn.net/fy2462/article/details/31770541