本文给出了基于余弦距离的文本相似度算法的C++实现,主要要点如下:
1、对1998年1月的人民日报所有文章进行预处理(其中文件已经预先完成分词),然后进行去噪声、去停用词等操作。
2、对处理好的数据进行余弦计算,并存储为相应的数据结构。
3、输出前N篇最相似的文章
下面介绍各部分的具体实现。
Statistics.cpp,对预处理文件进行统计词频。
1 #pragma once 2 #include "TextSimilarity.h" 3 #include <windows.h> 4 5 6 void ContentStatistics(string& ArticleContent,Article& SigleArtile) 7 { 8 //1.把词分割放入一个list中 9 //2.用停用词删除list中的元素 10 //3.遍历list,记住首词,统计出现次数,重复删掉 11 // 词和次数放入SingleArtile的对象中的map<string,size_t> WordList中 12 13 list<string> Wordlist; 14 list<string>::iterator WordIter; 15 16 size_t offset = 0; 17 size_t begin = 0; 18 size_t over = ArticleContent.size(); 19 size_t end = 0; 20 string temp; 21 22 while(offset != over) 23 { 24 if(ArticleContent[offset]>0 && ArticleContent[offset] < 127) 25 { 26 offset++; 27 } 28 else 29 { 30 begin = offset; 31 while(!(ArticleContent[offset]>0 && ArticleContent[offset] < 127) 32 && (ArticleContent[offset] != ' ')) 33 { 34 offset++; 35 } 36 //end = offset - 1; 37 temp = ArticleContent.substr(begin,offset-begin); 38 Wordlist.push_back(temp); 39 } 40 } 41 42 //2. 43 fstream FStopWord; 44 FStopWord = FOpen("stopword.txt"); 45 vector<string> StopWord; 46 vector<string>::iterator StopWordIter; 47 char* C_temp = new char[20]; 48 //存储停用词 49 while(!FStopWord.eof()) 50 { 51 FStopWord.getline(C_temp,20,'\n'); 52 StopWord.push_back(C_temp); 53 } 54 FStopWord.close(); 55 // 56 for(StopWordIter = StopWord.begin(); StopWordIter != StopWord.end();StopWordIter++ ) 57 { 58 WordIter = Wordlist.begin(); 59 while((WordIter = find(WordIter,Wordlist.end(),(*StopWordIter))) != Wordlist.end()) 60 { 61 WordIter = Wordlist.erase(WordIter); 62 } 63 } 64 65 66 //3.遍历list,记住首词,统计出现次数,重复删掉 67 // 词和次数放入SingleArtile的对象中的map<string,size_t> WordList中 68 size_t WordCount; 69 list<string>::iterator RearIter; 70 string TheWord; 71 double SumCount = 0.0; 72 for (WordIter = Wordlist.begin();WordIter != Wordlist.end();++WordIter) 73 { 74 TheWord = *WordIter; 75 WordCount = 0; 76 RearIter = WordIter; 77 78 while((RearIter = find(RearIter,Wordlist.end(),TheWord)) != Wordlist.end()) 79 { 80 if(WordCount >= 1) 81 { 82 RearIter = Wordlist.erase(RearIter); 83 } 84 else 85 { 86 RearIter++; 87 } 88 WordCount++; 89 
} 90 //求和 91 SumCount += WordCount*WordCount; 92 SigleArtile.SetWordList(TheWord,WordCount); 93 } 94 SumCount = sqrt(SumCount); 95 SigleArtile.SetVectorLength(SumCount); 96 }
ProProcessing.cpp,其中bb.txt为中间处理结果文件。
1 void ProProcessing(list<Article>& ArticleList) 2 { 3 fstream ArticleFile; 4 const size_t MaxLength = 10240; 5 string ArticleHead = ""; 6 string ArticleContent = ""; 7 char* C_temp = new char[MaxLength]; 8 string S_temp; 9 map<string,size_t> Word; 10 bool FirstHead = true; 11 12 fstream save; 13 save = FOpen("bb.txt"); 14 Article SigleArtile; 15 ArticleFile = FOpen("199801.txt"); 16 while(!ArticleFile.eof()) 17 { 18 //每一篇文章 19 ArticleFile.getline(C_temp,MaxLength,'\n'); 20 S_temp = C_temp; 21 //文章内 22 if(S_temp.size() != 0) 23 { 24 if(FirstHead) 25 { 26 //得到文章标题 27 ArticleHead = S_temp.substr(0,15); 28 ArticleHead.append(1,'\n'); 29 SigleArtile.SetArticleDate(ArticleHead); 30 FirstHead = false; 31 } 32 //正文开始 33 S_temp.replace(0,23,1,' '); 34 //正文链接 35 ArticleContent.append(S_temp); 36 } 37 else//有空格,一篇读完 38 { 39 ArticleContent.append("\n"); 40 //内容统计 41 SigleArtile.DelWordList(); 42 ContentStatistics(ArticleContent, SigleArtile); 43 44 map<string,size_t>::iterator mapiter; 45 for( mapiter = SigleArtile.GetWordList().begin() ;mapiter != SigleArtile.GetWordList().end();mapiter++) 46 { 47 save << (*mapiter).first; 48 save << ":"; 49 save << (*mapiter).second; 50 save << "|"; 51 } 52 save << '\n'; 53 //存储文章 54 ArticleList.push_back(SigleArtile); 55 //下一篇文章 56 FirstHead = true; 57 ArticleContent.clear(); 58 } 59 } 60 //销毁字符 61 save.close(); 62 delete[] C_temp; 63 ArticleFile.close(); 64 }
数据以文件的形式存储,最后根据预处理好的文件进行余弦距离运算,计算时当两篇文档其中有一篇无此词时,就忽略运算,保证只进行互有词运算。另外,对每篇文章的长度进行预运算,目的是提高速度。
TextComputing.cpp
1 void TextComputing(list<Article>& ArticleList,size_t ArticleNum) 2 { 3 //1.计算文本长度 4 //2.1-100之间和除此之外的乘机之和然后 5 6 //1. 7 // size_t ArticleNum = 100; 8 size_t ArticleOffset = 1; 9 list<Article>::iterator FiArticleIter; 10 list<Article>::iterator SeArticleIter; 11 map<string,size_t>::iterator FiWordIter; 12 map<string,size_t>::iterator SeWordIter; 13 double MulWord = 0.0; 14 fstream SaveResult; 15 SaveResult = FOpen("result.txt"); 16 17 for (FiArticleIter = ArticleList.begin();(FiArticleIter != ArticleList.end())&&(ArticleOffset <= ArticleNum); ++FiArticleIter) 18 { 19 for ( SeArticleIter = FiArticleIter; SeArticleIter != ArticleList.end();) 20 { 21 ++SeArticleIter; 22 for (FiWordIter = FiArticleIter->GetWordList().begin();FiWordIter != FiArticleIter->GetWordList().end();FiWordIter++) 23 { 24 if((SeWordIter = SeArticleIter->GetWordList().find((*FiWordIter).first)) != SeArticleIter->GetWordList().end()) 25 { 26 MulWord += SeWordIter->second * FiWordIter->second; 27 } 28 } 29 //2. 30 MulWord = MulWord/FiArticleIter->GetVectorLength()/SeArticleIter->GetVectorLength(); 31 result(FiArticleIter->GetArticleDate(), SeArticleIter->GetArticleDate(), MulWord,SaveResult); 32 //cout << FiArticleIter->GetArticleDate() << "->"<<SeArticleIter->GetArticleDate()<< ":" <<MulWord << endl; 33 MulWord = 0.0; 34 } 35 //cout << ArticleOffset << endl; 36 ArticleOffset++; 37 } 38 SaveResult.close(); 39 }
运行结果
输出result.txt文件,结果如下:
其运行结果,如图:
小结:本文只是对文本相似度的初步验证,余弦向量只考虑了词频,并没有考虑词的重要程度,以后的文本处理将基于TF-IDF进行预处理。在对词的存储上,程序使用了map存储,在查找相应词频时会有时间损耗,改进办法是转变为倒排索引,可以大大提高词的查找速度,大约可将时间缩减一半左右。
接下来,将进一步完善分词预处理等工作。