- 代码实现功能:
1.构建文档中每个词的倒排索引并输出
2.输入两个查询词,通过合并倒排索引,输出它们共同的文档ID
- 实现如下:
1.在工程内添加doc1.txt~doc10.txt文档
2.代码如下
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <map>
#include <stdlib.h>
#include <sstream>
#include <utility>
using namespace std;
// Posting payload for one (term, document) pair: how often the term occurs
// in that document and at which word positions.
struct DocMes {
    // Term frequency: number of occurrences of the term in the document.
    int TF{0};
    // Textual list of 1-based word positions; CreateIndex appends each
    // position followed by " , ".
    string pos{};
};
typedef map<int,DocMes> invertMap;//倒排列表
typedef map<string,map<int,DocMes> > IndexMap; //定义multiMap类型
//把单词转为小写
//void judge(string &word)
// Converts an int to its decimal string form (e.g. 7 -> "7", -3 -> "-3").
//
// Uses std::to_string instead of a stringstream round-trip: no stream object
// is built per call, and the digits are never subject to locale-specific
// grouping, which matters because the result is used to build file names.
string intToString(int a){
    return std::to_string(a);
}
// Builds the inverted index from doc1.txt .. doc10.txt, opened by relative
// path (i.e. resolved against the current working directory).
//
// Each file is split on whitespace with operator>>; tokens are indexed
// verbatim (no lower-casing, no punctuation stripping). For every token,
// index[token][docId] gets its TF incremented and the token's 1-based ordinal
// within that file appended to pos as "<n> , ".
//
// index: the index to fill; entries already present are kept and extended.
//
// A file that cannot be opened is reported on stderr and skipped, so the
// remaining documents are still indexed.
void CreateIndex(IndexMap &index){
    const int kDocCount = 10; // documents are named doc1.txt .. doc<kDocCount>.txt
    for(int docId = 1; docId <= kDocCount; ++docId){
        const string filepath = "doc" + intToString(docId) + ".txt";
        // One stream object per file: EOF/fail flags left behind by the
        // previous file can never affect this open, and the destructor closes
        // the file on every path out of the iteration.
        ifstream fileIn(filepath.c_str());
        if(!fileIn.is_open()){
            cerr<<"无法打开文件 "<<filepath<<",已跳过"<<endl;
            continue;
        }
        string word;
        int posNum = 0; // 1-based ordinal of the current word within this file
        while(fileIn >> word){
            ++posNum;
            // Resolve the posting once instead of once per updated field.
            DocMes &posting = index[word][docId];
            posting.TF++;
            posting.pos += intToString(posNum);
            posting.pos += " , ";
        }
    }
}
//查询处理
void queryProcess(IndexMap &index){
string query;
while(true){
cout<<"请输入查询词:";
pair <string , string> query;
cin>>query.first>>query.second;
if(index.find(query.first)==index.end()){
cout<<"没有找到"<<query.first<<"!请确认您输入的查询词是否有误"<<endl<<endl;
continue;
}
else if(index.find(query.second)==index.end()){
cout<<"没有找到"<<query.second<<"!请确认您输入的查询词是否有误"<<endl<<endl;
continue;
}
vector<int> docID;
invertMap::iterator riter1;
invertMap::iterator riter2;
riter1 = index[query.first].begin();
riter2 = index[query.second].begin();
while(riter1!=index[query.first].end() && riter2 != index[query.second].end()){ //合并倒排索引
if (riter1->first == riter2->first){
docID.push_back(riter1->first);
riter1++;
riter2++;
}else if(riter1->first < riter2->first){
riter1++;
}else {
riter2++;
}
}
if(docID.empty() == 1){ //查询词不在同一文档,输出各自的倒排索引
cout<<"这两个词在不同的文档,其倒排索引如下"<<endl;
invertMap::iterator riter1;
riter1 = index[query.first].begin();
while (riter1 != index[query.first].end()){
cout<<query.first<<" doc"<<riter1->first<<".txt Tf="<<riter1->second.TF<<" pos = ( "<<riter1->second.pos<<" )"<<endl;
riter1++;
}
invertMap::iterator riter2;
riter2 = index[query.second].begin();
while (riter2 != index[query.second].end()){
cout<<query.second<<" doc"<<riter2->first<<".txt Tf="<<riter2->second.TF<<" pos = ( "<<riter2->second.pos<<" )"<<endl;
riter2++;
}
}else {
for (int i = 0;i<docID.size();i++){
cout<<"查询词所在文档为:doc"<<docID[i]<<".txt "<<endl<<endl;
}
}
}
}
// Prints the whole inverted index to stdout.
//
// For every term, in the key order of the map: a blank line, the term, then
// one line per posting with the document file name, the term frequency and
// the recorded positions.
//
// index: the inverted index to print; it is not modified.
//
// Returns 0 always. The function is declared int, and flowing off the end of
// a non-void function is undefined behavior, so an explicit return is needed.
int printinverse(IndexMap &index){
    // Read-only view; iterating the entries directly avoids copying each term
    // and looking it up a second time.
    const IndexMap &idx = index;
    for(IndexMap::const_iterator iter = idx.begin(); iter != idx.end(); ++iter){
        const string &term = iter->first;
        const invertMap &postings = iter->second;
        cout<<endl<<term<<endl;
        for(invertMap::const_iterator riter = postings.begin(); riter != postings.end(); ++riter){
            cout<<"doc"<<riter->first<<".txt Tf="<<riter->second.TF<<" pos = ( "<<riter->second.pos<<" )"<<endl;
        }
    }
    return 0;
}
int main(){
IndexMap invertIndex;
CreateIndex(invertIndex);
cout<<"建立倒排索引如下:"<<endl;
printinverse(invertIndex);
cout<<endl;
queryProcess(invertIndex);
return 0;
}
实现主要借助 C++ 标准库的模板类 std::map 和 std::pair。
- 实现效果如下