November 24
Browse the Index
参考文档:http://lemur.wiki.sourceforge.net/Example+Applications+in+CPlusPlus
http://www.lemurproject.org/doxygen/lemur/html/classlemur_1_1api_1_1Index.html
随便写一个测试程序:
- #include<iostream>
- #include"Index.hpp"
- #include"IndexManager.hpp"
- #include"TermInfoList.hpp"
- #include"DocInfoList.hpp"
- #include<string>
- using namespace std;
- using namespace lemur::api;
- // Demo program: opens a Lemur index and prints
- //   1. collection-wide statistics for one term,
- //   2. the ID of one document plus document counts,
- //   3. the first entries of the term's inverted list (with positions),
- //   4. the first entries of the document's term lists.
- // Everything is written to standard output; returns 0 when done.
- int main()
- {
-     Index *ind = IndexManager::openIndex("/media/disk/IndexIndri");
-     /*
-      * term( char* s ): convert term string to a number
-      * term( int id ): convert term number to a string
-      * termCount(): Total number of terms indexed
-      * termCount( int id ): Total number of occurrences of term number id
-      * docLength( int docID ): The length, in number of terms, of document number docID.
-      * termCountUnique(): Size of the index vocabulary
-      * docInfoList( int termID ): Returns an iterator to the inverted list for termID.
-      * The list contains all documents that contain termID, including the positions where termID occurs.
-      * document( char* s ): convert doc string to a number
-      * document( int id ): convert doc number to a string
-      * docCount(): Number of documents indexed
-      * docCount( int id ): Number of documents that contain term number id
-      * docLengthAvg(): Average indexed document length
-      * termInfoList( int docID ): Returns an iterator to the direct list for docID.
-      * The list contains term numbers for every term contained in document docID, and the number of times each word occurs.
-      */
-     // First, we want the counts of one term, overall and per document.
-     string term1 = "How";
-     // NOTE(review): the lookup result is used unchecked; confirm how the
-     // index reports an out-of-vocabulary term before relying on termID.
-     int termID = ind->term(term1);
-     cout << term1 << ":" << ind->termCount(termID) << "\n"
-          << "Its ID is:" << termID << "\n"
-          << "Unique Terms=" << ind->termCountUnique() << "\n"
-          << "All words=" << ind->termCount() << endl;
-     // Get one document ID from its external name.
-     int docID = ind->document("WTX001-B01-1");
-     // docCount() is the number of indexed documents and docCount(id) the
-     // number of documents containing the term, so label them accordingly.
-     cout << ind->document(docID) << "'s ID is " << docID << "\n"
-          << "The index has " << ind->docCount() << " documents\n"
-          << ind->docCount(termID) << " documents contain \"" << term1 << "\"\n"
-          << endl;
-     DocInfoList *docList = ind->docInfoList(termID);
-     // Iterate over (at most 20) entries in docList.
-     docList->startIteration();
-     DocInfo *dEntry;
-     int times = 0;
-     while (docList->hasMore() && times++ < 20)
-     {
-         dEntry = docList->nextEntry();
-         // note that nextEntry() does *not* return an instance,
-         // instead, it passes out a pointer to a local static variable
-         // so no "delete" is needed.
-         // print out this entry
-         cout << "-> " << dEntry->termCount() << " times in doc "
-              << ind->document(dEntry->docID()) << endl;
-         // One position per occurrence of the term in this document.
-         for (int t = 0; t < dEntry->termCount(); t++)
-         {
-             cout << dEntry->positions()[t] << " ";
-         }
-         cout << endl;
-     }
-     delete docList; // note that you MUST delete docList!
-     // Print (at most 10) entries of the document's term list.
-     TermInfoList *termInfoList = ind->termInfoList(docID);
-     termInfoList->startIteration();
-     times = 0;
-     while (termInfoList->hasMore() && times++ < 10)
-     {
-         TermInfo *termInfo = termInfoList->nextEntry();
-         cout << ind->term(termInfo->termID()) << "----" << termInfo->count() << endl;
-         // for(int s=0;s<termInfo->count();++s)
-         // cout<<termInfo->positions()[s]<<"\t";
-     }
-     cout << endl;
-     // Release the first list before the pointer is reused; otherwise it leaks.
-     delete termInfoList;
-     // Same document again through termInfoListSeq()
-     // (presumably the sequential-order variant of the list; verify in the API docs).
-     termInfoList = ind->termInfoListSeq(docID);
-     termInfoList->startIteration();
-     times = 0;
-     while (termInfoList->hasMore() && times++ < 10)
-     {
-         TermInfo *termInfo = termInfoList->nextEntry();
-         cout << ind->term(termInfo->termID()) << "\t" << termInfo->count() << endl;
-     }
-     delete termInfoList;
-     // The index returned by openIndex() is owned by the caller.
-     delete ind;
-     return 0;
- }
下一站:Simple Retrieval