Lucene4.0索引一个文件夹下的所有文档,并输出每个文档的向量 (文档中每个term的词频)
已经实现:
首先,建立索引(indexfiles.java),然后(searchfiles.java作相应修改即可)
Searchfiles.java修改部分:
........
.......
// Open the index directory and build a searcher plus the same analyzer
// used at index time (StandardAnalyzer, Lucene 4.0 API).
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
// Read queries either from a file or interactively from stdin, always as UTF-8.
BufferedReader in = null;
if (queries != null) {
in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
} else {
in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
}
QueryParser parser = new QueryParser(Version.LUCENE_40, field, analyzer);
while (true) {
if (queries == null && queryString == null) { // prompt the user
System.out.println("Enter query: ");
}
String line = queryString != null ? queryString : in.readLine();
// NOTE(review): length() can never be -1, so only the null check can fire here;
// this dead condition is inherited verbatim from Lucene's demo SearchFiles.java.
if (line == null || line.length() == -1) {
break;
}
line = line.trim();
if (line.length() == 0) {
break; // a blank line ends the query loop
}
Query query = parser.parse(line);
System.out.println("Searching for: " + query.toString(field));
if (repeat > 0) { // repeat & time as benchmark
Date start = new Date();
for (int i = 0; i < repeat; i++) {
searcher.search(query, null, 100);
}
Date end = new Date();
System.out.println("Time: "+(end.getTime()-start.getTime())+"ms");
}
//-----------find top-k arguments---------------------------
// TopKarguments.TopKArguments(reader, searcher, query,outputTopArgsFileName);
//-----------generate individual documents vector---------------------------
// Print one term-frequency vector per indexed document (see Vectorize below).
Vectorize.DocVec(reader, searcher);
.......
.........
Vectorize.java 主要代码:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
//import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
//my code
import org.apache.lucene.search.similarities.*;
//import org.apache.lucene.index.Term;
//import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
//import org.apache.lucene.index.TermState;
//import org.apache.lucene.index.TermContext;
//import org.apache.lucene.search.*;
//import org.apache.lucene.store.*;
//my code
/**
 * Builds the global term vector for the "contents" field: every distinct
 * term in the index together with its total frequency across all documents.
 *
 * Fixes over the previous version:
 * - a segment without a "contents" field no longer throws an NPE
 *   (fields.terms(...) may return null);
 * - on a multi-segment index the same term is no longer added once per
 *   segment: per-segment totalTermFreq values are summed per term, so the
 *   returned vocabulary has no duplicates and the printed count is correct.
 *
 * @param reader an open IndexReader over the index to scan
 * @return a VecArray whose parallel lists hold each term and its total frequency
 * @throws Exception on any I/O error while reading the index
 */
public static VecArray GlobalVector( IndexReader reader) throws Exception
{
	VecArray Gva=new VecArray();
	// LinkedHashMap keeps first-seen order; TermsEnum iterates terms in sorted
	// order within a segment, so a single-segment index preserves the old order.
	Map<String, Long> termTotals=new LinkedHashMap<String, Long>();
	List<AtomicReaderContext> leaves=reader.leaves();
	for(int i=0;i<leaves.size();++i)
	{
		AtomicReader ar=leaves.get(i).reader();
		Fields fields=ar.fields();
		if(fields==null)
		{
			continue; // segment exposes no fields at all
		}
		Terms terms=fields.terms("contents");
		if(terms==null)
		{
			continue; // this segment has no "contents" field
		}
		TermsEnum termsEnum=terms.iterator(null);
		BytesRef bytesref;
		while((bytesref=termsEnum.next())!=null)
		{
			String term=bytesref.utf8ToString();
			long tf=termsEnum.totalTermFreq();
			Long prev=termTotals.get(term);
			termTotals.put(term, prev==null ? tf : prev+tf);
		}
	}
	for(Map.Entry<String, Long> e:termTotals.entrySet())
	{
		Gva.terms.add(e.getKey());
		Gva.freqs.add(e.getValue());
	}
	System.out.println("Total number of terms="+termTotals.size());
	return Gva;
}
/**
 * Prints a term-frequency vector for every document and its cosine
 * similarity against the given query, using the global "contents"
 * vocabulary as the vector dimensions.
 *
 * Fixes over the previous version:
 * - the inline global-vector code wrongly looped over EVERY indexed field
 *   and re-added the whole "contents" vocabulary once per field (the
 *   standalone GlobalVector() has that loop commented out), inflating
 *   TotalNumTerms; the field loop is removed here;
 * - a segment without a "contents" field no longer throws an NPE;
 * - dead locals removed (hits/start/end/vector/threshold, the redundant
 *   flag bookkeeping — f is already 0 when a term does not occur).
 *
 * NOTE(review): document vectors are read via getTermVectors(i), which only
 * works if term vectors were stored at index time, and iterating doc IDs
 * 0..numDocs()-1 assumes the index has no deletions — confirm both.
 *
 * @param reader      open reader over the index
 * @param in          query input stream (unused here; kept for caller compatibility)
 * @param searcher    searcher over the same index
 * @param query       the parsed user query
 * @param hitsPerPage page size used when collecting hit counts
 * @throws Exception on any index I/O or parsing error
 */
public static void DocLocalVec(IndexReader reader,BufferedReader in, IndexSearcher searcher, Query query, int hitsPerPage) throws Exception
{
	//down:code for Global vector (one dimension per distinct "contents" term)
	int TotalNumTerms=0;
	VecArray Gva=new VecArray();
	List<AtomicReaderContext> leaves=reader.leaves();
	for(int i=0;i<leaves.size();++i)
	{
		AtomicReader ar=leaves.get(i).reader();
		Fields fields=ar.fields();
		if(fields==null)
		{
			continue;
		}
		Terms terms=fields.terms("contents");
		if(terms==null)
		{
			continue; // segment has no "contents" field
		}
		TermsEnum termsEnum=terms.iterator(null);
		BytesRef bytesref;
		while((bytesref=termsEnum.next())!=null)
		{
			Gva.terms.add(bytesref.utf8ToString());
			Gva.freqs.add(termsEnum.totalTermFreq());
			TotalNumTerms++;
		}
	}
	System.out.println("Total number of terms="+TotalNumTerms);
	//up:code for Global vector
	//down:code for query vector (term counts of the query over the global vocabulary)
	String[] q=query.toString("contents").split(" "); //extract individual terms from string
	System.out.println("Number of terms in query:"+q.length);
	VecArray queryVec=new VecArray();
	for(int j=0;j<TotalNumTerms;j++)
	{
		String tg=(String)Gva.terms.get(j);
		int count=0;
		for(int l=0;l<q.length;l++)
		{
			if(tg.equals(q[l]))
			{
				count++;
			}
		}
		queryVec.OriginalF.add(count); //store the term frequency in query
		queryVec.terms.add(tg);
	}
	CosinSimilarity CSq=new CosinSimilarity();
	CSq.normVec(queryVec.OriginalF, queryVec.NormF);
	//up:code for query vector
	//-------------------------------------------------------------------------------
	//down:code for documents vector
	TopDocs results = searcher.search(query, 100 * hitsPerPage);
	int numTotalHits = results.totalHits;
	int totalNumDoc=reader.numDocs();
	System.out.println(numTotalHits + " total matching documents");
	System.out.println();
	VecArray[] docVec=new VecArray[totalNumDoc];
	for (int i = 0; i < totalNumDoc; i++)
	{
		docVec[i]=new VecArray();
		//get the path and explanation of the document
		Document doc = searcher.doc(i);
		docVec[i].path=doc.get("path");
		docVec[i].e=searcher.explain(query, i);
		// per-document term vector: requires term vectors stored at index time
		Fields fields=searcher.getIndexReader().getTermVectors(i);
		if(fields!=null){
			for(String field:fields)
			{
				Terms terms=fields.terms(field);
				TermsEnum termsEnum=terms.iterator(null);
				BytesRef bytesref;
				while((bytesref=termsEnum.next())!=null)
				{
					docVec[i].terms.add(bytesref.utf8ToString());
					docVec[i].freqs.add(termsEnum.totalTermFreq());
				}
			}
		}
	}
	// project every document onto the global vocabulary and score it
	for(int j=0;j<totalNumDoc;j++)
	{
		System.out.print("Vector is:<");
		for(int l=0;l<TotalNumTerms;l++)
		{
			String tg=(String)Gva.terms.get(l);
			int f=0; // stays 0 when the document does not contain the term
			for(int k=0;k<docVec[j].terms.size();k++)
			{
				if(tg.equals((String)docVec[j].terms.get(k)))
				{
					f=Integer.parseInt(docVec[j].freqs.get(k).toString());
				}
			}
			docVec[j].OriginalF.add(f);
			System.out.print(f+",");
		}
		System.out.println(">");
		CosinSimilarity CS=new CosinSimilarity();
		CS.normVec(docVec[j].OriginalF, docVec[j].NormF);
		float score= CS.CosinSimilarityScore(docVec[j].NormF, queryVec.NormF);
		docVec[j].score=score;
		System.out.println((j+1) + ".Path:"+docVec[j].path+"---Score="+docVec[j].score);
		System.out.println();
	}
}
/**
 * Prints the original and normalized term-frequency vector of every
 * document in the index, one block per document, each preceded by the
 * document's "path" field.
 *
 * Fix: the previous version allocated Gva and docVec and immediately
 * overwrote both — the dead allocations are removed.
 *
 * @param reader   open reader over the index
 * @param searcher searcher over the same index
 * @throws Exception on any index I/O error
 */
public static void DocVec(IndexReader reader,IndexSearcher searcher) throws Exception
{
	// global vocabulary first, then one vector per document over it
	VecArray Gva=GlobalVector(reader);
	VecArray[] docVec=DocumentVector.BuildDocumentVector(reader, searcher, Gva);
	int totalNumDoc=reader.numDocs();
	for(int i=0;i<totalNumDoc;i++)
	{
		System.out.println((i+1) + ".path:"+docVec[i].path);
		System.out.print("Original vector is:<");
		for(int j=0;j<docVec[i].OriginalF.size();j++)
		{
			System.out.print(docVec[i].OriginalF.get(j)+",");
		}
		System.out.println(">");
		System.out.print("Normalized vector is:<");
		for(int j=0;j<docVec[i].NormF.size();j++)
		{
			System.out.print(docVec[i].NormF.get(j)+",");
		}
		System.out.println(">");
	}
}
}