Lucene版本用的是3.5.0,虽然版本比较老,但是Zoie支持这个版本。
package w0141112;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.*;
import org.apache.lucene.store.FSDirectory;
import java.io.*;
/**
* User: zhijun.he@renren-inc.com
* Date: 2014/11/12.
*/
/**
 * For each tag read from input.txt (one UTF-8 line per tag), prints how many
 * documents across the two DMP indexes contain that tag in field "v".
 */
public class LuceneTask {
    public static void main(String[] args) {
        BufferedReader input = null;
        MultiReader reader = null;
        IndexSearcher searcher = null;
        String[] path = {"/data/dmp_index/dmp_index_v1_0", "/data/dmp_index/dmp_index_v1_1"};
        try {
            // MultiSearcher / ParallelMultiSearcher are deprecated in Lucene 3.5.0;
            // a MultiReader over the per-index readers is the recommended way to
            // search several indexes as one.
            reader = new MultiReader(new IndexReader[]{
                    IndexReader.open(FSDirectory.open(new File(path[0]))),
                    IndexReader.open(FSDirectory.open(new File(path[1])))});
            System.out.println("begin");
            System.out.println(reader.numDocs());
            searcher = new IndexSearcher(reader);
            InputStream inputStream = new FileInputStream("input.txt");
            input = new BufferedReader(new InputStreamReader(inputStream, "utf-8"));
            String tag;
            while ((tag = input.readLine()) != null) {
                System.out.print(tag + " ");
                // Exact-term lookup on field "v"; MyCollector just counts hits.
                TermQuery query = new TermQuery(new Term("v", tag));
                MyCollector collector = new MyCollector();
                searcher.search(query, collector);
                System.out.println(collector.cnt);
            }
            System.out.println("over");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (searcher != null) {
                    searcher.close();
                }
                // IndexSearcher(IndexReader) does NOT close a reader it was
                // handed, so the MultiReader (and its sub-readers) must be
                // closed explicitly or the index files stay open.
                if (reader != null) {
                    reader.close();
                }
                if (input != null) {
                    input.close();
                }
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}
/**
 * Minimal hit-counting Collector: increments {@link #cnt} once per matching
 * document and ignores scoring entirely.
 */
class MyCollector extends Collector {
    /** Number of documents matched so far; read by the caller after search(). */
    int cnt = 0;

    public MyCollector() {
        super();
    }

    @Override
    public void setScorer(Scorer scorer) throws IOException {
        // Scores are never read, so the scorer is not retained.
    }

    @Override
    public void collect(int i) throws IOException {
        // Invoked once per matching document; i is the segment-relative doc id.
        cnt++;
    }

    @Override
    public void setNextReader(IndexReader indexReader, int i) throws IOException {
        // Doc ids are not recorded, so the per-segment docBase is irrelevant.
    }

    @Override
    public boolean acceptsDocsOutOfOrder() {
        // Counting is order-independent, so allow out-of-order collection:
        // per the Collector javadoc this lets Lucene choose faster
        // out-of-order scorers (e.g. for some BooleanQuery cases).
        return true;
    }
}
这里只是统计了tag在这两个索引文件中匹配的文档数量。说明以下几点收获:
1.对于在两个索引文件中一起查找,网上查询的结果是使用ParallelMultiSearcher或者MultiSearcher,但是这两个类即使在lucene 3.5.0版本都已经deprecated了,所以使用了
MultiReader
2.collector的使用
可以看到这里collector其实根本没有加什么代码,就实现了统计的功能。需要指出的是collect方法对于每一个匹配query的doc都会进入,参数i就是匹配的doc的id号。那我为什么不使用search(query, int)那个方法呢?(我突然发现,用search(query, int)方法其实更合适,原先理解错了:无论int设置得多小,返回的totalHits都是全部匹配数...)但是需要指出的是这个int值绝对不能设置太大,因为它会开辟一个相同大小的优先队列,太大就会使java运行时报错,比如设置为Integer.MAX_VALUE,会抛出OutOfMemoryError(堆内存不足)。