Basic Usage of IndexSearcher and Notes

Reference: http://www.cnblogs.com/huangfox/archive/2010/10/16/1853086.html


First, assume the index is built with the following code:

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;


public class ceshi0303 {
	public static void main(String[] args) {
		IndexWriter writer = null;
		FSDirectory dir = null;
		try {
			dir = FSDirectory.open(new File("d:\\20140303index"));
			Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
			IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
			writer = new IndexWriter(dir, iwConfig);
			// Reuse the same Field instances across documents; only the values change.
			Field f1 = new TextField("f1", "", Store.YES);
			Field f2 = new TextField("f2", "", Store.YES);
			long s = System.currentTimeMillis();
			for (int i = 0; i < 500000; i++) {
				Document doc = new Document();
				f1.setStringValue("f1 hello doc" + i);
				doc.add(f1);
				f2.setStringValue("f2 world doc" + i);
				doc.add(f2);
				writer.addDocument(doc);
			}
			System.out.println(System.currentTimeMillis() - s + "ms");
			System.out.println("over");
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (writer != null) {
				try {
					writer.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}
}



Note: as the timings show, a TextField is run through the analyzer when indexed, so indexing is slower than with StringField (which is not analyzed): the time went from 3892.8 ms to 5775 ms.
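For comparison, here is a minimal sketch of the StringField variant that the timing presumably refers to; only the field declarations change, everything else matches the indexing code above. StringField indexes the whole value as a single un-analyzed token, which is why it is faster:

Field f1 = new StringField("f1", "", Store.YES); // not analyzed: one token per value
Field f2 = new StringField("f2", "", Store.YES);
for (int i = 0; i < 500000; i++) {
	Document doc = new Document();
	f1.setStringValue("f1 hello doc" + i);
	doc.add(f1);
	f2.setStringValue("f2 world doc" + i);
	doc.add(f2);
	writer.addDocument(doc);
}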


1. search(Query query, int n) example

Returns the top n matching documents.

import java.io.File;
import java.io.IOException;


import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class IndexSearchDemo {
	
	public static void main(String[] args) {
		new IndexSearchDemo().searcher("f1 hello doc1000");
	}
	
	public void searcher(String queryString) {
		try {
			FSDirectory dir = FSDirectory.open(new File("d:\\20140303index"));
			IndexReader reader = DirectoryReader.open(dir);
			IndexSearcher searcher = new IndexSearcher(reader);
			
			QueryParser parser = new QueryParser(Version.LUCENE_46, "f1", new StandardAnalyzer(Version.LUCENE_46));
			Query query = parser.parse(queryString);
			System.out.println(query.toString("f1"));
			TopDocs tds = searcher.search(query, 5);
			ScoreDoc[] sd = tds.scoreDocs;
			System.out.println(tds.totalHits);
			for (ScoreDoc doc : sd) {
				System.out.println(reader.document(doc.doc));
				// Note: searcher.explain() shows how each document's score was computed.
				System.out.println("Explanation: " + searcher.explain(query, doc.doc));
			}
			reader.close();
		} catch (Exception e) {
			e.printStackTrace();
		}
	}
}

Output:

500000
Document<stored,indexed,tokenized<f1:f1 hello doc1000> stored,indexed,tokenized<f2:f2 world doc1000>>
Explanation: 6.7517376 = (MATCH) sum of:
  0.037027355 = (MATCH) weight(f1:f1 in 1000) [DefaultSimilarity], result of:
    0.037027355 = score(doc=1000,freq=1.0 = termFreq=1.0
), product of:
      0.07405486 = queryWeight, product of:
        0.999998 = idf(docFreq=500000, maxDocs=500000)
        0.07405501 = queryNorm
      0.499999 = fieldWeight in 1000, product of:
        1.0 = tf(freq=1.0), with freq of:
          1.0 = termFreq=1.0
        0.999998 = idf(docFreq=500000, maxDocs=500000)
        0.5 = fieldNorm(doc=1000)
  0.037027355 = (MATCH) weight(f1:hello in 1000) [DefaultSimilarity], result of:
    0.037027355 = score(doc=1000,freq=1.0 = termFreq=1.0
), product of:
      0.07405486 = queryWeight, product of:
        0.999998 = idf(docFreq=500000, maxDocs=500000)
        0.07405501 = queryNorm
      0.499999 = fieldWeight in 1000, product of:
        1.0 = tf(freq=1.0), with freq of:
          1.0 = termFreq=1.0
        0.999998 = idf(docFreq=500000, maxDocs=500000)
        0.5 = fieldNorm(doc=1000)
  6.677683 = (MATCH) weight(f1:doc1000 in 1000) [DefaultSimilarity], result of:
    6.677683 = score(doc=1000,freq=1.0 = termFreq=1.0
), product of:
      0.99450076 = queryWeight, product of:
        13.429216 = idf(docFreq=1, maxDocs=500000)
        0.07405501 = queryNorm
      6.714608 = fieldWeight in 1000, product of:
        1.0 = tf(freq=1.0), with freq of:
          1.0 = termFreq=1.0
        13.429216 = idf(docFreq=1, maxDocs=500000)
        0.5 = fieldNorm(doc=1000)

Document<stored,indexed,tokenized<f1:f1 hello doc0> stored,indexed,tokenized<f2:f2 world doc0>>
Explanation: 0.04936981 = (MATCH) product of:
  0.07405471 = (MATCH) sum of:
    0.037027355 = (MATCH) weight(f1:f1 in 0) [DefaultSimilarity], result of:
      0.037027355 = score(doc=0,freq=1.0 = termFreq=1.0
), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 0, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=0)
    0.037027355 = (MATCH) weight(f1:hello in 0) [DefaultSimilarity], result of:
      0.037027355 = score(doc=0,freq=1.0 = termFreq=1.0
), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 0, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=0)
  0.6666667 = coord(2/3)

Document<stored,indexed,tokenized<f1:f1 hello doc1> stored,indexed,tokenized<f2:f2 world doc1>>
Explanation: 0.04936981 = (MATCH) product of:
  0.07405471 = (MATCH) sum of:
    0.037027355 = (MATCH) weight(f1:f1 in 1) [DefaultSimilarity], result of:
      0.037027355 = score(doc=1,freq=1.0 = termFreq=1.0
), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 1, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=1)
    0.037027355 = (MATCH) weight(f1:hello in 1) [DefaultSimilarity], result of:
      0.037027355 = score(doc=1,freq=1.0 = termFreq=1.0
), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 1, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=1)
  0.6666667 = coord(2/3)

Document<stored,indexed,tokenized<f1:f1 hello doc2> stored,indexed,tokenized<f2:f2 world doc2>>
Explanation: 0.04936981 = (MATCH) product of:
  0.07405471 = (MATCH) sum of:
    0.037027355 = (MATCH) weight(f1:f1 in 2) [DefaultSimilarity], result of:
      0.037027355 = score(doc=2,freq=1.0 = termFreq=1.0
), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 2, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=2)
    0.037027355 = (MATCH) weight(f1:hello in 2) [DefaultSimilarity], result of:
      0.037027355 = score(doc=2,freq=1.0 = termFreq=1.0
), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 2, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=2)
  0.6666667 = coord(2/3)

Document<stored,indexed,tokenized<f1:f1 hello doc3> stored,indexed,tokenized<f2:f2 world doc3>>
Explanation: 0.04936981 = (MATCH) product of:
  0.07405471 = (MATCH) sum of:
    0.037027355 = (MATCH) weight(f1:f1 in 3) [DefaultSimilarity], result of:
      0.037027355 = score(doc=3,freq=1.0 = termFreq=1.0
), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 3, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=3)
    0.037027355 = (MATCH) weight(f1:hello in 3) [DefaultSimilarity], result of:
      0.037027355 = score(doc=3,freq=1.0 = termFreq=1.0
), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 3, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=3)
  0.6666667 = coord(2/3)

At first glance it seems odd that every indexed document matched (totalHits is 500000). The reason is that QueryParser defaults to OR: "f1 hello doc1000" parses to f1:f1 f1:hello f1:doc1000, and every document contains f1 and hello in field f1. Results come back in descending score order, and doc1000 is the only document containing the rare term doc1000, so it scores highest and is the first hit, which is the one we want.
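If matching every document is not the intent, the parser's default operator can be switched to AND so that all terms are required. A minimal sketch, with the same parser setup as above:

// Require all query terms instead of any one of them.
QueryParser parser = new QueryParser(Version.LUCENE_46, "f1", new StandardAnalyzer(Version.LUCENE_46));
parser.setDefaultOperator(QueryParser.Operator.AND);
Query query = parser.parse("f1 hello doc1000");
// Parses to +f1:f1 +f1:hello +f1:doc1000; with this index only doc1000
// contains all three terms, so totalHits should drop to 1.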


2. search(Query query, Collector results)

Before looking at this method, a quick word on Collector. The Javadoc says:

Collectors are primarily meant to be used to gather raw results from a search, and implement sorting or custom result filtering, collation, etc.

The subclass to focus on first:

TopScoreDocCollector is a concrete subclass of TopDocsCollector and sorts according to score + docID. This is used internally by the IndexSearcher search methods that do not take an explicit Sort. It is likely the most frequently used collector.

It is the most commonly used Collector subclass and ranks hits by relevance by default. The example below uses TopScoreDocCollector to gather results and adds simple pagination.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;


public class IndexSearchDemo {

	public static void main(String[] args) {
		new IndexSearchDemo().searcher("f1 hello doc1000", 10, 10);
	}

	public void searcher(String queryString, int start, int howMany) {
		try {
			FSDirectory dir = FSDirectory.open(new File("d:\\20140303index"));
			IndexReader reader = DirectoryReader.open(dir);
			IndexSearcher searcher = new IndexSearcher(reader);

			QueryParser parser = new QueryParser(Version.LUCENE_46, "f1", new StandardAnalyzer(Version.LUCENE_46));
			Query query = parser.parse(queryString);

			// Collect enough hits to cover the requested page.
			int hm = start + howMany;
			TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
			System.out.println("total hits : " + res.getTotalHits()); // 0: nothing collected yet
			searcher.search(query, res);
			System.out.println("total hits : " + res.getTotalHits()); // now the real hit count
			// Note: pagination is controlled here: skip `start` hits, keep `howMany`.
			TopDocs tds = res.topDocs(start, howMany);
			ScoreDoc[] docs = tds.scoreDocs;
			for (ScoreDoc doc : docs) {
				System.out.println(reader.document(doc.doc));
			}
			reader.close();
		} catch (IOException e) {
			e.printStackTrace();
		} catch (ParseException e) {
			e.printStackTrace();
		}
	}
}

I had not used a Collector before this, so here is my current understanding of the pagination. In TopScoreDocCollector.create(hm, false), hm is the number of hits the collector keeps while searcher.search runs; the second argument (docsScoredInOrder) tells the collector whether it may assume documents arrive in increasing docID order, which lets it use a slightly faster collection algorithm when true. res.topDocs(start, howMany) then slices one page out of the collected hits. In other words, this is just a paging API over the top start + howMany hits: the search still collects all of them, and it is not clear that any deeper optimization happens for later pages.
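For deep pages this gets expensive, since every page re-collects start + howMany hits. Lucene's IndexSearcher also offers searchAfter, which resumes from the last hit of the previous page. A minimal sketch, reusing searcher, query, and reader from the example above and assuming the first page is non-empty:

// Cursor-style paging: instead of re-collecting start + howMany hits for
// every page, pass the last ScoreDoc of the previous page and collect
// only the next page of hits.
TopDocs firstPage = searcher.search(query, 10);
ScoreDoc lastHit = firstPage.scoreDocs[firstPage.scoreDocs.length - 1];
TopDocs secondPage = searcher.searchAfter(lastHit, query, 10);
for (ScoreDoc doc : secondPage.scoreDocs) {
	System.out.println(reader.document(doc.doc));
}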


3. search(Query query, Filter filter, int n, Sort sort)

public void searcher(String queryString) {
	try {
		FSDirectory dir = FSDirectory.open(new File("d:\\20140303index"));
		IndexReader reader = DirectoryReader.open(dir);
		IndexSearcher searcher = new IndexSearcher(reader);

		QueryParser parser = new QueryParser(Version.LUCENE_46, "f1", new StandardAnalyzer(Version.LUCENE_46));
		Query query = parser.parse(queryString);

		// Sort by field f1 instead of by relevance; the Filter argument is null.
		SortField sf = new SortField("f1", SortField.Type.INT);
		Sort sort = new Sort(sf);
		TopDocs tds = searcher.search(query, null, 5, sort);
		System.out.println(tds.totalHits);
		ScoreDoc[] docs = tds.scoreDocs;
		for (ScoreDoc doc : docs) {
			System.out.println(reader.document(doc.doc));
		}
		reader.close();
	} catch (IOException e) {
		e.printStackTrace();
	} catch (ParseException e) {
		e.printStackTrace();
	}
}

Output:

500000  
Document<stored,indexed,tokenized<f1:f1 hello doc0> stored,indexed,tokenized<f2:f2 world doc0>>  
Document<stored,indexed,tokenized<f1:f1 hello doc1> stored,indexed,tokenized<f2:f2 world doc1>>  
Document<stored,indexed,tokenized<f1:f1 hello doc2> stored,indexed,tokenized<f2:f2 world doc2>>  
Document<stored,indexed,tokenized<f1:f1 hello doc3> stored,indexed,tokenized<f2:f2 world doc3>>  
Document<stored,indexed,tokenized<f1:f1 hello doc4> stored,indexed,tokenized<f2:f2 world doc4>>
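A caveat: f1 is a tokenized text field, so sorting it as SortField.Type.INT leans on the field cache parsing the indexed terms, which is fragile. The usual approach is to index a dedicated numeric field and sort on that. A minimal sketch, assuming a hypothetical id field added at indexing time (IntField is org.apache.lucene.document.IntField):

// At indexing time, add a dedicated numeric field (hypothetical "id" field):
doc.add(new IntField("id", i, Store.NO));

// At search time, sort on the numeric field instead of the text field:
Sort sort = new Sort(new SortField("id", SortField.Type.INT));
TopDocs tds = searcher.search(query, null, 5, sort);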


