http://www.cnblogs.com/huangfox/archive/2010/10/16/1853086.html
First, assume the index is built as follows:
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class ceshi0303 {

    public static void main(String[] args) {
        IndexWriter writer = null;
        FSDirectory dir = null;
        try {
            dir = FSDirectory.open(new File("d:\\20140303index"));
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
            IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_46, analyzer);
            writer = new IndexWriter(dir, iwConfig);
            // Reuse the same Field instances across documents; this is the
            // recommended Lucene pattern and reduces object churn.
            Field f1 = new TextField("f1", "", Store.YES);
            Field f2 = new TextField("f2", "", Store.YES);
            long s = System.currentTimeMillis();
            for (int i = 0; i < 500000; i++) {
                Document doc = new Document();
                f1.setStringValue("f1 hello doc" + i);
                doc.add(f1);
                f2.setStringValue("f2 world doc" + i);
                doc.add(f2);
                writer.addDocument(doc);
            }
            System.out.println(System.currentTimeMillis() - s + "ms");
            System.out.println("over");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) { // guard against an NPE if opening the index failed
                    writer.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
1. search(Query query, int n) example
Retrieves the top n results for the query.
import java.io.File;
import java.io.IOException;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
public class IndexSearchDemo {

    public static void main(String[] args) {
        new IndexSearchDemo().searcher("f1 hello doc1000");
    }

    public void searcher(String queryString) {
        try {
            FSDirectory dir = FSDirectory.open(new File("d:\\20140303index"));
            IndexReader reader = DirectoryReader.open(dir);
            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(Version.LUCENE_46, "f1",
                    new StandardAnalyzer(Version.LUCENE_46));
            Query query = parser.parse(queryString);
            System.out.println(query.toString("f1"));
            // Fetch only the top 5 hits.
            TopDocs tds = searcher.search(query, 5);
            ScoreDoc[] sd = tds.scoreDocs;
            System.out.println(tds.totalHits);
            for (ScoreDoc doc : sd) {
                System.out.println(reader.document(doc.doc));
                // Note: explain() shows the detailed scoring of each document.
                System.out.println("Explanation: " + searcher.explain(query, doc.doc));
            }
            reader.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Output:
500000
Document<stored,indexed,tokenized<f1:f1 hello doc1000> stored,indexed,tokenized<f2:f2 world doc1000>>
Explanation: 6.7517376 = (MATCH) sum of:
  0.037027355 = (MATCH) weight(f1:f1 in 1000) [DefaultSimilarity], result of:
    0.037027355 = score(doc=1000, freq=1.0 = termFreq=1.0), product of:
      0.07405486 = queryWeight, product of:
        0.999998 = idf(docFreq=500000, maxDocs=500000)
        0.07405501 = queryNorm
      0.499999 = fieldWeight in 1000, product of:
        1.0 = tf(freq=1.0), with freq of:
          1.0 = termFreq=1.0
        0.999998 = idf(docFreq=500000, maxDocs=500000)
        0.5 = fieldNorm(doc=1000)
  0.037027355 = (MATCH) weight(f1:hello in 1000) [DefaultSimilarity], result of:
    0.037027355 = score(doc=1000, freq=1.0 = termFreq=1.0), product of:
      0.07405486 = queryWeight, product of:
        0.999998 = idf(docFreq=500000, maxDocs=500000)
        0.07405501 = queryNorm
      0.499999 = fieldWeight in 1000, product of:
        1.0 = tf(freq=1.0), with freq of:
          1.0 = termFreq=1.0
        0.999998 = idf(docFreq=500000, maxDocs=500000)
        0.5 = fieldNorm(doc=1000)
  6.677683 = (MATCH) weight(f1:doc1000 in 1000) [DefaultSimilarity], result of:
    6.677683 = score(doc=1000, freq=1.0 = termFreq=1.0), product of:
      0.99450076 = queryWeight, product of:
        13.429216 = idf(docFreq=1, maxDocs=500000)
        0.07405501 = queryNorm
      6.714608 = fieldWeight in 1000, product of:
        1.0 = tf(freq=1.0), with freq of:
          1.0 = termFreq=1.0
        13.429216 = idf(docFreq=1, maxDocs=500000)
        0.5 = fieldNorm(doc=1000)
Document<stored,indexed,tokenized<f1:f1 hello doc0> stored,indexed,tokenized<f2:f2 world doc0>>
Explanation: 0.04936981 = (MATCH) product of:
  0.07405471 = (MATCH) sum of:
    0.037027355 = (MATCH) weight(f1:f1 in 0) [DefaultSimilarity], result of:
      0.037027355 = score(doc=0, freq=1.0 = termFreq=1.0), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 0, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=0)
    0.037027355 = (MATCH) weight(f1:hello in 0) [DefaultSimilarity], result of:
      0.037027355 = score(doc=0, freq=1.0 = termFreq=1.0), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 0, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=0)
  0.6666667 = coord(2/3)
Document<stored,indexed,tokenized<f1:f1 hello doc1> stored,indexed,tokenized<f2:f2 world doc1>>
Explanation: 0.04936981 = (MATCH) product of:
  0.07405471 = (MATCH) sum of:
    0.037027355 = (MATCH) weight(f1:f1 in 1) [DefaultSimilarity], result of:
      0.037027355 = score(doc=1, freq=1.0 = termFreq=1.0), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 1, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=1)
    0.037027355 = (MATCH) weight(f1:hello in 1) [DefaultSimilarity], result of:
      0.037027355 = score(doc=1, freq=1.0 = termFreq=1.0), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 1, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=1)
  0.6666667 = coord(2/3)
Document<stored,indexed,tokenized<f1:f1 hello doc2> stored,indexed,tokenized<f2:f2 world doc2>>
Explanation: 0.04936981 = (MATCH) product of:
  0.07405471 = (MATCH) sum of:
    0.037027355 = (MATCH) weight(f1:f1 in 2) [DefaultSimilarity], result of:
      0.037027355 = score(doc=2, freq=1.0 = termFreq=1.0), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 2, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=2)
    0.037027355 = (MATCH) weight(f1:hello in 2) [DefaultSimilarity], result of:
      0.037027355 = score(doc=2, freq=1.0 = termFreq=1.0), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 2, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=2)
  0.6666667 = coord(2/3)
Document<stored,indexed,tokenized<f1:f1 hello doc3> stored,indexed,tokenized<f2:f2 world doc3>>
Explanation: 0.04936981 = (MATCH) product of:
  0.07405471 = (MATCH) sum of:
    0.037027355 = (MATCH) weight(f1:f1 in 3) [DefaultSimilarity], result of:
      0.037027355 = score(doc=3, freq=1.0 = termFreq=1.0), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 3, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=3)
    0.037027355 = (MATCH) weight(f1:hello in 3) [DefaultSimilarity], result of:
      0.037027355 = score(doc=3, freq=1.0 = termFreq=1.0), product of:
        0.07405486 = queryWeight, product of:
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.07405501 = queryNorm
        0.499999 = fieldWeight in 3, product of:
          1.0 = tf(freq=1.0), with freq of:
            1.0 = termFreq=1.0
          0.999998 = idf(docFreq=500000, maxDocs=500000)
          0.5 = fieldNorm(doc=3)
  0.6666667 = coord(2/3)
Curiously, every indexed document matched; the hits are simply returned in descending score order, so the first one is the document we wanted. The reason is that the QueryParser defaults to OR: the terms "f1" and "hello" occur in every document, while "doc1000" occurs in only one, so doc1000 scores far higher and ranks first.
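If you only want documents that contain all of the query terms, you can switch the parser's default operator from OR to AND. A minimal sketch against the Lucene 4.6 classic QueryParser, reusing the parser setup from the example above:

QueryParser parser = new QueryParser(Version.LUCENE_46, "f1",
        new StandardAnalyzer(Version.LUCENE_46));
// With the default OR operator, every document containing "f1" or "hello" matches.
// AND narrows the match to documents containing all three terms.
parser.setDefaultOperator(QueryParser.AND_OPERATOR);
Query query = parser.parse("f1 hello doc1000"); // now only doc1000 should match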
2. search(Query query, Collector results)
Before introducing this method, let's first get to know Collector:
Collectors are primarily meant to be used to gather raw results from a search, and implement sorting or custom result filtering, collation, etc.
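To make that contract concrete, here is a minimal custom Collector sketch against the Lucene 4.x API (the class name and the printing behavior are my own illustration, not part of the original post); it simply reports every matching docID with its score:

import java.io.IOException;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.Scorer;

public class PrintingCollector extends Collector {
    private Scorer scorer;
    private int docBase;

    @Override
    public void setScorer(Scorer scorer) throws IOException {
        this.scorer = scorer;
    }

    @Override
    public void collect(int doc) throws IOException {
        // doc is segment-relative; add docBase to get the index-wide docID.
        System.out.println("doc=" + (docBase + doc) + " score=" + scorer.score());
    }

    @Override
    public void setNextReader(AtomicReaderContext context) throws IOException {
        this.docBase = context.docBase;
    }

    @Override
    public boolean acceptsDocsOutOfOrder() {
        return true; // we don't need docIDs in order while collecting
    }
}

It would be used as searcher.search(query, new PrintingCollector());.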
Focus first on TopScoreDocCollector:
TopScoreDocCollector is a concrete subclass of TopDocsCollector and sorts according to score + docID. This is used internally by the IndexSearcher search methods that do not take an explicit Sort. It is likely the most frequently used collector.
It is the most commonly used Collector subclass, and it sorts by relevance (score) by default. The example below uses TopScoreDocCollector to collect results and adds simple pagination.
import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class IndexSearchDemo {

    public static void main(String[] args) {
        new IndexSearchDemo().searcher("f1 hello doc1000", 10, 10);
    }

    public void searcher(String queryString, int start, int howMany) {
        try {
            FSDirectory dir = FSDirectory.open(new File("d:\\20140303index"));
            IndexReader reader = DirectoryReader.open(dir);
            IndexSearcher searcher = new IndexSearcher(reader);
            QueryParser parser = new QueryParser(Version.LUCENE_46, "f1",
                    new StandardAnalyzer(Version.LUCENE_46));
            Query query = parser.parse(queryString);
            // The collector must keep enough hits to cover the requested page.
            int hm = start + howMany;
            TopScoreDocCollector res = TopScoreDocCollector.create(hm, false);
            // Before search() runs, the collector has seen nothing: this prints 0.
            System.out.println("total hits : " + res.getTotalHits());
            searcher.search(query, res);
            System.out.println("total hits : " + res.getTotalHits());
            // Note: pagination is controlled here.
            TopDocs tds = res.topDocs(start, howMany);
            ScoreDoc[] docs = tds.scoreDocs;
            for (ScoreDoc doc : docs) {
                System.out.println(reader.document(doc.doc));
            }
            reader.close();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}
I have to admit this is my first encounter with Collector, and I don't fully understand it yet, but here is my current reading of the pagination. In TopScoreDocCollector.create(hm, false), hm is the number of hits the collector keeps during searcher.search; the second argument (docsScoredInOrder) declares whether the scorer will deliver documents in increasing docID order. TopDocs tds = res.topDocs(start, howMany) then slices one page out of the collected hits. So this looks like a pagination API on top of a single collection pass; whether any deeper optimization happens underneath is unclear to me.
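Based on that reading, pagination can be wrapped in a small helper. This is only a sketch under my understanding of the API (the method name page is mine); note that the collector still has to gather pageNo * pageSize hits, so deep pages get progressively more expensive:

// Hypothetical helper: fetch page pageNo (1-based) of pageSize hits.
public TopDocs page(IndexSearcher searcher, Query query, int pageNo, int pageSize)
        throws IOException {
    // Collect enough hits to cover everything up to the requested page;
    // Lucene has no cursor, so each page re-collects from the top.
    TopScoreDocCollector collector = TopScoreDocCollector.create(pageNo * pageSize, false);
    searcher.search(query, collector);
    // Slice out just the requested page.
    return collector.topDocs((pageNo - 1) * pageSize, pageSize);
}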
3. search(Query query, Filter filter, int n, Sort sort)
Returns the top n hits for query after applying filter (null means no filtering), ordered by sort.
public void searcher(String queryString) {
    try {
        FSDirectory dir = FSDirectory.open(new File("d:\\20140303index"));
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser(Version.LUCENE_46, "f1",
                new StandardAnalyzer(Version.LUCENE_46));
        Query query = parser.parse(queryString);
        // Sort on field f1, treating its values as ints.
        SortField sf = new SortField("f1", SortField.Type.INT);
        Sort sort = new Sort(sf);
        // null filter means no filtering; take the top 5 under the given sort.
        TopDocs tds = searcher.search(query, null, 5, sort);
        System.out.println(tds.totalHits);
        ScoreDoc[] docs = tds.scoreDocs;
        for (ScoreDoc doc : docs) {
            System.out.println(reader.document(doc.doc));
        }
        reader.close();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    }
}
Output (note that f1 is a tokenized text field, so a Type.INT sort cannot interpret its terms as numbers; the hits effectively come back in docID order, which is presumably why doc0 through doc4 appear):
500000
Document<stored,indexed,tokenized<f1:f1 hello doc0> stored,indexed,tokenized<f2:f2 world doc0>>
Document<stored,indexed,tokenized<f1:f1 hello doc1> stored,indexed,tokenized<f2:f2 world doc1>>
Document<stored,indexed,tokenized<f1:f1 hello doc2> stored,indexed,tokenized<f2:f2 world doc2>>
Document<stored,indexed,tokenized<f1:f1 hello doc3> stored,indexed,tokenized<f2:f2 world doc3>>
Document<stored,indexed,tokenized<f1:f1 hello doc4> stored,indexed,tokenized<f2:f2 world doc4>>
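For a sort that is actually meaningful, the field needs to be indexed numerically. A minimal sketch, assuming we add a hypothetical IntField named "num" at index time (this field does not exist in the original index):

// At index time, inside the document loop (org.apache.lucene.document.IntField):
doc.add(new IntField("num", i, Field.Store.YES));

// At search time, sort by that numeric field, largest value first:
SortField sf = new SortField("num", SortField.Type.INT, true); // true = descending
TopDocs tds = searcher.search(query, null, 5, new Sort(sf));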