Lucene支持term查询(TermQuery)、布尔查询(BooleanQuery)、短语查询(PhraseQuery)、范围查询(RangeQuery)、前缀查询(PrefixQuery)、模糊查询(FuzzyQuery)等。Lucene的布尔查询又包括求交查询、求并查询和求差查询。在此以求交查询为例,说明Lucene的句子查询。
测试程序说明:
使用Lucene的求交的布尔查询。为支持中文分词,使用JE的MMAnalyzer。使用文档1,文档2,query1,query2,query3,query4进行测试:
核心代码及测试结果:
// Excerpt of the test driver: parse each raw query with an AND-by-default
// QueryParser, print the parsed form and report how many documents matched.
// NOTE: `dir` is assumed to be an already-opened Lucene Directory for the index.
String[] searchWords = {"金融时报欧元区问题","美国华尔街 评论美元" ,"分析欧元"};
IndexSearcher indexSearcher = new IndexSearcher(dir);
Analyzer analyzer = new MMAnalyzer();
QueryParser qp = new QueryParser("contents", analyzer);
// QueryParser combines clauses with OR by default; switch it to AND (intersection).
qp.setDefaultOperator(QueryParser.AND_OPERATOR);
for (int i = 0; i < searchWords.length; i++) {
    // Declare the query locally so the excerpt is self-contained.
    Query query = qp.parse(searchWords[i]);
    // Print the parse result.
    System.out.println(query.toString());
    Hits results = indexSearcher.search(query);
    // Report the hit count so "has results / no results" is visible in the output.
    System.out.println(results.length() + " search results for query " + searchWords[i]);
}
// Release the index reader held by the searcher.
indexSearcher.close();
Ø 文档1:美国华尔街日报评论文章指出,这次六国央行联手放低美元换汇利率,是起源于今年9月份,联储官员与华尔街对冲基金高管们闭门会议后的延续动作。http://url.cn/1W7NrE
查询分析 1:
Query1:“美国华尔街评论文章”
Parse结果:contents:"美国 华尔街 评论 文章"
查询结果:无结果
结论:取词“美国华尔街评论文章”的拉链为空
查询分析2:
Query2:“美国华尔街 评论美元”
Parse结果:+contents:"美国 华尔街" +contents:"评论 美元"
查询结果:无结果
结论:取词“美国华尔街”的拉链不为空,取词“评论美元”的拉链为空
Ø 文档2:金融 路透社分析指出,在全球经济都处于脆弱的状态下,如果12月九日的高峰会仍然是因政治博弈而没实质性消息出台的话,欧元存活会受到极大威胁,全球股市会大幅震荡。
查询分析1:
Query3:“分析 欧元”
Parse结果:+contents:分析 +contents:欧元
查询结果:有结果
结论:取词“分析”和“欧元”的拉链都不为空
查询分析2:
Query4:“分析欧元”
Parse结果:contents:"分析 欧元"
查询结果:无结果
结论:取词“分析欧元”的拉链为空
测试结论:
在布尔查询的求交查询中,QueryParser的parse方法只把query中以空格分隔的片段当作求交的基本单位:每个片段经分词后被当作一个整体,而不会被拆成若干相互独立的基本term再求交。例如上例中Parse结果:contents:"美国 华尔街 评论 文章",其表明contents后的内容是一个整体(即短语查询PhraseQuery,要求这4个词按顺序相邻出现),而不是4个各自独立的term。
改进方法:
对原始查询重新构造新的查询,将query=“美国华尔街评论文章”,先构造为“美国 华尔街 评论 文章”,再去查询,以下的代码详细地说明了此构造过程:
package search;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
public class Boolean_search {

    /** Name of the indexed field that is searched and used when analysing query text. */
    private static final String FIELD = "contents";

    /** Index directory used when no path is supplied on the command line. */
    private static final String DEFAULT_INDEX_PATH = "D://lucene/index";

    /**
     * Demonstrates an AND (intersection) boolean search over a Lucene index.
     * Each raw query is first segmented into space-separated terms, the rebuilt
     * query string is parsed with an AND-by-default {@link QueryParser}, and the
     * score explanation of every hit is printed.
     *
     * @param args optional; {@code args[0]} overrides the default index directory
     */
    public static void main(String[] args) {
        String indexPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_PATH;
        File indexDir = new File(indexPath);
        if (!indexDir.exists()) {
            System.out.println("The Lucene index is not exist");
            return;
        }

        Directory dir = null;
        IndexSearcher indexSearcher = null;
        try {
            // Open the index and build the searcher.
            dir = FSDirectory.getDirectory(indexDir, false);
            indexSearcher = new IndexSearcher(dir);

            // Build the tokenizer (analyzer) and the query parser.
            Analyzer analyzer = new MMAnalyzer();
            QueryParser qp = new QueryParser(FIELD, analyzer);
            // QueryParser combines clauses with OR by default; switch it to AND.
            qp.setDefaultOperator(QueryParser.AND_OPERATOR);

            // Raw queries to run.
            String[] searchWords = {"金融时报欧元区问题","美国华尔街 评论美元" ,"分析,欧元"};

            for (int i = 0; i < searchWords.length; i++) {
                // Rebuild the query as space-separated terms. Two strategies exist:
                //   1) segment with the analyzer directly (used here);
                //   2) getTermsByQueryParser(qp, searchWords[i]).
                String queryWords = getTermsByAnalyzer(analyzer, searchWords[i]);

                // Nothing survived analysis (e.g. only punctuation): parsing an
                // empty string would fail, so report it and move on.
                if (queryWords.trim().length() == 0) {
                    System.out.println("query is empty after analysis, skipped: " + searchWords[i]);
                    continue;
                }

                // Run the intersection query.
                Query query = qp.parse(queryWords);
                Hits results = indexSearcher.search(query);
                System.out.println(results.length() + " search results for query " + searchWords[i]);
                System.out.println("query is parsed as:" + queryWords);

                // Print the detailed score explanation of every matching document.
                for (int k = 0; k < results.length(); k++) {
                    System.out.println(indexSearcher.explain(query, results.id(k)).toString());
                }
            }
        } catch (CorruptIndexException e) {
            System.err.println("Lucene index is corrupt: " + indexDir);
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("I/O error while accessing the index: " + indexDir);
            e.printStackTrace();
        } catch (ParseException e) {
            System.err.println("Rebuilt query could not be parsed");
            e.printStackTrace();
        } catch (Exception e) {
            // getTermsByAnalyzer declares a bare Exception; main is the boundary.
            e.printStackTrace();
        } finally {
            closeResources(indexSearcher, dir);
        }
    }

    /**
     * Builds a new query string by segmenting the raw text with the analyzer.
     * Every token is printed and appended to the result followed by one space,
     * so a non-empty result ends with a trailing space.
     *
     * @param analyzer analyzer used to segment the text
     * @param s raw query text
     * @return the tokens joined by spaces (each followed by a space), or an
     *         empty string when the analyzer produces no tokens
     * @throws Exception if the token stream cannot be read or closed
     */
    public static String getTermsByAnalyzer(Analyzer analyzer, String s) throws Exception {
        StringBuilder queryWord = new StringBuilder();
        // The first argument is the field name, not the text being analysed.
        TokenStream ts = analyzer.tokenStream(FIELD, new StringReader(s));
        try {
            // Show the segmentation result.
            System.out.println("分词器分词结果如下:");
            for (Token t = ts.next(); t != null; t = ts.next()) {
                String term = t.termText();
                queryWord.append(term).append(' ');
                System.out.println(term);
            }
        } finally {
            ts.close();
        }
        return queryWord.toString();
    }

    /**
     * Builds a new query string by letting the query parser analyse each
     * space-separated fragment of the raw text and then flattening the parsed
     * form (a single term or a quoted phrase) back into space-separated terms.
     * Empty fragments, fragments that fail to parse and fragments that parse to
     * nothing are skipped.
     *
     * @param qp query parser used to analyse each fragment
     * @param s raw query text
     * @return the terms joined by spaces (each fragment followed by a space)
     */
    public static String getTermsByQueryParser(QueryParser qp, String s) {
        StringBuilder queryWord = new StringBuilder();
        // Parsed queries print as "<field>:term" or "<field>:\"t1 t2\"".
        String fieldPrefix = qp.getField() + ":";

        // 1. Split the raw text on spaces to get the query fragments.
        String[] fragments = s.split(" ");
        for (int j = 0; j < fragments.length; j++) {
            if (fragments[j].length() == 0) {
                continue; // produced by consecutive spaces
            }

            // 2. Let the parser segment the fragment.
            Query parsed;
            try {
                parsed = qp.parse(fragments[j]);
            } catch (ParseException e) {
                // Skip this fragment instead of reusing a stale or null query.
                System.err.println("Skipping unparsable fragment \"" + fragments[j] + "\": " + e.getMessage());
                continue;
            }

            // 3. Strip the field prefix and the phrase quotes, if present.
            String text = (parsed == null) ? "" : parsed.toString();
            if (text.startsWith(fieldPrefix)) {
                text = text.substring(fieldPrefix.length());
            }
            if (text.length() >= 2 && text.charAt(0) == '"' && text.charAt(text.length() - 1) == '"') {
                text = text.substring(1, text.length() - 1);
            }
            if (text.length() > 0) {
                queryWord.append(text).append(' ');
            }
        }
        return queryWord.toString();
    }

    /** Closes the searcher and the directory, reporting (not propagating) close failures. */
    private static void closeResources(IndexSearcher searcher, Directory dir) {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                System.err.println("Failed to close IndexSearcher: " + e.getMessage());
            }
        }
        if (dir != null) {
            try {
                dir.close();
            } catch (IOException e) {
                System.err.println("Failed to close Directory: " + e.getMessage());
            }
        }
    }
}
运行结果如下:
分词器分词结果如下:
金融
时报
欧元区
问题
1 search results for query 金融时报欧元区问题
query is parsed as:金融 时报 欧元区 问题
0.8425362 = (MATCH) sum of:
0.22584343 = (MATCH) weight(contents:金融 in 5), product of:
0.51773727 = queryWeight(contents:金融), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.18545197 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:金融 in 5), product of:
1.0 = tf(termFreq(contents:金融)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=5)
0.22584343 = (MATCH) weight(contents:时报 in 5), product of:
0.51773727 = queryWeight(contents:时报), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.18545197 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:时报 in 5), product of:
1.0 = tf(termFreq(contents:时报)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=5)
0.16500592 = (MATCH) weight(contents:欧元区 in 5), product of:
0.442543 = queryWeight(contents:欧元区), product of:
2.3862944 = idf(docFreq=2, numDocs=12)
0.18545197 = queryNorm
0.3728585 = (MATCH) fieldWeight(contents:欧元区 in 5), product of:
1.0 = tf(termFreq(contents:欧元区)=1)
2.3862944 = idf(docFreq=2, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=5)
0.22584343 = (MATCH) weight(contents:问题 in 5), product of:
0.51773727 = queryWeight(contents:问题), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.18545197 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:问题 in 5), product of:
1.0 = tf(termFreq(contents:问题)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=5)
分词器分词结果如下:
美国
华尔街
评论
美元
1 search results for query 美国华尔街 评论美元
query is parsed as:美国 华尔街 评论 美元
0.68775237 = (MATCH) sum of:
0.12486125 = (MATCH) weight(contents:美国 in 11), product of:
0.42608652 = queryWeight(contents:美国), product of:
1.8754687 = idf(docFreq=4, numDocs=12)
0.22718935 = queryNorm
0.293042 = (MATCH) fieldWeight(contents:美国 in 11), product of:
1.0 = tf(termFreq(contents:美国)=1)
1.8754687 = idf(docFreq=4, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=11)
0.20214175 = (MATCH) weight(contents:华尔街 in 11), product of:
0.54214066 = queryWeight(contents:华尔街), product of:
2.3862944 = idf(docFreq=2, numDocs=12)
0.22718935 = queryNorm
0.3728585 = (MATCH) fieldWeight(contents:华尔街 in 11), product of:
1.0 = tf(termFreq(contents:华尔街)=1)
2.3862944 = idf(docFreq=2, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=11)
0.27667123 = (MATCH) weight(contents:评论 in 11), product of:
0.63425803 = queryWeight(contents:评论), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.22718935 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:评论 in 11), product of:
1.0 = tf(termFreq(contents:评论)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=11)
0.08407816 = (MATCH) weight(contents:美元 in 11), product of:
0.3496436 = queryWeight(contents:美元), product of:
1.5389965 = idf(docFreq=6, numDocs=12)
0.22718935 = queryNorm
0.2404682 = (MATCH) fieldWeight(contents:美元 in 11), product of:
1.0 = tf(termFreq(contents:美元)=1)
1.5389965 = idf(docFreq=6, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=11)
分词器分词结果如下:
分析
欧元
1 search results for query 分析,欧元
query is parsed as:分析 欧元
0.57385075 = (MATCH) sum of:
0.24226412 = (MATCH) weight(contents:分析 in 0), product of:
0.64974815 = queryWeight(contents:分析), product of:
2.3862944 = idf(docFreq=2, numDocs=12)
0.27228332 = queryNorm
0.3728585 = (MATCH) fieldWeight(contents:分析 in 0), product of:
1.0 = tf(termFreq(contents:分析)=1)
2.3862944 = idf(docFreq=2, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=0)
0.33158666 = (MATCH) weight(contents:欧元 in 0), product of:
0.76014954 = queryWeight(contents:欧元), product of:
2.7917595 = idf(docFreq=1, numDocs=12)
0.27228332 = queryNorm
0.43621242 = (MATCH) fieldWeight(contents:欧元 in 0), product of:
1.0 = tf(termFreq(contents:欧元)=1)
2.7917595 = idf(docFreq=1, numDocs=12)
0.15625 = fieldNorm(field=contents, doc=0)