Lucene搜索功能

Lucene搜索功能

实现搜索功能,主要的API

这里写图片描述
示例:

/**
 * Runs the query expression {@code q} against the index stored under {@code indexDir}
 * and prints the "fullpath" field of each of the (at most 10) best-scoring documents.
 * A one-line summary (hit count and elapsed search time) is written to stderr first.
 *
 * @param indexDir path of the directory that holds the Lucene index
 * @param q        query expression; terms without a field prefix go to the "contents" field
 * @throws IOException    if the index cannot be opened or read
 * @throws ParseException if {@code q} is not a valid query expression
 */
public static void search(String indexDir, String q) throws IOException, ParseException {
    Directory dir = FSDirectory.open(new File(indexDir));
    try {
        IndexSearcher is = new IndexSearcher(dir);
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", new StandardAnalyzer(Version.LUCENE_30));
            Query query = parser.parse(q);

            // Only the search call itself is timed, not parsing or document loading.
            long start = System.currentTimeMillis();
            TopDocs hits = is.search(query, 10);
            long end = System.currentTimeMillis();

            System.err.println("Found " + hits.totalHits + " document(s) (in " + (end - start) + " milliseconds) that matched query '" + q + "':");
            for (ScoreDoc scoreDoc : hits.scoreDocs) {
                Document doc = is.doc(scoreDoc.doc);
                System.out.println(doc.get("fullpath"));
            }
        } finally {
            // Release the searcher even when parsing or searching fails.
            is.close();
        }
    } finally {
        dir.close();
    }
}

近实时搜索

Lucene提供了IndexReader.reopen()方法来支持近实时搜索。这里的“实时”是相对的,并不能严格做到完全实时。需要注意的是,在JVM内存占用较多的时候,IndexReader.reopen()会比较慢。
示例:

package com.lucene._3_3;

import junit.framework.TestCase;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;

import java.io.IOException;

/**
 * Demonstrates near-real-time search: a reader obtained from a live {@link IndexWriter}
 * is reopened after further changes and the new reader is searched, with no
 * {@code commit()} call in between.
 *
 * Created by xun.zhang on 2017/10/30.
 */
public class NearRealTimeTest extends TestCase {

    /**
     * Builds a document with a non-analyzed "id" field and an analyzed "text" field;
     * neither field is stored.
     */
    private static Document buildDoc(String id, String text) {
        Document built = new Document();
        built.add(new Field("id", id, Field.Store.NO, Field.Index.NOT_ANALYZED_NO_NORMS));
        built.add(new Field("text", text, Field.Store.NO, Field.Index.ANALYZED));
        return built;
    }

    /**
     * Indexes ten "aaa" documents, then deletes id 7 and adds one "bbb" document,
     * and checks that a reopened reader reports 9 and 1 hits respectively.
     */
    public void testNearReadTime() throws IOException {
        Directory ramDir = new RAMDirectory();
        IndexWriter indexWriter = new IndexWriter(
                ramDir, new StandardAnalyzer(Version.LUCENE_30), IndexWriter.MaxFieldLength.UNLIMITED);

        int nextId = 0;
        while (nextId < 10) {
            indexWriter.addDocument(buildDoc(String.valueOf(nextId), "aaa"));
            nextId++;
        }

        // Reader taken straight from the writer.
        IndexReader initialReader = indexWriter.getReader();
        IndexSearcher indexSearcher = new IndexSearcher(initialReader);

        Query textQuery = new TermQuery(new Term("text", "aaa"));
        TopDocs firstPass = indexSearcher.search(textQuery, 1);
        assertEquals(10, firstPass.totalHits);

        // Change the index after the first reader was obtained.
        indexWriter.deleteDocuments(new Term("id", "7"));
        indexWriter.addDocument(buildDoc("11", "bbb"));

        IndexReader refreshedReader = initialReader.reopen();
        assertTrue(initialReader != refreshedReader);
        initialReader.close();
        indexSearcher = new IndexSearcher(refreshedReader);

        TopDocs secondPass = indexSearcher.search(textQuery, 10);
        assertEquals(9, secondPass.totalHits);

        textQuery = new TermQuery(new Term("text", "bbb"));
        secondPass = indexSearcher.search(textQuery, 1);
        assertEquals(1, secondPass.totalHits);

        refreshedReader.close();
        indexWriter.close();
    }

}

评分机制

Lucene的评分公式

这里写图片描述
有一位网友的博客写得非常好,这里引用一下:Lucene评分公式

explain()查看评分结果

package com.lucene._3_4;

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

import java.io.File;
import java.io.IOException;

/**
 * Command-line tool that runs a query against an index and prints, for each of the
 * top 10 hits, the stored "filename" field followed by the scoring explanation.
 *
 * Created by xun.zhang on 2017/10/30.
 */
public class Explainer {

    /**
     * @param args {@code args[0]} is the index directory, {@code args[1]} the query
     *             expression (parsed against the default field "contents")
     * @throws IOException    if the index cannot be opened or read
     * @throws ParseException if the query expression is not valid
     */
    public static void main(String[] args) throws IOException, ParseException {
        if (args.length != 2) {
            // println so the usage text ends with a line break on the terminal
            System.err.println("Usage: Explainer <index dir> <query>");
            System.exit(1);
        }

        String indexDir = args[0];
        String queryExpression = args[1];

        Directory directory = FSDirectory.open(new File(indexDir));
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_30, "contents", new SimpleAnalyzer());

            Query query = parser.parse(queryExpression);
            System.out.println("Query: " + queryExpression);

            IndexSearcher searcher = new IndexSearcher(directory);
            try {
                TopDocs topDocs = searcher.search(query, 10);

                for (ScoreDoc match : topDocs.scoreDocs) {
                    Explanation explanation = searcher.explain(query, match.doc);
                    System.out.println("-----------------");
                    Document doc = searcher.doc(match.doc);
                    System.out.println(doc.get("filename"));
                    System.out.println(explanation.toString());
                }
            } finally {
                // Close the searcher even when search/explain throws.
                searcher.close();
            }
        } finally {
            // Close the directory even when parsing or opening the searcher throws.
            directory.close();
        }
    }

}

多个Query

TermQuery类(通过项进行搜索)

对索引中特定项进行搜索是最基本的搜索方式。Term是最小的索引片段,每个Term包含了一个域名和一个文本值。

Term和Field的对照关系:
这里写图片描述

/**
 * Looks up a single document by an exact term in the "isbn" field and expects
 * exactly one hit.
 *
 * NOTE(review): nothing is indexed into the directory within this snippet, so the
 * expected hit cannot come from here. Presumably the real test opens a pre-built
 * book index; confirm.
 */
public void testKeyword() throws IOException {
    Directory directory = new RAMDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(directory);

    // A TermQuery matches the term exactly as given; no analysis is applied to it.
    Query isbnQuery = new TermQuery(new Term("isbn", "987577656789"));
    TopDocs hits = indexSearcher.search(isbnQuery, 10);
    assertEquals("JUnit in Action, Second Edition", 1, hits.totalHits);

    indexSearcher.close();
    directory.close();
}

TermRangeQuery(指定范围内搜索)

索引中各个Term会按照字符进行排序,比如从a-z。TermRangeQuery是通过排序后的字符范围进行搜索。

/**
 * Searches the "title2" field for terms between "d" and "j", both bounds
 * included, and expects three matching documents.
 *
 * NOTE(review): nothing is indexed into the directory within this snippet;
 * presumably the real test opens a pre-built index. Confirm.
 */
public void testTermRangeQuery() throws IOException {
    Directory directory = new RAMDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(directory);

    final boolean includeLower = true;
    final boolean includeUpper = true;
    TermRangeQuery rangeQuery = new TermRangeQuery("title2", "d", "j", includeLower, includeUpper);

    TopDocs hits = indexSearcher.search(rangeQuery, 100);
    assertEquals(3, hits.totalHits);

    indexSearcher.close();
    directory.close();
}

NumericRangeQuery(指定数字范围内搜索)

NumericRangeQuery可以用来搜索通过NumericField进行索引的域,比如按时间查找等。

/**
 * Searches the numeric "pubmonth" field for values from 200605 to 200609, both
 * bounds included, and expects exactly one match.
 *
 * NOTE(review): nothing is indexed into the directory within this snippet;
 * presumably the real test opens a pre-built index. Confirm.
 */
public void testInclusive() throws IOException {
    Directory dir = new RAMDirectory();
    IndexSearcher searcher = new IndexSearcher(dir);
    NumericRangeQuery query = NumericRangeQuery.newIntRange("pubmonth", 200605, 200609, true, true);
    TopDocs matches = searcher.search(query, 10);
    assertEquals(1, matches.totalHits);
    // Close the searcher as well as the directory, as the sibling tests do.
    searcher.close();
    dir.close();
}

PrefixQuery(字符串搜索)

用来搜索包含以指定字符串开头的项的文档。

package com.lucene._3_5;

import junit.framework.TestCase;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

/**
 * Compares a prefix search with an exact-term search on the same "category" term.
 *
 * Created by xun.zhang on 2017/10/31.
 */
public class PrefixQueryTest extends TestCase {

    /**
     * Expects the prefix query to match strictly more documents than the exact
     * term query built from the same term.
     *
     * NOTE(review): nothing is indexed into the directory within this snippet;
     * presumably the real test opens a pre-built index. Confirm.
     */
    public void testPrefix() throws IOException {
        Directory directory = new RAMDirectory();
        IndexSearcher indexSearcher = new IndexSearcher(directory);
        Term categoryTerm = new Term("category", "/technology/computers/programming");

        // Hits whose category term starts with the given path.
        int prefixHits = indexSearcher.search(new PrefixQuery(categoryTerm), 10).totalHits;

        // Hits whose category term equals the given path.
        int exactHits = indexSearcher.search(new TermQuery(categoryTerm), 10).totalHits;

        assertTrue(exactHits < prefixHits);
        indexSearcher.close();
        directory.close();
    }

}

BooleanQuery(组合查询)

使用BooleanQuery可以把各种Query通过逻辑关系AND、OR、NOT组合起来搜索。
逻辑关系常量:
- BooleanClause.Occur.MUST
- BooleanClause.Occur.SHOULD
- BooleanClause.Occur.MUST_NOT

/**
 * Combines a term query on "subject" with a numeric range on "pubmonth" so that
 * a document has to satisfy both clauses (logical AND), then runs the search.
 * The result is not inspected.
 */
public void testAnd() throws IOException {
    BooleanQuery bothRequired = new BooleanQuery();
    // MUST on every clause: all clauses have to match.
    bothRequired.add(new TermQuery(new Term("subject", "search")), BooleanClause.Occur.MUST);
    bothRequired.add(NumericRangeQuery.newIntRange("pubmonth", 201001, 201012, true, true),
            BooleanClause.Occur.MUST);

    Directory directory = new RAMDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(directory);
    indexSearcher.search(bothRequired, 10);

    indexSearcher.close();
    directory.close();
}
/**
 * Combines two "category" term queries so that a document matching either one
 * qualifies (logical OR), runs the search and prints the combined query.
 */
public void testOr() throws IOException {
    BooleanQuery eitherCategory = new BooleanQuery();
    // SHOULD on every clause: at least one clause has to match.
    eitherCategory.add(new TermQuery(new Term("category", "/tech/com/pro/method")),
            BooleanClause.Occur.SHOULD);
    eitherCategory.add(new TermQuery(new Term("category", "/phil/eastern")),
            BooleanClause.Occur.SHOULD);

    Directory directory = new RAMDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(directory);
    indexSearcher.search(eitherCategory, 10);
    System.out.println("or = " + eitherCategory);

    indexSearcher.close();
    directory.close();
}

PhraseQuery(短语搜索)

索引中默认保存了各个项的位置信息,PhraseQuery类会根据这些位置信息定位某个距离范围内的项对应的文档。例如:假设某个域中包含短语“the quick brown fox jumped over the lazy dog”,即使我们不知道这个短语的完整写法,也一样可以查找域中quick和fox相邻或距离很近的文档。这里有一个slop因子,表示为了匹配短语,各个项最多允许移动多少个位置(以项为单位,而不是字符);slop为0时要求各项严格按顺序相邻。

package com.lucene._3_8;

import com.sun.org.apache.xpath.internal.WhitespaceStrippingElementMatcher;
import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.io.IOException;

/**
 * Created by xun.zhang on 2017/10/31.
 */
public class PhraseQueryTest extends TestCase {

    private Directory dir;
    private IndexSearcher searcher;

    protected void setUp() throws IOException {
        dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(),IndexWriter.MaxFieldLength.UNLIMITED);
        Document doc = new Document();
        doc.add(new Field("field","the quick brown fox jumped over the lazy dog",Field.Store.YES,Field.Index.ANALYZED));
        writer.addDocument(doc);
        writer.close();

        searcher = new IndexSearcher(dir);
    }

    protected void tearDown() throws IOException {
        searcher.close();
        dir.close();
    }

    private boolean matched(String[] phrase, int slop) throws IOException {
        PhraseQuery query = new PhraseQuery();
        query.setSlop(slop);
        for (String word:phrase
             ) {
            query.add(new Term("field",word));
        }
        TopDocs matches = searcher.search(query,10);
        return matches.totalHits > 0;
    }

}

这里写图片描述

WildcardQuery(通配符搜索)

两个通配符:
- *,表示0个或多个任意字符
- ?,表示任意单个字符

package com.lucene._3_9;

import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.io.File;
import java.io.IOException;

/**
 * Created by xun.zhang on 2017/10/31.
 */
public class WildcardQueryTest extends TestCase {

    private static Directory dir = new RAMDirectory();

    private void indexSingleFieldDocs(Field[] fields) throws IOException {
        IndexWriter writer = new IndexWriter(dir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
        for (Field f : fields
                ) {
            Document doc = new Document();
            doc.add(f);
            writer.addDocument(doc);
        }
        writer.optimize();
        writer.close();
    }

    public void testWildcard() throws IOException {
        indexSingleFieldDocs(new Field[]{new Field("contents", "wild", Field.Store.YES, Field.Index.ANALYZED),
                new Field("contents", "child", Field.Store.YES, Field.Index.ANALYZED),
                new Field("contents", "mild", Field.Store.YES, Field.Index.ANALYZED),
                new Field("contents", "mildew", Field.Store.YES, Field.Index.ANALYZED)});

        IndexSearcher searcher = new IndexSearcher(dir);
        Query query = new WildcardQuery(new Term("contents","?ild"));
        TopDocs docs = searcher.search(query,10);
        searcher.close();
    }

}

FuzzyQuery(搜索类似项)

FuzzyQuery用来解决相似字搜索的问题,使用Levenshtein距离算法。

/**
 * Indexes "fuzzy" and "wuzzy", searches for the misspelled term "wuzza" and
 * expects both documents to match with different scores, "wuzzy" ranking first.
 */
public void testFuzzy() throws IOException {
    Field[] sampleFields = {
            new Field("contents", "fuzzy", Field.Store.YES, Field.Index.ANALYZED),
            new Field("contents", "wuzzy", Field.Store.YES, Field.Index.ANALYZED)
    };
    indexSingleFieldDocs(sampleFields);

    IndexSearcher indexSearcher = new IndexSearcher(dir);
    TopDocs hits = indexSearcher.search(new FuzzyQuery(new Term("contents", "wuzza")), 10);
    assertEquals("both close enough", 2, hits.totalHits);

    float topScore = hits.scoreDocs[0].score;
    float runnerUpScore = hits.scoreDocs[1].score;
    assertTrue("wuzzy closer than fuzzy", topScore != runnerUpScore);

    // The best hit must be the document holding "wuzzy".
    Document best = indexSearcher.doc(hits.scoreDocs[0].doc);
    assertEquals("wuzza bear", "wuzzy", best.get("contents"));
    indexSearcher.close();
}

MatchAllDocsQuery(匹配所有文档)

MatchAllDocsQuery,顾名思义,就是匹配索引中所有的文档。

QueryParser类,及其表达式

尽管使用API创建的Query很强大,不过看起来仍然感觉很笨拙。Lucene提供了QueryParser类来分析传入的参数,自动创建多个Query的组合查询。

QueryParser处理的表达式范例:

  • java,默认域包含java项的文档
  • java junit(java OR junit),默认域包含java和junit中一个或两个的文档
  • +java +junit(java AND junit),默认域中同时包含java和junit文档
  • title:ant,title域中包含ant项的文档
  • title:extreme -subject:sports(title:extreme AND NOT subject:sports),title域中包含extreme且subject域中不包含sports的文档
  • (agile OR extreme) AND methodology,默认域中包含methodology且包含agile和extreme中的一个或两个的文档
  • title:"junit in action",title域为junit in action的文档
  • title:"junit action"~5,title域中junit和action之间的距离不超过5个位置的文档
  • java*,包含由java开头的项的文档
  • java~,包含与单词java相近的项的文档,如lava
  • lastmodified:[1/1/09 TO 12/31/09],lastmodified域值在2009年1月1号和2009年12月31号之间的文档

注意:如果要搜索的内容本身含有这些字符 \ + - ! ( ) : ^ [ ] { } ~ * ? 时,需要使用转义符:\

Query.toString方法

Query.toString方法可以把Query翻译成搜索表达式。

/**
 * Checks the textual form of a boolean query made of a required fuzzy clause on
 * "field" and an optional term clause on "title", rendered with "field" as the
 * default field.
 */
public void testToString() {
    FuzzyQuery requiredFuzzy = new FuzzyQuery(new Term("field", "kountry"));
    TermQuery optionalTerm = new TermQuery(new Term("title", "western"));

    BooleanQuery combined = new BooleanQuery();
    combined.add(requiredFuzzy, BooleanClause.Occur.MUST);
    combined.add(optionalTerm, BooleanClause.Occur.SHOULD);

    String rendered = combined.toString("field");
    assertEquals("both kinds", "+kountry~0.5 title:western", rendered);
}

TermQuery单项查询

/**
 * Parses the single word "computers" with "subject" as the default field and
 * prints the resulting query.
 */
public void testTermQuery() throws ParseException {
    QueryParser subjectParser =
            new QueryParser(Version.LUCENE_30, "subject", new StandardAnalyzer(Version.LUCENE_30));
    Query parsed = subjectParser.parse("computers");
    System.out.println("term: " + parsed);
}

项范围查询

/**
 * Verifies that the bracketed range expression "title2:[Q TO V]" is parsed into
 * a {@code TermRangeQuery}.
 *
 * No index is involved: the test only inspects the type of the parsed query.
 * {@code IOException} stays in the signature for compatibility.
 */
public void testTermRangeQuery() throws ParseException, IOException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "subject", analyzer);
    Query query = parser.parse("title2:[Q TO V]");
    assertTrue(query instanceof TermRangeQuery);
}

前缀查询和通配符查询

/**
 * Shows that the parser lowercases prefix/wildcard terms by default and that
 * {@code setLowercaseExpandedTerms(false)} turns this off.
 */
public void testLowercasing() throws ParseException {
    // Use the same field for parsing and rendering: toString(field) omits the
    // "name:" prefix only for the field it is given, so a mismatch would add
    // "subject:" to the output and break both comparisons.
    final String defaultField = "subject";

    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    QueryParser parser = new QueryParser(Version.LUCENE_30, defaultField, analyzer);
    Query query = parser.parse("PrefixQuery*");
    assertEquals("lowercased", "prefixquery*", query.toString(defaultField));

    parser.setLowercaseExpandedTerms(false);
    query = parser.parse("PrefixQuery*");
    assertEquals("not lowercased", "PrefixQuery*", query.toString(defaultField));
}

设置默认布尔操作符

// Build a parser whose default field is "subject".
StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        QueryParser parser = new QueryParser(Version.LUCENE_30, "subject", analyzer);
        // Make AND the operator applied between terms that carry no explicit operator.
        parser.setDefaultOperator(QueryParser.AND_OPERATOR);

短语查询

/**
 * Checks how quoted text is parsed: a multi-word phrase is analyzed term by term
 * (the expected rendering shows "?" in place of the first two words), while a
 * quoted single word is reduced to a plain {@code TermQuery}.
 */
public void testPhraseQuery() throws ParseException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "field", analyzer);
    Query query = parser.parse("\"This is Some Phrase\"");

    assertEquals("analyzed", "\"? ? some phrase\"", query.toString("field"));

    // The TermQuery reduction only applies to a quoted single term, so a separate
    // expression is parsed for it; the phrase above cannot satisfy both checks.
    query = parser.parse("\"term\"");
    assertTrue("reduced to TermQuery", query instanceof TermQuery);
}
/**
 * Checks that a quoted phrase has no slop by default and that
 * {@code setPhraseSlop(5)} makes later phrases render with "~5".
 */
public void testSlop() throws ParseException {
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    QueryParser parser = new QueryParser(Version.LUCENE_30, "field", analyzer);
    Query query = parser.parse("\"exact phrase\"");
    assertEquals("zero slop", "\"exact phrase\"", query.toString("field"));

    parser.setPhraseSlop(5);
    query = parser.parse("\"sloppy phrase\"");
    // Render with the default field, as above; without it the output would
    // start with "field:" and never equal the expected text.
    assertEquals("sloppy phrase", "\"sloppy phrase\"~5", query.toString("field"));
}

模糊查询

/**
 * Parses two fuzzy expressions, one with the bare "~" suffix and one with an
 * explicit 0.7 after it, and prints the resulting queries.
 */
public void testFuzzyQuery() throws ParseException {
    QueryParser subjectParser =
            new QueryParser(Version.LUCENE_30, "subject", new StandardAnalyzer(Version.LUCENE_30));

    Query defaultSimilarity = subjectParser.parse("kountry~");
    System.out.println("fuzzy: " + defaultSimilarity);

    Query explicitSimilarity = subjectParser.parse("kountry~0.7");
    System.out.println("fuzzy 2: " + explicitSimilarity);
}

分组查询

/**
 * Parses an expression that uses parentheses to group an OR inside an AND and
 * runs it. The search result is not inspected.
 *
 * NOTE(review): nothing is indexed into the directory within this snippet;
 * presumably the real test opens a pre-built index. Confirm.
 */
public void testGrouping() throws ParseException, IOException {
    Directory dir = new RAMDirectory();
    try {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        QueryParser parser = new QueryParser(Version.LUCENE_30, "subject", analyzer);
        Query query = parser.parse("(agile OR extreme) AND methodology");
        IndexSearcher searcher = new IndexSearcher(dir);
        try {
            searcher.search(query, 10);
        } finally {
            // Release the searcher, as the sibling tests do.
            searcher.close();
        }
    } finally {
        dir.close();
    }
}

为子查询加权

比如有个表达式如下:

// "^2.0" attaches a boost of 2.0 to the junit term; "testing" carries no explicit boost.
Query query = parser.parse("junit^2.0 testing");

通过QueryParser分析后,会生成两个TermQuery的OR组合查询,其中junit的TermQuery的权重为2.0,testing的TermQuery的权重为1.0。

参考:
Lucene实战(第2版)著:Michael McCandless、Erik Hatcher、Otis Gospodnetic 译:牛长流、肖宇

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值