LuceneInAction-多样化查询

通过项进行搜索:TermQuery类

通过匹配isbn来检索某个文档:

import lia.common.TestUtil;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;

import static junit.framework.Assert.assertEquals;

// Demonstrates exact-term search with TermQuery: looks up one book by ISBN.
public class TermQueryTest {
    public void testKeyword() throws Exception {
        Directory dir = TestUtil.getBookIndexDirectory();
        IndexSearcher searcher = new IndexSearcher(dir);

        // An ISBN uniquely identifies a single book, so exactly one hit
        // is expected for this query.
        Query query = new TermQuery(new Term("isbn", "9781935182023"));
        TopDocs docs = searcher.search(query, 10);
        assertEquals("JUnit in Action, Second Edition",
                1, docs.totalHits);

        searcher.close();
        dir.close();
    }
}

在指定的项范围内搜索:TermRangeQuery类

搜索起始字母范围从d到j的书籍标题:

import junit.framework.TestCase;

import lia.common.TestUtil;

import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.store.Directory;

// From chapter 3
// From chapter 3
// Verifies that an inclusive term range over the "title2" field matches
// exactly the books whose titles begin with letters d through j.
public class TermRangeQueryTest extends TestCase {
  public void testTermRangeQuery() throws Exception {
    Directory index = TestUtil.getBookIndexDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(index);

    // Last two booleans make both endpoints ("d" and "j") inclusive.
    TermRangeQuery rangeQuery =
        new TermRangeQuery("title2", "d", "j", true, true);

    TopDocs hits = indexSearcher.search(rangeQuery, 100);
    assertEquals(3, hits.totalHits);

    indexSearcher.close();
    index.close();
  }
}

在指定的数字范围内搜索:NumericRangeQuery类

出版月份在200605到200609之间:

import junit.framework.TestCase;

import lia.common.TestUtil;

import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.store.Directory;

// From chapter 3
// From chapter 3
// Exercises NumericRangeQuery over the numeric "pubmonth" field.
// "Tao Te Ching" in the test index was published in September 2006 (200609),
// so it sits exactly on the upper bound of the queried range.
public class NumericRangeQueryTest extends TestCase {
  public void testInclusive() throws Exception {
    Directory index = TestUtil.getBookIndexDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(index);

    // Inclusive bounds: 200609 itself falls inside [200605, 200609].
    NumericRangeQuery inclusiveQuery =
        NumericRangeQuery.newIntRange("pubmonth", 200605, 200609, true, true);

    TopDocs hits = indexSearcher.search(inclusiveQuery, 10);
    assertEquals(1, hits.totalHits);

    indexSearcher.close();
    index.close();
  }

  public void testExclusive() throws Exception {
    Directory index = TestUtil.getBookIndexDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(index);

    // Exclusive bounds: 200609 lies outside (200605, 200609), so no hits.
    NumericRangeQuery exclusiveQuery =
        NumericRangeQuery.newIntRange("pubmonth", 200605, 200609, false, false);

    TopDocs hits = indexSearcher.search(exclusiveQuery, 10);
    assertEquals(0, hits.totalHits);

    indexSearcher.close();
    index.close();
  }
}

通过字符串搜索:PrefixQuery类

搜索包含以指定字符串开头的项的文档:

import junit.framework.TestCase;

import lia.common.TestUtil;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;

// From chapter 3
// From chapter 3
// Shows that a PrefixQuery matches a category and all of its subcategories,
// while a TermQuery on the same term matches only the exact category.
public class PrefixQueryTest extends TestCase {
  public void testPrefix() throws Exception {
    Directory index = TestUtil.getBookIndexDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(index);

    Term programming = new Term("category",
                                "/technology/computers/programming");

    // Prefix search: programming books plus every subcategory beneath.
    TopDocs hits = indexSearcher.search(new PrefixQuery(programming), 10);
    int withSubcategories = hits.totalHits;

    // Exact term search: programming books only, subcategories excluded.
    hits = indexSearcher.search(new TermQuery(programming), 10);
    int exactCategoryOnly = hits.totalHits;

    assertTrue(withSubcategories > exactCategoryOnly);
    indexSearcher.close();
    index.close();
  }
}

/*
  #A Search, including subcategories
  #B Search, without subcategories
*/

组合查询:BooleanQuery类

import junit.framework.TestCase;
import lia.common.TestUtil;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.store.Directory;

// From chapter 3
// From chapter 3
// Combines sub-queries with BooleanQuery: MUST clauses act like AND,
// SHOULD clauses act like OR.
public class BooleanQueryTest extends TestCase {

  public void testAnd() throws Exception {
    // Books whose subject field contains the term "search".
    TermQuery searchingBooks =
        new TermQuery(new Term("subject", "search"));

    // Books published in 2010: pubmonth in [201001, 201012], inclusive.
    Query books2010 =
        NumericRangeQuery.newIntRange("pubmonth", 201001, 201012, true, true);

    // Both clauses are required, so the combined query behaves as an AND.
    BooleanQuery searchingBooks2010 = new BooleanQuery();
    searchingBooks2010.add(searchingBooks, BooleanClause.Occur.MUST);
    searchingBooks2010.add(books2010, BooleanClause.Occur.MUST);

    Directory index = TestUtil.getBookIndexDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(index);
    TopDocs hits = indexSearcher.search(searchingBooks2010, 10);

    assertTrue(TestUtil.hitsIncludeTitle(indexSearcher, hits,
                                 "Lucene in Action, Second Edition"));
    indexSearcher.close();
    index.close();
  }

  public void testOr() throws Exception {
    // First category: software-development methodology books.
    TermQuery methodologyBooks = new TermQuery(
        new Term("category",
                 "/technology/computers/programming/methodology"));

    // Second category: eastern philosophy books.
    TermQuery easternPhilosophyBooks = new TermQuery(
        new Term("category", "/philosophy/eastern"));

    // Both clauses are optional (SHOULD), so the combined query is an OR:
    // a document matching either category is a hit.
    BooleanQuery enlightenmentBooks = new BooleanQuery();
    enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD);
    enlightenmentBooks.add(easternPhilosophyBooks, BooleanClause.Occur.SHOULD);

    Directory index = TestUtil.getBookIndexDirectory();
    IndexSearcher indexSearcher = new IndexSearcher(index);
    TopDocs hits = indexSearcher.search(enlightenmentBooks, 10);
    System.out.println("or = " + enlightenmentBooks);

    assertTrue(TestUtil.hitsIncludeTitle(indexSearcher, hits,
                                         "Extreme Programming Explained"));
    assertTrue(TestUtil.hitsIncludeTitle(indexSearcher, hits,
                                         "Tao Te Ching \u9053\u5FB7\u7D93"));
    indexSearcher.close();
    index.close();
  }
}

通过短语搜索:PhraseQuery类

  • slop:两个项的位置之间所允许的最大间隔距离称为slop。这里的距离是指项若要按顺序组成给定的短语所需要移动位置的次数。
import junit.framework.TestCase;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.Directory;

import java.io.IOException;

// From chapter 3
// From chapter 3
// Explores PhraseQuery slop: the total number of positional moves needed
// to line the query terms up, in order, against the indexed phrase.
public class PhraseQueryTest extends TestCase {
  private Directory dir;
  private IndexSearcher searcher;

  protected void setUp() throws IOException {
    // Index a single known sentence that every test probes with varying slop.
    dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir,
                                         new WhitespaceAnalyzer(),
                                         IndexWriter.MaxFieldLength.UNLIMITED);
    Document doc = new Document();
    doc.add(new Field("field",
              "the quick brown fox jumped over the lazy dog",
              Field.Store.YES,
              Field.Index.ANALYZED));
    writer.addDocument(doc);
    writer.close();

    searcher = new IndexSearcher(dir);
  }

  protected void tearDown() throws IOException {
    searcher.close();
    dir.close();
  }

  // Builds a PhraseQuery from the given terms (in order) with the given
  // slop, and reports whether the indexed document matches it.
  private boolean matched(String[] phrase, int slop)
      throws IOException {
    PhraseQuery phraseQuery = new PhraseQuery();
    phraseQuery.setSlop(slop);

    for (String term : phrase) {
      phraseQuery.add(new Term("field", term));
    }

    return searcher.search(phraseQuery, 10).totalHits > 0;
  }

  public void testSlopComparison() throws Exception {
    // "quick" and "fox" are one position apart ("brown" sits between them),
    // so slop 0 fails and slop 1 succeeds.
    String[] phrase = new String[] {"quick", "fox"};

    assertFalse("exact phrase not found", matched(phrase, 0));
    assertTrue("close enough", matched(phrase, 1));
  }

  public void testReverse() throws Exception {
    // Reversed terms cost extra moves: slop 3 is needed, slop 2 is not.
    String[] phrase = new String[] {"fox", "quick"};

    assertFalse("hop flop", matched(phrase, 2));
    assertTrue("hop hop slop", matched(phrase, 3));
  }

  public void testMultiple() throws Exception {
    // Slop is shared across all terms of the phrase, not per pair.
    assertFalse("not close enough",
        matched(new String[] {"quick", "jumped", "lazy"}, 3));

    assertTrue("just enough",
        matched(new String[] {"quick", "jumped", "lazy"}, 4));

    // Fully reversed order is the most expensive arrangement.
    assertFalse("almost but not quite",
        matched(new String[] {"lazy", "jumped", "quick"}, 7));

    assertTrue("bingo",
        matched(new String[] {"lazy", "jumped", "quick"}, 8));
  }
}

高级查询

  • 通配符查询:WildcardQuery类
  • 搜索类似项:FuzzyQuery类
    • Levenshtein距离决定相似程度
import junit.framework.TestCase;

import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

import java.util.Vector;

// From chapter 3
public class ScoreTest extends TestCase {
  private Directory directory;

  public void setUp() throws Exception {
    directory = new RAMDirectory();
  }

  public void tearDown() throws Exception {
    directory.close();
  }

  public void testSimple() throws Exception {
    indexSingleFieldDocs(new Field[] {new Field("contents", "x", Field.Store.YES, Field.Index.ANALYZED)});
    IndexSearcher searcher = new IndexSearcher(directory);
    searcher.setSimilarity(new SimpleSimilarity());

    Query query = new TermQuery(new Term("contents", "x"));
    Explanation explanation = searcher.explain(query, 0);
    System.out.println(explanation);

    TopDocs matches = searcher.search(query, 10);
    assertEquals(1, matches.totalHits);

    assertEquals(1F, matches.scoreDocs[0].score, 0.0);

    searcher.close();
  }

  private void indexSingleFieldDocs(Field[] fields) throws Exception {
    IndexWriter writer = new IndexWriter(directory,
        new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
    for (Field f : fields) {
      Document doc = new Document();
      doc.add(f);
      writer.addDocument(doc);
    }
    writer.optimize();
    writer.close();
  }

  public void testWildcard() throws Exception {
    indexSingleFieldDocs(new Field[]
      { new Field("contents", "wild", Field.Store.YES, Field.Index.ANALYZED),
        new Field("contents", "child", Field.Store.YES, Field.Index.ANALYZED),
        new Field("contents", "mild", Field.Store.YES, Field.Index.ANALYZED),
        new Field("contents", "mildew", Field.Store.YES, Field.Index.ANALYZED) });

    IndexSearcher searcher = new IndexSearcher(directory);
    Query query = new WildcardQuery(new Term("contents", "?ild*"));  //#A
    TopDocs matches = searcher.search(query, 10);
    assertEquals("child no match", 3, matches.totalHits);

    assertEquals("score the same", matches.scoreDocs[0].score,
                                   matches.scoreDocs[1].score, 0.0);
    assertEquals("score the same", matches.scoreDocs[1].score,
                                   matches.scoreDocs[2].score, 0.0);
    searcher.close();
  }
  /*
    #A Construct WildcardQuery using Term
  */

  public void testFuzzy() throws Exception {
    indexSingleFieldDocs(new Field[] { new Field("contents",
                                                 "fuzzy",
                                                 Field.Store.YES,
                                                 Field.Index.ANALYZED),
                                       new Field("contents",
                                                 "wuzzy",
                                                 Field.Store.YES,
                                                 Field.Index.ANALYZED)
                                     });

    IndexSearcher searcher = new IndexSearcher(directory);
    Query query = new FuzzyQuery(new Term("contents", "wuzza"));
    TopDocs matches = searcher.search(query, 10);
    assertEquals("both close enough", 2, matches.totalHits);

    assertTrue("wuzzy closer than fuzzy",
               matches.scoreDocs[0].score != matches.scoreDocs[1].score);

    Document doc = searcher.doc(matches.scoreDocs[0].doc);
    assertEquals("wuzza bear", "wuzzy", doc.get("contents"));
    searcher.close();
  }

  public static class SimpleSimilarity extends Similarity {
    public float lengthNorm(String field, int numTerms) {
      return 1.0f;
    }

    public float queryNorm(float sumOfSquaredWeights) {
      return 1.0f;
    }

    public float tf(float freq) {
      return freq;
    }

    public float sloppyFreq(int distance) {
      return 2.0f;
    }

    public float idf(Vector terms, Searcher searcher) {
      return 1.0f;
    }

    public float idf(int docFreq, int numDocs) {
      return 1.0f;
    }

    public float coord(int overlap, int maxOverlap) {
      return 1.0f;
    }
  }
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值