Elasticsearch源码分析二--调用Lucene查询接口之常用词查询

  • 简介
  • 查询语法
  • 源码分析

简介

常用词查询是在没有使用停用词的情况下, Elasticsearch为了提高常用词的查询相关性和精确性而提供的一个现代解决方案。
例如,"crime and punishment" 可以翻译成 3 个词查询,每一个都有性能上的成本(词越多,查询性能越低)。但 "and" 这个词非常常见,几乎出现在所有文档中,对文档得分的区分作用非常低。
解决办法是常用词查询,将查询分为两组。第一组包含重要的词,出现的频率较低。第二组包含较高频率的、不那么重要的词。先执行第一个查询,Elasticsearch从第一组的所有词中计算分数。这样,通常都很重要的低频词总是被列入考虑范围。然后,Elasticsearch对第二组中的词执行二次查询,但只为与第一个查询中匹配的文档计算得分。这样只计算了相关文档的得分,实现了更高的性能。

查询语法

{
  "query" : {
    "common" : {
      "title" : {
        "query" : "crime and punishment",
        "cutoff_frequency" : 0.001
      }
    }
  }
}

源码分析

'''(1)Elasticsearch code'''
public class CommonTermsQueryParser implements QueryParser {
    public static final String NAME = "common";
    static final float DEFAULT_MAX_TERM_DOC_FREQ = 0.01f;
    static final Occur DEFAULT_HIGH_FREQ_OCCUR = Occur.SHOULD;
    static final Occur DEFAULT_LOW_FREQ_OCCUR = Occur.SHOULD;
    static final boolean DEFAULT_DISABLE_COORDS = true;

    @Override
    public Query parse(QueryParseContext parseContext) throws IOException, QueryParsingException {
        XContentParser parser = parseContext.parser();
        XContentParser.Token token = parser.nextToken();
        // A "common" query must wrap exactly one field name.
        if (token != XContentParser.Token.FIELD_NAME) {
            throw new QueryParsingException(parseContext.index(), "[common] query malformed, no field");
        }
        String fieldName = parser.currentName();
        // Locals below hold the request parameters; they are filled in by the
        // token-parsing loop that the article elides with "...." further down.
        Object value = null;
        float boost = 1.0f;
        String queryAnalyzer = null;
        String lowFreqMinimumShouldMatch = null;
        String highFreqMinimumShouldMatch = null;
        boolean disableCoords = DEFAULT_DISABLE_COORDS;
        Occur highFreqOccur = DEFAULT_HIGH_FREQ_OCCUR;
        Occur lowFreqOccur = DEFAULT_LOW_FREQ_OCCUR;
        float maxTermFrequency = DEFAULT_MAX_TERM_DOC_FREQ;
        token = parser.nextToken();
        ....
        // Construct the subclass of Lucene's CommonTermsQuery.
        //   highFreqOccur    - boolean operator (SHOULD/MUST) applied to the high-frequency term group
        //   lowFreqOccur     - boolean operator (SHOULD/MUST) applied to the low-frequency term group
        //   maxTermFrequency - cutoff used to split terms into the two groups; terms whose
        //                      frequency is below it land in the low-frequency group
        //   disableCoords    - if true, the coord score factor is disabled
        ExtendedCommonTermsQuery query = new ExtendedCommonTermsQuery(highFreqOccur, lowFreqOccur, maxTermFrequency, disableCoords);
        query.setBoost(boost);
        return parseQueryString(query, value.toString(), fieldName, parseContext, queryAnalyzer, lowFreqMinimumShouldMatch, highFreqMinimumShouldMatch);
    }

'''(2)Lucene code'''
public class CommonTermsQuery extends Query {
  '''对query进行重写,区分低频词和高频词,并根据Elasticsearch传递的highFreqOccur和lowFreqOccur将高频词和低频词构造成BooleanQuery'''
  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    if (this.terms.isEmpty()) {
      return new MatchNoDocsQuery();
    } else if (this.terms.size() == 1) {
      final Query tq = newTermQuery(this.terms.get(0), null);
      tq.setBoost(getBoost());
      return tq;
    }
    final List<LeafReaderContext> leaves = reader.leaves();
    final int maxDoc = reader.maxDoc();
    final TermContext[] contextArray = new TermContext[terms.size()];
    final Term[] queryTerms = this.terms.toArray(new Term[0]);
    collectTermContext(reader, leaves, contextArray, queryTerms);
    return buildQuery(maxDoc, contextArray, queryTerms);
  }
  // (3) Example of a Lucene common-terms query (from Lucene's own test suite).
  public void test() throws IOException {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, analyzer);
    String[] docs = new String[] {"this is the end of the world right",
        "is this it or maybe not",
        "this is the end of the universe as we know it",
        "there is the famous restaurant at the end of the universe",};
    for (int i = 0; i < docs.length; i++) {
      Document doc = new Document();
      doc.add(newStringField("id", "" + i, Field.Store.YES));
      doc.add(newTextField("field", docs[i], Field.Store.NO));
      w.addDocument(doc);
    }

    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    {
      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
          random().nextBoolean() ? 2.0f : 0.5f);
      // this / is / end          -- high-frequency terms (doc freq 3-4 of 4 docs)
      // world / universe / right -- low-frequency terms (doc freq 1-2 of 4 docs);
      // NOTE(review): the article's original note mislabeled these as high-frequency.
      query.add(new Term("field", "is"));
      query.add(new Term("field", "this"));
      query.add(new Term("field", "end"));
      query.add(new Term("field", "world"));
      query.add(new Term("field", "universe"));
      query.add(new Term("field", "right"));
      // The low-frequency terms world/universe/right select docs 0, 2 and 3 first;
      // the second pass over the high-frequency terms only scores those matches.
      TopDocs search = s.search(query, 10);
      assertEquals(search.totalHits, 3);
      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
      assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
      assertEquals("3", r.document(search.scoreDocs[2].doc).get("id"));
    }

    { // only high freq
      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.SHOULD,
          random().nextBoolean() ? 2.0f : 0.5f);
      query.add(new Term("field", "is"));
      query.add(new Term("field", "this"));
      query.add(new Term("field", "end"));
      TopDocs search = s.search(query, 10);
      assertEquals(search.totalHits, 2);
      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
      assertEquals("2", r.document(search.scoreDocs[1].doc).get("id"));
    }

    { // low freq is mandatory
      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,
          random().nextBoolean() ? 2.0f : 0.5f);
      query.add(new Term("field", "is"));
      query.add(new Term("field", "this"));
      query.add(new Term("field", "end"));
      query.add(new Term("field", "world"));

      TopDocs search = s.search(query, 10);
      assertEquals(search.totalHits, 1);
      assertEquals("0", r.document(search.scoreDocs[0].doc).get("id"));
    }

    { // low freq is mandatory
      CommonTermsQuery query = new CommonTermsQuery(Occur.SHOULD, Occur.MUST,
          random().nextBoolean() ? 2.0f : 0.5f);
      query.add(new Term("field", "restaurant"));
      query.add(new Term("field", "universe"));

      TopDocs search = s.search(query, 10);
      assertEquals(search.totalHits, 1);
      assertEquals("3", r.document(search.scoreDocs[0].doc).get("id"));

    }
    IOUtils.close(r, w, dir, analyzer);
  }

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值