Background
Problem: when searching for data in ES, "data demo" and "demo data" receive the same score, while we actually want "data demo" to score higher.
Original question: https://elasticsearch.cn/question/12494
Solution: before ES 5.5 a script could read per-term details such as frequency and position from the special _index field; from ES 5.5 onward, the same thing must be done with a script plugin. This post solves the problem with a script plugin on ES 7.15.2.
Writing, packaging and installing the plugin
Writing the plugin code
Goals
- The custom script must be able to obtain the offset of the matched term.
- The smaller the offset, the larger the script's return value, and the return value must not exceed 1. We use the sigmoid function as the transform, negate it to get the inverse correlation, then add one, yielding a positive number between 0 and 0.5.
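As a quick sanity check (an illustration, not part of the plugin), the transform 1 - sigmoid(offset) behaves as required: it is 0.5 at offset 0, strictly decreasing, and bounded above by 1:

```java
public class TransformDemo {
    // The transform described above: 1 - sigmoid(x) = e^(-x) / (1 + e^(-x))
    static double mySigmoid(double value) {
        double ey = Math.pow(Math.E, -value);
        return 1 - 1 / (1 + ey);
    }

    public static void main(String[] args) {
        // Smaller offsets map to larger values, capped at 0.5 for offset 0.
        for (int offset : new int[] {0, 1, 4, 11, 25}) {
            System.out.printf("offset=%2d -> %.8f%n", offset, mySigmoid(offset));
        }
    }
}
```

For non-negative offsets the output falls in (0, 0.5], so after scaling it down the bonus can never dominate the query score (the plugin additionally multiplies it by 0.01).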
Writing the code
- When calling the postings method, you must pass PostingsEnum.ALL, e.g. postings(new Term(field, term), PostingsEnum.ALL) (it is not passed in by default); only then does the returned PostingsEnum carry offsets.
- You must call nextPosition first to update startOffset; only then does startOffset return a meaningful value.
- The code is also available as a GitHub project.
public class ExpertScriptPlugin extends Plugin implements ScriptPlugin {
    @Override
    public ScriptEngine getScriptEngine(
        Settings settings,
        Collection<ScriptContext<?>> contexts
    ) {
        return new MyExpertScriptEngine();
    }

    /**
     * An example {@link ScriptEngine} that uses Lucene segment details to
     * score documents by the start offset of the first matched term.
     */
    // tag::expert_engine
    private static class MyExpertScriptEngine implements ScriptEngine {
        @Override
        public String getType() {
            return "expert_scripts";
        }

        @Override
        public <T> T compile(
            String scriptName,
            String scriptSource,
            ScriptContext<T> context,
            Map<String, String> params
        ) {
            if (context.equals(ScoreScript.CONTEXT) == false) {
                throw new IllegalArgumentException(getType()
                        + " scripts cannot be used for context ["
                        + context.name + "]");
            }
            // we use the script "source" as the script identifier
            if ("pure_df".equals(scriptSource)) {
                ScoreScript.Factory factory = new PureDfFactory();
                return context.factoryClazz.cast(factory);
            }
            throw new IllegalArgumentException("Unknown script name "
                    + scriptSource);
        }

        @Override
        public void close() {
            // optionally close resources
        }

        @Override
        public Set<ScriptContext<?>> getSupportedContexts() {
            return Collections.singleton(ScoreScript.CONTEXT);
        }

        private static class PureDfFactory implements ScoreScript.Factory,
                                                      ScriptFactory {
            @Override
            public boolean isResultDeterministic() {
                // PureDfLeafFactory only uses deterministic APIs, this
                // implies the results are cacheable.
                return true;
            }

            @Override
            public LeafFactory newFactory(
                Map<String, Object> params,
                SearchLookup lookup
            ) {
                return new PureDfLeafFactory(params, lookup);
            }
        }

        private static class PureDfLeafFactory implements LeafFactory {
            private final Map<String, Object> params;
            private final SearchLookup lookup;
            private final String field;
            private final String term;

            private PureDfLeafFactory(
                    Map<String, Object> params, SearchLookup lookup) {
                if (params.containsKey("field") == false) {
                    throw new IllegalArgumentException(
                            "Missing parameter [field]");
                }
                if (params.containsKey("term") == false) {
                    throw new IllegalArgumentException(
                            "Missing parameter [term]");
                }
                this.params = params;
                this.lookup = lookup;
                field = params.get("field").toString();
                term = params.get("term").toString();
            }

            @Override
            public boolean needs_score() {
                return false; // Return true if the script needs the score
            }

            @Override
            public ScoreScript newInstance(DocReader docReader)
                    throws IOException {
                DocValuesDocReader dvReader = ((DocValuesDocReader) docReader);
                // Unlike the official example, postings() must be given a
                // second argument, PostingsEnum.ALL (or PostingsEnum.OFFSETS);
                // only then does the returned PostingsEnum carry offsets.
                PostingsEnum postings = dvReader.getLeafReaderContext()
                        .reader().postings(new Term(field, term), PostingsEnum.ALL);
                if (postings == null) {
                    /*
                     * the field and/or term don't exist in this segment,
                     * so always return 0
                     */
                    return new ScoreScript(params, lookup, docReader) {
                        @Override
                        public double execute(
                            ExplanationHolder explanation
                        ) {
                            return 0.0d;
                        }
                    };
                }
                return new ScoreScript(params, lookup, docReader) {
                    int currentDocid = -1;

                    @Override
                    public void setDocument(int docid) {
                        /*
                         * advance has undefined behavior calling with
                         * a docid <= its current docid
                         */
                        if (postings.docID() < docid) {
                            try {
                                postings.advance(docid);
                            } catch (IOException e) {
                                throw new UncheckedIOException(e);
                            }
                        }
                        currentDocid = docid;
                    }

                    @Override
                    public double execute(ExplanationHolder explanation) {
                        if (postings.docID() != currentDocid) {
                            /*
                             * advance moved past the current doc, so this
                             * doc has no occurrences of the term
                             */
                            return 0.0d;
                        }
                        try {
                            // nextPosition() advances to the next occurrence of
                            // the term and updates startOffset; note that before
                            // the first call, startOffset does not hold the first
                            // occurrence's offset.
                            postings.nextPosition();
                            int i = postings.startOffset();
                            // Invert via sigmoid and add one (see mySigmoid
                            // below). The raw value is too large, so scale it by
                            // 0.01; otherwise a doc where the term occurs once
                            // could outscore one where it occurs twice, e.g.
                            // "foo is body" would beat "is foo body foo", when
                            // we probably want the two-occurrence doc to win.
                            return mySigmoid(i) * 0.01;
                        } catch (IOException e) {
                            throw new UncheckedIOException(e);
                        }
                    }
                };
            }
        }
    }
    // end::expert_engine

    public static double mySigmoid(double value) {
        double ey = Math.pow(Math.E, -value);
        return 1 - 1 / (1 + ey);
    }
}
Packaging & installation
Packaging the project
- Package the code above as a jar (without bundling its dependencies).
- Write a plugin descriptor file named plugin-descriptor.properties. The required entries are listed below (see the official docs for the full set):

# fully qualified name of the plugin class above
classname=org.elasticsearch.example.expertscript.ExpertScriptPlugin
# plugin description, anything will do
description=My cool plugin
# plugin version
version=6.0
# ES version; must be written out in full, e.g. 7.15.0
elasticsearch.version=7.15.2
# Java version
java.version=1.8
# plugin name, shown when ES lists installed plugins
name=test_plugin
- Using an archiver, bundle the code jar and plugin-descriptor.properties into a single zip file, here ExpertScriptPlugin.zip (the file name is arbitrary). The file structure is:

- ExpertScriptPlugin.zip
-- ExpertScriptPlugin.jar
-- plugin-descriptor.properties
Installation
- Upload the zip above to the /root/ directory.
- Change into the bin directory under the ES installation directory.
- Switch user: sudo su elasticsearch
- Install the custom plugin: ./elasticsearch-plugin install file:///root/ExpertScriptPlugin.zip
- For removing and listing plugins, see ./elasticsearch-plugin -h
Preparing data
Creating the index
Note: when creating the index, the field must be of type text and its index_options must be set to offsets (the default is positions, which does not store offsets).
PUT /text
{
"mappings": {
"properties": {
"body":{
"type": "text","analyzer": "standard", "index_options": "offsets"
}
}
}
}
Writing test data
POST /text/_doc
{
"body":"nobody foo is"
}
POST /text/_doc
{
"body":"foo nobody is foo"
}
POST /text/_doc
{
"body":"nobody foo is foo"
}
POST /text/_doc
{
"body":"foo is nobody"
}
POST /text/_doc
{
"body":"nobody foo is"
}
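Before running the query, we can predict the function-score contribution for each test document. The sketch below is an illustration, not part of the plugin: for this whitespace-separated sample data, String.indexOf happens to coincide with the standard analyzer's start offset of the first "foo" token, so applying the plugin's mySigmoid(offset) * 0.01 gives the bonus that gets added to the BM25 query score:

```java
public class OffsetBonusDemo {
    // Same transform the plugin uses
    static double mySigmoid(double value) {
        double ey = Math.pow(Math.E, -value);
        return 1 - 1 / (1 + ey);
    }

    public static void main(String[] args) {
        String[] docs = {
            "nobody foo is",
            "foo nobody is foo",
            "nobody foo is foo",
            "foo is nobody"
        };
        for (String body : docs) {
            // Start offset of the first occurrence of "foo" (valid here only
            // because every token is separated by a single space).
            int offset = body.indexOf("foo");
            double bonus = mySigmoid(offset) * 0.01;
            System.out.printf("%-18s offset=%d bonus=%.8f%n", body, offset, bonus);
        }
    }
}
```

Documents whose first "foo" starts at offset 0 get a bonus of 0.005, while offset 7 yields only about 9.1e-6; this matches the score gaps in the results below, e.g. 0.11898339 - 0.1139925 ≈ 0.005.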
Test results
Writing the query
How the query score is combined with the function score is controlled by the boost_mode field; here we add them using sum.
GET /text/_search
{
"query": {
"function_score": {
"query": {
"match": {
"body": "foo"
}
},
"functions": [
{
"script_score": {
"script": {
"source": "pure_df",
"lang": "expert_scripts",
"params": {
"field": "body",
"term": "foo"
}
}
}
}
],
"boost_mode": "sum"
}
}
}
The query result is:
{
"took": 28,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"skipped": 0,
"failed": 0
},
"hits": {
"total": {
"value": 5,
"relation": "eq"
},
"max_score": 0.11898339,
"hits": [
{
"_index": "text",
"_type": "_doc",
"_id": "bPXwZn4B7A6RSNks37jI",
"_score": 0.11898339,
"_source": {
"body": "foo nobody is foo"
}
},
{
"_index": "text",
"_type": "_doc",
"_id": "bfXwZn4B7A6RSNks5rgv",
"_score": 0.1139925,
"_source": {
"body": "nobody foo is foo"
}
},
{
"_index": "text",
"_type": "_doc",
"_id": "bvXwZn4B7A6RSNks67jy",
"_score": 0.09641083,
"_source": {
"body": "foo is nobody"
}
},
{
"_index": "text",
"_type": "_doc",
"_id": "a_XwZn4B7A6RSNks2bil",
"_score": 0.09141994,
"_source": {
"body": "nobody foo is"
}
},
{
"_index": "text",
"_type": "_doc",
"_id": "b_XwZn4B7A6RSNks8rgK",
"_score": 0.09141994,
"_source": {
"body": "nobody foo is"
}
}
]
}
}
References
Plugin format: https://www.elastic.co/guide/en/elasticsearch/plugins/7.15/plugin-authors.html
Writing and invoking script engines: https://www.elastic.co/guide/en/elasticsearch/reference/7.15/modules-scripting-engine.html