Es6.x语法探索【结束】

最新推荐文章于 2022-10-15 09:30:11 发布

PHPerJiang

最新推荐文章于 2022-10-15 09:30:11 发布

阅读量275

点赞数

分类专栏： elasticsearch

本文链接：https://blog.csdn.net/qq_36558538/article/details/101099599

版权

elasticsearch 专栏收录该内容

12 篇文章 0 订阅

订阅专栏

2019-09-21

script使用表达式expression进行打分

/**
	 * Runs a function_score search whose score comes from an expression script.
	 *
	 * The inner query matches "quick brown fox" on the `test` field of the
	 * `func_score` index; the script then evaluates `_score * doc['popularity']`
	 * with lang `expression`. The search response is echoed as JSON and nothing
	 * is returned.
	 *
	 * @Author: jiangyu
	 * @Time: 2019/9/20 18:00
	 */
	function search_func_score(){
		// Script definition: multiply the query score by the popularity field.
		$scoreScript = [
			'lang' => 'expression',
			'source' => "_score * doc['popularity']"
		];
		$functionScore = [
			'query' => ['match' => ['test' => "quick brown fox"]],
			'script_score' => ['script' => $scoreScript]
		];
		$request = [
			'index' => 'func_score',
			'type'  => 'doc',
			'body'  => ['query' => ['function_score' => $functionScore]]
		];
		$client = ClientBuilder::create()->build();
		$response = $client->search($request);
		echo json_encode($response);
	}

使用表达式打分时注意lang要选用expression。此外与painless不同的是，source中获取文档字段值不能再使用ctx._source.xx的形式（ctx是应用在update、update-by-query、reindex上的），而要使用doc[xx]的形式，这是查询、聚合中获取文档字段值的方式。如上面代码中的表达式是将打完分后的分值再乘上popularity的值，最后汇总打分并按照分值倒排来返回结果。

脚本获取参数值

/**
 * Runs a function_score search whose expression script reads a script parameter.
 *
 * The script source is `_score + count`, with `count` supplied through the
 * script's `params` array (value 2). The search response is echoed as JSON.
 */
function search_func_score(){
	$scoreScript = [
		'lang' => 'expression',
		'source' => "_score + count",
		// In this expression the parameter is referenced by its bare key name.
		'params' => ['count' => 2],
	];
	$functionScore = [
		'query' => ['match' => ['test' => "quick brown fox"]],
		'script_score' => ['script' => $scoreScript]
	];
	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'body'  => ['query' => ['function_score' => $functionScore]]
	];
	$client = ClientBuilder::create()->build();
	$response = $client->search($request);
	echo json_encode($response);
}

查询打分使用的expression脚本，获取参数值时不再使用params.xx来获取params数组中的参数值，而是直接使用params中参数的key名来获取值。

给一个字段打分

/**
 * Runs a match_all search that adds three computed script_fields to each hit.
 *
 * The search response is echoed as JSON; nothing is returned.
 */
function search_script_field(){
	// test1: Painless script that multiplies the popularity doc value by 2.
	$doubled = [
		'script' => [
			'lang' => 'painless',
			'source' => "doc['popularity'].value * 2"
		]
	];
	// test2: Painless script that multiplies the popularity doc value by the `count` parameter.
	$scaledByParam = [
		'script' => [
			'lang' => 'painless',
			'source' => "doc['popularity'].value * params.count",
			'params' => ['count' => 3]
		]
	];
	// test3: shorthand string script reading popularity from _source; no explicit lang or params are given.
	$fromSource = [
		'script' => "params['_source']['popularity'] * 4"
	];
	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'body'  => [
			'query' => [
				'match_all' => new \stdClass()
			],
			'script_fields' => [
				'test1' => $doubled,
				'test2' => $scaledByParam,
				'test3' => $fromSource,
			]
		]
	];
	$client = ClientBuilder::create()->build();
	$response = $client->search($request);
	echo json_encode($response);
}

这里要区分脚本字段打分中的参数 doc[xxx].value 和 params['_source'][xxx],第一个使用doc关键字，将导致将该字段的术语加载到内存中（缓存），这将导致执行速度更快，但会占用更多内存。另外，该doc[...]表示法仅允许使用简单的值字段（您不能从中返回json对象），并且仅对未分析或基于单个术语的字段有意义。但是，仍然建议使用doc[...]来访问文档中的值（如果可能的话）因为_source每次使用时都必须对其进行加载和解析。使用_source非常慢。

游标查询（深分页）
```
/**
 * Collects every hit of a match_all search on the func_score index via the scroll API.
 *
 * The first page comes from search(); each following page is fetched with
 * scroll() using the _scroll_id of the previous response, until a page with
 * no hits is returned.
 *
 * @return array All hits gathered from the initial search and every scroll page.
 */
public function search_by_scrolling(){
	$client = ClientBuilder::create()->build();
	$response = $client->search([
		'index'  => 'func_score',
		'type'   => 'doc',
		'scroll' => "30s",      // scroll window passed with the initial search
		'size'   => 1,          // number of hits requested per page
		'body'   => ['query' => ['match_all' => new \stdClass()]],
	]);
	$result = [];
	while (!empty($response['hits']['hits'])) {
		// Keep the current page before asking for the next one.
		$result = array_merge($result, $response['hits']['hits']);
		if (!isset($response['_scroll_id'])) {
			// Without a scroll id there is nothing to continue from.
			break;
		}
		$response = $client->scroll([
			"scroll_id" => $response['_scroll_id'],  // id returned by the previous request
			"scroll"    => "30s"                     // same window as the initial search
		]);
	}
	return $result;
}
```
es深分页问题：es默认不允许查询10000条以后的数据，配置项index.max_result_window:10000限制了最大查询窗口。如果要查询10000条以后的数据可以使用scroll游标查询，而不应使用from-size的方式。因为如果使用from-size的方式从第20条数据向后查20条数据，es就不得不取出每个分片上的前40条（from+size）数据，然后进行排序，最后再取size条数据。假如你有12个分片，那么查这20条数据就要在内存中获取到 12*（20+20）条记录后再做一次全局排序，当数据达到一定数量时，就很容易出现内存用完的情况。所以当我们非得要获取1w条之后的数据时，建议使用scroll游标查询。当然，游标查询不适合实时搜索，它适合后台的批处理。这里分享一个关于游标查询的文章https://blog.csdn.net/weixin_40341116/article/details/80821655 希望对大家有所帮助

聚合-平均值

/**
 * Runs three avg aggregations on the func_score index and echoes the response as JSON.
 *
 * size is 0, so only the aggregation results are requested, not the hits.
 */
function agg_search(){
	$aggregations = [
		// Average of the popularity field taken directly from the documents.
		'avg_popularity' => ['avg' => ['field' => 'popularity']],
		// Average of a script value: popularity multiplied by 2.
		'avg_populartiy_by_script' => ['avg' => ['script' => ['source' => "doc.popularity.value * 2",]]],
		// Average of the grade field, with `missing` set to 10.
		'avg_def'        => ['avg' => ['field' => 'grade','missing' => 10]]
	];
	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'size'  => 0,
		'body'  => [
			'aggs' => $aggregations,
		]
	];
	$client = ClientBuilder::create()->build();
	echo json_encode($client->search($request));
}

es的聚合使用关键词aggs，单纯的聚合我们并不关心bool查询，因此我们舍弃掉body中的bool参数，并且将size设置为0，这样我们会得到如下结构的响应

{
    "took":2,
    "timed_out":false,
    "_shards":{
        "total":5,
        "successful":5,
        "skipped":0,
        "failed":0
    },
    "hits":{
        "total":2,
        "max_score":0,
        "hits":[

        ]
    },
    "aggregations":{
        "avg_def":{
            "value":null
        },
        "avg_populartiy_by_script":{
            "value":6
        },
        "avg_popularity":{
            "value":3
        }
    }
}

我们在查询语句中指定的聚合查询名称会作为响应中返回的key，其值value即我们要获取的平均值结果。上列代码演示了三种计算平均值的方法：第一种是直接获取文档内的字段然后进行聚合计算，第二种则是使用脚本的方式进行聚合计算，第三种是对不存在的字段进行聚合。前两种方式都可以对日常字段聚合，但个人觉得脚本会更灵活；第三种如果文档中不存在这个字段，聚合结果会是null，如果用missing参数指定了缺失字段的默认值，则缺失该字段的文档会按此默认值参与聚合

2019-09-23更新

单值、多值聚合查询

        // Single-value and multi-value metric aggregations on the popularity field.
	/**
	 * Runs a set of metric and bucket aggregations on `popularity` in the
	 * func_score index and echoes the search response as JSON.
	 *
	 * size is 0, so only the aggregation results are requested, not the hits.
	 */
	function agg_extends_stats(){
		$field = ['field' => 'popularity'];
		$aggregations = [
			'min'   => ['min' => $field],                    // minimum value
			'max'   => ['max' => $field],                    // maximum value
			'avg'   => ['avg' => $field],                    // average value
			'sum'   => ['sum' => $field],                    // sum of values
			'cardinality' => ['cardinality' => $field],      // number of distinct values
			'stats' => ['stats' => $field],                  // count, min, max, avg and sum in one aggregation
			'extended_stats' => ['extended_stats' => $field],  // stats plus the extended metrics
			'terms' => ['terms' => $field],                  // one bucket per distinct value with its doc_count
			// Count the values produced by the script. The script addresses the
			// popularity field explicitly; `doc.value` would look up a field
			// literally named "value".
			'value_count' => ['value_count' => ['script' => ['source' => "doc['popularity'].value"]]]
		];
		$params = [
			'index' => 'func_score',
			'type'  => 'doc',
			'size'  => 0,
			'body'  => [
				'aggs' => $aggregations,
			]
		];
		$client = ClientBuilder::create()->build();
		$response = $client->search($params);
		echo json_encode($response);
	}

以下是响应，agg中第一维数组是聚合的名称，返回值会以聚合名称-聚合值的形式返回，如下

{
    "took":23,
    "timed_out":false,
    "_shards":{
        "total":5,
        "successful":5,
        "skipped":0,
        "failed":0
    },
    "hits":{
        "total":2,
        "max_score":0,
        "hits":[

        ]
    },
    "aggregations":{
        "avg":{
            "value":3
        },
        "min":{
            "value":1
        },
        "terms":{
            "doc_count_error_upper_bound":0,
            "sum_other_doc_count":0,
            "buckets":[
                {
                    "key":1,
                    "doc_count":1
                },
                {
                    "key":5,
                    "doc_count":1
                }
            ]
        },
        "extended_stats":{
            "count":2,
            "min":1,
            "max":5,
            "avg":3,
            "sum":6,
            "sum_of_squares":26,
            "variance":4,
            "std_deviation":2,
            "std_deviation_bounds":{
                "upper":7,
                "lower":-1
            }
        },
        "stats":{
            "count":2,
            "min":1,
            "max":5,
            "avg":3,
            "sum":6
        },
        "max":{
            "value":5
        },
        "sum":{
            "value":6
        },
        "value_count":{
            "value":2
        },
        "cardinality":{
            "value":2
        }
    }
}

其中常见聚合 min \ max \ avg \ sum \ value_count(值统计) 此处不做解释。
这里说一下基数聚合cardinality,他统计的是字段的基数，比如文档中有性别字段gender中有男\女两个case, cardinality统计的就是有几种case，这里就是2。
接下来我们说一下stats\extended_stats，这两个是多值聚合，其聚合值涵盖了count\min\max\avg\sum等内容（extended_stats还额外包含平方和、方差、标准差等），应用时根据情况自选聚合类型。
这里有个比较重要的就是terms聚合，这个聚合我理解的就是和cardinality聚合类似，不过terms聚合明确给出了聚合的key-value，key就是字段值中的case，而value则是这个case在es里所有文档中出现的次数。需要注意terms聚合的结果存在一定的不确定性，

比如：

我们想要获取popularity字段中出现频率最高的前5个。

此时，客户端向ES发送聚合请求，主节点接收到请求后，会向每个独立的分片发送该请求。
分片独立的计算自己分片上的前5个popularity，然后返回。当所有的分片结果都返回后，在主节点进行结果的合并，再求出频率最高的前5个，返回给客户端。

这样就会造成一定的误差，比如最后返回的前5个中，有一个叫A的，有50个文档；B有49。但是由于每个分片独立的保存信息，信息的分布也是不确定的。有可能第一个分片中B的信息有2个，但是没有排到前5，所以没有在最后合并的结果中出现。这就导致B的总数少计算了2，本来可能排到第一位，却排到了A的后面。

term聚合排序

/**
 * Runs three terms aggregations on `popularity`, each with a different bucket
 * order, and echoes the search response as JSON.
 */
function agg_search_term_sort(){
	// Order buckets by their key, descending.
	$byKey = ['terms' => ['field' => 'popularity','order' => ['_key' => 'desc']]];
	// Order buckets by their doc_count, descending.
	$byCount = ['terms' => ['field' => 'popularity','order' => ['_count' => 'desc']]];
	// Order buckets with the `_term` order key, descending.
	// NOTE(review): the post says `_term` is deprecated since 6.0 in favour of `_key` — confirm against the ES docs.
	$byTerm = ['terms' => ['field' => 'popularity','order' => ['_term' => 'desc']]];

	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'size'  => 0,
		'body'  => [
			'aggs' => [
				'terms_by_key' => $byKey,
				'terms_by_count' => $byCount,
				'terms_by_term' => $byTerm
			],
		]
	];
	$client = ClientBuilder::create()->build();
	echo json_encode($client->search($request));
}

使用terms对popularity字段进行分桶，分桶的结果根据响应中的key或者doc_count进行排序，6.0之前还有种内置排序是term，根据词项的字符串顺序排序，只在terms中使用，term在6.0中已经废弃，6.0之后使用term关键词依旧可以使用，但是实际上代码里使用了key来代替了term

2019-09-24更新

聚合查询中增加过滤语句

/**
 * Runs a single-bucket filter aggregation with a nested terms aggregation and
 * echoes the search response as JSON.
 *
 * The filter keeps documents whose `test` term equals "aaaaa"; the nested
 * `terms` sub-aggregation then buckets them by `popularity`.
 */
function agg_search_filter(){
	$filterAgg = [
		'filter' => ['term' => ['test' => "aaaaa"]],
		'aggs'    => ['terms' => ['terms' => ['field' => 'popularity']]]
	];
	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'size'  => 0,
		'body'  => [
			'aggs' => ['agg_filter' => $filterAgg],
		]
	];
	$client = ClientBuilder::create()->build();
	echo json_encode($client->search($request));
}

单桶聚合并关联一个筛选项，以下是响应

{
    "took":15,
    "timed_out":false,
    "_shards":{
        "total":5,
        "successful":5,
        "skipped":0,
        "failed":0
    },
    "hits":{
        "total":5,
        "max_score":0,
        "hits":[

        ]
    },
    "aggregations":{
        "agg_filter":{
            "doc_count":1,
            "terms":{
                "doc_count_error_upper_bound":0,
                "sum_other_doc_count":0,
                "buckets":[
                    {
                        "key":13,
                        "doc_count":1
                    }
                ]
            }
        }
    }
}

多桶聚合，每个桶关联一个筛选项

/**
 * Runs a multi-bucket `filters` aggregation and echoes the search response as JSON.
 *
 * Two named filters (popularity 13 and popularity 22) are defined, and
 * `other_bucket_key` is set to "other_bucket".
 */
function agg_multi_search_filter(){
	// One named term filter per bucket.
	$namedFilters = [
		'popularity13' => ['term' => ['popularity' => 13]],
		'popularity22' => ['term' => ['popularity' => 22]],
	];
	$filtersAgg = [
		'filters' => [
			'other_bucket_key' => "other_bucket",
			'filters' => $namedFilters
		]
	];
	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'size'  => 0,
		'body'  => [
			'aggs' => ['multi_aggs' => $filtersAgg],
		]
	];
	$client = ClientBuilder::create()->build();
	echo json_encode($client->search($request));
}

上列代码中multi_aggs为响应中字典的key，第一个filters对应响应中返回的buckets，其中有 other_bucket\popularity13\popularity22三个桶，popularity13、popularity22这两个桶关联各自的过滤筛选，符合筛选的则落入对应的桶中，不符合筛选的落入other_bucket桶中。以下是响应，

{
    "took":46,
    "timed_out":false,
    "_shards":{
        "total":5,
        "successful":5,
        "skipped":0,
        "failed":0
    },
    "hits":{
        "total":5,
        "max_score":0,
        "hits":[

        ]
    },
    "aggregations":{
        "multi_aggs":{
            "buckets":{
                "popularity13":{
                    "doc_count":2
                },
                "popularity22":{
                    "doc_count":1
                },
                "other_bucket":{
                    "doc_count":2
                }
            }
        }
    }
}

看响应中，buckets有三个桶，正好对应请求中的三个分桶

2019-09-30更新

嵌套对象索引创建

/**
 * Creates the `user` index with a nested `user` field (name: keyword, age: integer)
 * and dumps the create-index response with var_dump.
 */
function nested_mapping_create(){
	// Mapping of the `doc` type: `user` is a nested object with two sub-fields.
	$docMapping = [
		'properties' => [
			'user' => [
				'type' => 'nested',
				'properties' => [
					'name' => ['type' => 'keyword'],
					'age'  => ['type' => 'integer']
				]
			]
		]
	];
	$params = [
		'index' => 'user',
		'body'  => [
			// Type mappings must sit under the `mappings` key of the create-index
			// body; a bare `doc` key at the top level is not a valid section.
			'mappings' => [
				'doc' => $docMapping,
			],
		]
	];
	$client = ClientBuilder::create()->build();
	var_dump($client->indices()->create($params));
}

nested嵌套对象，类型为nested，关联一个properties，其下为一个数组或者多维数组，可以存多个数据

嵌套对象数据存入

/**
 * Creates document id 6 in the `user` index (type `doc`) with two entries in
 * its `user` field, and dumps the response with var_dump.
 */
function nseted_doc_create(){
	// Each entry is one object stored in the `user` field.
	$users = [
		['name' => 'Pythoner','age' => 30],
		['name' => 'Javaer','age' => 20],
	];
	$request = [
		'index' => 'user',
		'type'  => 'doc',
		'id'    => 6,
		'body'  => ['user' => $users]
	];
	$client = ClientBuilder::create()->build();
	$response = $client->create($request);
	var_dump($response);
}

user嵌套内可以存多个数组

嵌套对象搜索

/**
 * Searches the `user` index with a nested query on path `user` and echoes the
 * response as JSON.
 *
 * The inner query is a term query for `user.name` equal to "PHPer".
 */
function nested_doc_search(){
	$nestedQuery = [
		'path' => 'user',   // the nested field the inner query runs against
		'query' => [
			'term' => ['user.name'=>'PHPer']
		]
	];
	$request = [
		'index' => 'user',
		'type'  => 'doc',
		'body'  => [
			'query' => ['nested' => $nestedQuery]
		]
	];
	$client = ClientBuilder::create()->build();
	$response = $client->search($request);
	echo json_encode($response);
}

需要用path关键词指定嵌套对象

基本过滤方式，过滤不进行打分，只筛选，可以缓存

/**
 * Runs a filter-only bool query on the func_score index and echoes the
 * response as JSON.
 *
 * The outer bool has only a `filter` clause; inside it, a bool/must combines
 * a terms condition on `test` with a term condition on `popularity`.
 */
function base_filter(){
	// Both conditions sit under `must`, so both are required.
	$conditions = [
		['terms' => ['test' => ["aaaaa",'hahah']]],
		['term' => ['popularity' => 13]],
	];
	$boolQuery = [
		'filter' => [
			'bool' => ['must' => $conditions]
		],
	];
	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'body'  => [
			'query' => ['bool' => $boolQuery]
		]
	];
	$client = ClientBuilder::create()->build();
	$response = $client->search($request);
	echo json_encode($response);
}

es5.0之后废弃了filtered关键词，进行了查询筛选合并，分为查询时筛选和查询后筛选，上列代码为查询时筛选，并没有写查询语句，单纯的筛选，外层的query\bool内使用filter关键词指明是过滤操作，内部使用bool关键词来进行条件合并，使用must关键词指明多个条件是“且”的过滤关系。

基本查询方式，会对文档进行打分，不能进行缓存

/**
 * Runs a bool/must query on the func_score index and echoes the response as JSON.
 *
 * Two term clauses are placed directly under `must`, with no filter clause.
 */
function base_search(){
	$mustClauses = [
		['term' => ['test' => 'asdaa']],
		['term' => ['popularity' => 22]],
	];
	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'body'  => [
			'query' => [
				'bool' => [
					'must' => $mustClauses,
				],
			]
		]
	];
	$client = ClientBuilder::create()->build();
	$response = $client->search($request);
	echo json_encode($response);
}

基本查询与基本过滤差不多，在外层的query\bool内不指定filter关键词即没有过滤操作，直接使用must关键词指明是多个查询且关系。

基本查询筛选

/**
 * Runs a bool query combining `must_not` clauses with a `filter` clause on the
 * func_score index and echoes the response as JSON.
 */
function base_search_filter(){
	// Term clauses listed under must_not.
	$excluded = [
		['term' => ['test' => 'asdaa']],
		['term' => ['popularity' => 22]],
	];
	// Single term clause used as the filter.
	$filter = [
		'term' => ['test' => 'aaaaa']
	];
	$request = [
		'index' => 'func_score',
		'type'  => 'doc',
		'body'  => [
			'query' => [
				'bool' => [
					'must_not' => $excluded,
					'filter'  => $filter
				],
			]
		]
	];
	$client = ClientBuilder::create()->build();
	$response = $client->search($request);
	echo json_encode($response);
}

查询时过滤，会先进行完过滤，然后对过滤后的结果进行查询，尽量使用这种方式，会提高性能

PHPerJiang

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Es6.x语法探索【结束】

2019-09-21script使用表达式expression进行打分 /** * 使用打分函数来进行排序 * @Author: jiangyu * @Time: 2019/9/20 18:00 */ function search_func_score(){ $params = [ 'index' => 'func_score', 'type' ...
复制链接

扫一扫