目录
聚合嵌套(嵌套时支持bucket、metric、pipeline)
在使用Python客户端操作ES时,需要编写ES DSL语句对ES进行查询操作。但体验下来,发现以下缺点:
字段冗长,容易出现语法错误,如不正确的嵌套,难以修改(如添加另一个过滤器)
基于此,了解到了 elasticsearch-dsl 库。
该库将传统的JSON DSL语句映射为Python类和对象操作,两者可以相互转换,从而减少手动编写JSON DSL语句的繁琐,提高可维护性。
一、创建连接(可选)
dsl库支持查询,并序列化结果。
若仅为了生成DSL语句,则无需创建连接。
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
# Pass a full URL including the scheme: elasticsearch-py 8.x rejects a bare
# "host:port" string, while 7.x accepts the URL form as well.
es = Elasticsearch(hosts="http://127.0.0.1:9200")
# Bind the search to the client and to the index it should query.
s = Search(using=es, index="test_index")
二、基本查询(Q方法)
2.1 query
空查询
# 空查询
s = Search().query(Q())
{
"query": {
"match_all": {}
}
}
基于字典生成Search对象
# 基于字典生成Search对象
s = Search.from_dict({"track_total_hits": True}).query(Q())
{
"query": {
"match_all": {}
},
"track_total_hits": true
}
Query查询语句
# query查询
s = Search.from_dict({"track_total_hits": True}).query(Q())
s = s.query("multi_match", query='python django', fields=['title', 'body'])
{
"query": {
"multi_match": {
"query": "python django",
"fields": [
"title",
"body"
]
}
},
"track_total_hits": true
}
组合查询
should
# should
s = Search().query(Q("match", title='python') | Q("match", title='django'))
{
"query": {
"bool": {
"should": [
{
"match": {
"title": "python"
}
},
{
"match": {
"title": "django"
}
}
]
}
}
}
must
# must
s = Search().query(Q("match", title='python') & Q("match", title='django'))
{
"query": {
"bool": {
"must": [
{
"match": {
"title": "python"
}
},
{
"match": {
"title": "django"
}
}
]
}
}
}
must_not
# must_not
s = Search().query(~Q("match", title='python'))
{
"query": {
"bool": {
"must_not": [
{
"match": {
"title": "python"
}
}
]
}
}
}
2.2 filter
空查询
# 空查询
s = Search().filter(Q())
{
"query": {
"bool": {
"filter": [
{
"match_all": {}
}
]
}
}
}
terms
# terms
s = Search().filter('terms', tags=['search', 'python'])
{
"query": {
"bool": {
"filter": [
{
"terms": {
"tags": [
"search",
"python"
]
}
}
]
}
}
}
2.3 分页
s = Search().query(Q())
s = s[0:10]
{
"query": {
"match_all": {}
},
"from": 0,
"size": 10
}
2.4 排序
位置参数:字段名前加“-”前缀表示按该字段降序,不加前缀则使用默认排序;也可以直接传入字典指定排序细节
s = Search().sort(
'category',
'-title',
{"lines": {"order": "asc", "mode": "avg"}}
)
{
"sort": [
"category",
{
"title": {
"order": "desc"
}
},
{
"lines": {
"order": "asc",
"mode": "avg"
}
}
]
}
2.5 限制返回字段
指定显示字段
# 指定显示字段
s = Search().query(Q())
s = s.source(["title", "body"])
{
"query": {
"match_all": {}
},
"_source": [
"title",
"body"
]
}
指定包含和不包含字段
# Select the fields to include in / exclude from the returned _source.
# "includes" / "excludes" are the spellings documented by elasticsearch-dsl;
# the singular "include" / "exclude" forms are deprecated.
s = Search().query(Q())
s = s.source(includes=["title"], excludes=["user.*"])
{
"query": {
"match_all": {}
},
"_source": {
"includes": [
"title"
],
"excludes": [
"user.*"
]
}
}
仅返回元数据
# 仅返回元数据
s = Search().query(Q())
s = s.source(False)
{
"query": {
"match_all": {}
},
"_source": false
}
2.6 更新已存在的语句
s = Search.from_dict({"track_total_hits": True}).query(Q())
s.update_from_dict({"query": {"match": {"title": "Django"}}, "size": 42})
{
"query": {
"match": {
"title": "Django"
}
},
"track_total_hits": true,
"size": 42
}
2.7 复杂类型字段查询
q = Q('nested', path='source.ip', query=Q('terms', **{'source.ip': ['1.1.1.1', '2.2.2.2']}))
s = Search().filter(q)
{
"query": {
"bool": {
"filter": [
{
"nested": {
"path": "source.ip",
"query": {
"terms": {
"source.ip": [
"1.1.1.1",
"2.2.2.2"
]
}
}
}
}
]
}
}
}
2.8 额外的属性和参数
extra(添加额外属性)
s = Search().query(Q())
s = s.extra(track_total_hits=True)
{
"query": {
"match_all": {}
},
"track_total_hits": true
}
三、深度分页查询
3.1 scroll
elasticsearch库实现,本文不具体说明。
3.2 search_after
# 1. 查询前10000条数据,并根据_id(任何唯一值)排序
s = Search().query(Q()).sort("-date", "-_id")[:10000]
# 2. 取第10000个排序后值,据此实时查询后1W数据
s = s.extra(search_after=[202207121139, "kafoiheufqwafw"])[:10000]
search_after查询时,from只能为0或-1。
{
"query": {
"match_all": {}
},
"sort": [
{
"date": {
"order": "desc"
}
},
{
"_id": {
"order": "desc"
}
}
],
"from": 0,
"size": 10000
}
{
"query": {
"match_all": {}
},
"sort": [
{
"date": {
"order": "desc"
}
},
{
"_id": {
"order": "desc"
}
}
],
"search_after": [
202207121139,
"kafoiheufqwafw"
],
"from": 0,
"size": 10000
}
四、聚合查询(A方法)
4.1 桶(bucket)
s = Search().query(Q())
s.aggs.bucket("agg_name", A("terms", field="type"))
{
"query": {
"match_all": {}
},
"aggs": {
"agg_name": {
"terms": {
"field": "type"
}
}
}
}
聚合嵌套(嵌套时支持bucket、metric、pipeline)
s.aggs.bucket("groupDate", "date_histogram", field="update_time", interval="1d", format="yyyy-MM-dd") \
.bucket("op_agg", "terms", field="op_state") \
.metric("avg_price", "avg", field="price")
{
"query": {
"match_all": {}
},
"aggs": {
"groupDate": {
"date_histogram": {
"field": "update_time",
"interval": "1d",
"format": "yyyy-MM-dd"
},
"aggs": {
"op_agg": {
"terms": {
"field": "op_state"
},
"aggs": {
"avg_price": {
"avg": {
"field": "price"
}
}
}
}
}
}
}
}
五、附录
5.1 根据指定规则生成DSL脚本
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import json
from elasticsearch_dsl import Search, Q, A
class GenerateDSL:
    """Incrementally build an Elasticsearch query with elasticsearch-dsl.

    The ``Search`` object under construction is kept in ``self.base_dsl_obj``.
    Each ``init_*`` / ``generate_*`` method updates that object and returns
    the current DSL as a plain ``dict``.
    """

    def __init__(self):
        # Start from an empty search so the generate_* methods can be used
        # even when init_base_dsl_obj() is never called.
        self.base_dsl_obj = Search()

    def init_base_dsl_obj(
        self,
        s_time_dict: dict = None,
        e_time_dict: dict = None,
        size: int = 10000,
        sort: list = None,
        source_fields: list = None,
    ):
        """Reset ``base_dsl_obj`` to a fresh base query and return it as a dict.

        The base query always sets ``track_total_hits`` to ``True`` and is
        sliced to ``[0:size]``.

        :param s_time_dict: lower time bound, e.g.
            ``{'time_key': {'gte': "value", "format": "yyyy-MM-dd HH:mm:ss"}}``
        :param e_time_dict: upper time bound, e.g.
            ``{'time_key': {'lte': "value", "format": "yyyy-MM-dd HH:mm:ss"}}``
        :param size: number of hits to return
        :param sort: sort spec, e.g.
            ``["category", "-age", {"price": {"order": "asc", "mode": "avg"}}]``
        :param source_fields: fields to return, e.g. ``["name", "age"]``
        :return: the resulting DSL as a dict
        """
        search = Search().update_from_dict({"track_total_hits": True})

        # Time range: both bounds are AND-ed into one filter clause; a single
        # bound is used on its own.
        if s_time_dict and e_time_dict:
            search = search.filter(
                Q("range", **s_time_dict) & Q("range", **e_time_dict)
            )
        elif s_time_dict or e_time_dict:
            search = search.filter(Q("range", **(s_time_dict or e_time_dict)))

        if sort:
            search = search.sort(*sort)
        if source_fields:
            search = search.source(source_fields)

        self.base_dsl_obj = search[0:size]
        return self.base_dsl_obj.to_dict()

    @staticmethod
    def _build_field_queries(key: str, rule: dict) -> list:
        """Return the list of leaf queries described by one rule entry.

        A key containing a dot is treated as a field of a nested object: every
        leaf query is wrapped in a ``nested`` query whose path is everything
        before the last dot (assumes the immediate parent is the nested
        object -- confirm against the index mapping). String values containing
        ``*`` or ``?`` become ``wildcard`` queries; everything else uses
        ``rule["keyword"]`` (``"term"`` by default, ``"match"`` allowed).

        :raises ValueError: if ``rule["keyword"]`` is neither "match" nor "term"
        """
        keyword = rule.get("keyword", "term")
        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
        if keyword not in ("match", "term"):
            raise ValueError("keyword ValueError")

        values = rule.get("value")
        if values is None:
            values = []
        elif not isinstance(values, (list, tuple, set, frozenset)):
            # A bare scalar (notably a str) is one value, not an iterable of
            # characters.
            values = [values]

        # rsplit keeps multi-level keys such as "a.b.c" from failing to unpack.
        nested_path = key.rsplit(".", 1)[0] if "." in key else None

        queries = []
        for item in values:
            is_wildcard = isinstance(item, str) and ("*" in item or "?" in item)
            leaf = Q("wildcard" if is_wildcard else keyword, **{key: item})
            if nested_path is not None:
                leaf = Q("nested", path=nested_path, query=leaf)
            queries.append(leaf)
        return queries

    def generate_query_dsl(self, rule_dict: dict):
        """Append a ``bool`` query built from ``rule_dict`` to ``base_dsl_obj``.

        Rule format::

            {
                "alert_level": {            # target field (simple structure)
                    "value": [3, 4, 5],     # values to match, one query each
                    "switch": "and"         # "and" -> must, "or" -> should,
                },                          # "not" -> must_not (default "and")
                "source.ip": {              # target field (nested structure)
                    "value": ["0.0.0.0", "1.1.1.1"],
                    "switch": "or"
                }
            }

        An optional ``"keyword"`` entry per rule selects ``"term"`` (default)
        or ``"match"``.

        :param rule_dict: rules in the format above
        :return: the resulting DSL as a dict
        :raises ValueError: on an unsupported ``switch`` or ``keyword``
        """
        clauses = {"and": [], "or": [], "not": []}
        for key, rule in rule_dict.items():
            switch = rule.get("switch", "and")
            if switch not in clauses:
                raise ValueError("不支持的 switch")
            clauses[switch].extend(self._build_field_queries(key, rule))

        self.base_dsl_obj = self.base_dsl_obj.query(
            Q(
                "bool",
                must=clauses["and"],
                should=clauses["or"],
                must_not=clauses["not"],
            )
        )
        return self.base_dsl_obj.to_dict()

    def generate_bucket_dsl(
        self,
        bucket_name: str,
        aggr_type: str,
        aggr_field: str = None,
        sup_bucket_name: str = None,
        aggr_keyword: dict = None,
    ):
        """Add a bucket aggregation to ``base_dsl_obj`` and return the DSL.

        :param bucket_name: name of the new bucket
        :param aggr_type: aggregation type, e.g. ``"terms"``
        :param aggr_field: field to aggregate on; takes precedence over a
            ``"field"`` entry in ``aggr_keyword``
        :param sup_bucket_name: name of the parent bucket to nest under; it
            must already have been added to the search
        :param aggr_keyword: extra keyword arguments for the aggregation
        :return: the resulting DSL as a dict
        """
        # Work on a copy: the caller's dict must not be mutated, and
        # aggr_keyword may legitimately be None.
        params = dict(aggr_keyword) if aggr_keyword else {}
        keyword_field = params.pop("field", None)
        field = aggr_field or keyword_field
        # Only pass ``field`` when there is one, so field-less aggregations
        # do not serialize a ``"field": null`` entry.
        if field is not None:
            params["field"] = field
        aggregation = A(aggr_type, **params)

        if sup_bucket_name:
            self.base_dsl_obj.aggs[sup_bucket_name].bucket(bucket_name, aggregation)
        else:
            self.base_dsl_obj.aggs.bucket(bucket_name, aggregation)
        return self.base_dsl_obj.to_dict()

    @staticmethod
    def to_dict(es_type="or", **kwargs):
        """Convert keyword arguments into the rule format of generate_query_dsl.

        Every keyword with a truthy value becomes
        ``{key: {"switch": es_type, "value": value}}``; falsy values are
        skipped.

        :param es_type: logical operator, one of "or", "and", "not"
        :param kwargs: field name -> value(s) to match
        :return: rule dict
        """
        return {
            key: {"switch": es_type, "value": value}
            for key, value in kwargs.items()
            if value
        }
dsl_obj = GenerateDSL()
使用
# Build the base DSL: filter on birthday, sort by age, return only name/age.
dsl_obj.init_base_dsl_obj(
    s_time_dict={"birthday": {"gte": "1999-01-01", "format": "yyyy-MM-dd"}},
    sort=["age"],
    source_fields=["name", "age"],
)

# Turn keyword arguments into the rule format and add them to the query.
rules = dsl_obj.to_dict("and", name=["张三", "李四"], address=["北京"])
dsl_obj.generate_query_dsl(rules)

# Pretty-print the final DSL, keeping non-ASCII characters readable.
print(
    json.dumps(
        dsl_obj.base_dsl_obj.to_dict(),
        indent=4,
        ensure_ascii=False,
    )
)
输出
{
"query": {
"bool": {
"filter": [
{
"range": {
"birthday": {
"gte": "1999-01-01",
"format": "yyyy-MM-dd"
}
}
}
],
"must": [
{
"term": {
"name": "张三"
}
},
{
"term": {
"name": "李四"
}
},
{
"term": {
"address": "北京"
}
}
]
}
},
"sort": [
"age"
],
"track_total_hits": true,
"from": 0,
"size": 10000,
"_source": [
"name",
"age"
]
}