先来,常用的、最基础的elasticsearch_dsl查询:
from elasticsearch import Elasticsearch
from elasticsearch_dsl import A, Q, Search
# Build an ES query with elasticsearch_dsl.
client = Elasticsearch()
# Match query (relevance-scored):
# s = Search(using=client, index='crowd_member').query("match", org_id=org_id) \
#     .query("match", oid=oid).query("match", eid=eid)
# Multi-field match:
# s = Search(using=client, index=index).query("multi_match", query='撒旦法撒sad富士达时代',
#                                             fields=['title', 'content'])
# "in" query:
# terms_dict = {'channelscopes.id': [3, 4, 5, 6]}
# s = Search(using=client, index=index).query('match', **query_dict).filter("terms", **terms_dict)
# Prefer filter over match where possible: match computes relevance scores
# and is therefore slower.
s = (
    Search(using=client, index='crowd_member', )
    .filter("term", org_id=org_id)
    .filter("term", oid=oid)
    .filter("term", eid=eid)
)
# Filters chain fluently, which makes conditional composition easy.
for field_name, field_value in (('crowd_type', crowd_type), ('crowd_id', crowd_id)):
    if field_value:
        s = s.filter("term", **{field_name: field_value})
# Aggregation: bucket (group) the results, here by index name.
s.aggs.bucket('name1', 'terms', field='_index', )
# Nested objects work too: group by 'channelscopes.target_type' and take
# the per-bucket average of 'channelscopes.id':
# s.aggs.bucket('name1', 'terms', field='channelscopes.target_type', ).\
# metric("name2", "avg", field="channelscopes.id")
# Cap the number of returned documents.
s = s.params(size=count_per_page)
# Pagination: fetch page `page_no`, `count_per_page` items per page.
start_num = (int(page_no) - 1) * int(count_per_page)
end_num = int(page_no) * int(count_per_page)
# NOTE(review): the explicit step of 1 is redundant; s[start:end] suffices.
s = s[start_num:end_num:1]
# Restrict _source to specific fields - smaller responses, faster queries.
s = s.source(['crowd_type', 'crowd_id', 'join_msg_id'])
# r = s.execute()  # execute; returns only 10 hits by default
# print(r)
# print_json(r.to_dict())  # convert the response to a dict
# NOTE(review): scan() returns a generator of hits, not a Search object,
# so no further Search methods can be chained on `s` after this line.
s = s.scan()  # stream ALL matching documents
get_result_list(s, ['crowd_type', 'crowd_id'])  # flatten hits into a list of rows
# Highlight the title and content fields in search results.
s = s.query(q).highlight('title', fragment_size=50).highlight('content', fragment_size=50)
# Aggregate, keep the top 3 hits per bucket, and highlight title/content.
a = A('terms', field='object_type', size=10000)
a.bucket('first_hit', 'top_hits', size=3, highlight={'fields': {"content": {}, 'title': {}}})
s.aggs.bucket('object_type', a)
# Group by crowd_type then crowd_id, keep the first hit of each bucket,
# and highlight the title and content fields.
a = A('terms', field='crowd_type', size=10000)
# BUGFIX: the chained call was split after the '.' with no line
# continuation, which is a SyntaxError; continue the line explicitly.
a.bucket('crowd_id', 'terms', field='crowd_id', size=10000) \
    .bucket('first_hit', 'top_hits', size=1, highlight={'fields': {"content": {}, 'title': {}}})
s.aggs.bucket('crowd_type', a)
def get_result_list(s, attr_list):
    """Flatten search hits into a list of rows.

    Args:
        s: iterable of hit objects (e.g. a Search result or scan generator).
        attr_list: attribute names to extract from each hit, in order.

    Returns:
        One inner list per hit; attributes a hit lacks are replaced with
        '' so every row has exactly len(attr_list) entries.
    """
    # getattr with a default replaces the hasattr/getattr LBYL dance,
    # and the comprehension replaces the manual append loop.
    return [[getattr(hit, attr, '') for attr in attr_list] for hit in s]
前方高能:万能的 Q 方法。
凡是能用其他写法实现的查询,都可以改用 Q 方法实现,然后自由组合:
client = Elasticsearch()
s = Search(using=client, index='crowd_member')
# Query objects combine with logical operators; these three forms can be
# composed into arbitrarily complex conditions.
# NOTE(review): each assignment below overwrites `q`; only the last value
# is actually applied by s.query(q).
# OR
q = Q("match", title='python') | Q("match", title='django')
# {"bool": {"should": [...]}}
# AND
q = Q("match", title='python') & Q("match", title='django')
# {"bool": {"must": [...]}}
# NOT
q = ~Q("match", title="python")
# {"bool": {"must_not": [...]}}
q = Q("multi_match", query='python django', fields=['title', 'body'])
s = s.query(q)
进行条件嵌套时,需要使用字典解包(**)把参数变量化。
下面是“or”里面嵌套“and”条件的写法:
def get_q_or_contain_and(list_x, name_list):
    """Build an OR-of-ANDs query object.

    For each row in ``list_x``, an AND (``&``) of one filter per field is
    built: ``row[i]`` is matched against field ``name_list[i]`` with a
    ``terms`` filter. The per-row conditions are then OR-ed (``|``).

    Args:
        list_x: rows of values; each row must have at least
            ``len(name_list)`` entries.
        name_list: field names, positionally matched to row values.

    Returns:
        The combined Q object, or None when ``list_x`` is empty.
    """
    # Removed a stray debug print; iterate rows directly instead of
    # indexing via range(len(...)).
    q = None
    for row in list_x:
        row_q = None
        for i, name in enumerate(name_list):
            # terms filter - no relevance scoring, faster than match.
            # Match alternative: Q("match", **{name: row[i]})
            field_q = Q('bool', filter=[Q('terms', **{name: [row[i]]})])
            row_q = field_q if row_q is None else row_q & field_q
        if row_q is not None:  # guard against empty name_list (was a NameError)
            q = row_q if q is None else q | row_q
    return q
下面是“or”里面嵌套“and”条件和“大于等于”条件的写法:
def get_q_or_contain_and_gt(list_x, name_list):
    """Build an OR-of-ANDs query whose LAST field uses >= comparison.

    Same shape as get_q_or_contain_and, except that the final field in
    ``name_list`` is compared with a ``range``/``gte`` condition instead
    of an exact ``terms`` filter.

    Args:
        list_x: rows of values; each row must have at least
            ``len(name_list)`` entries.
        name_list: field names, positionally matched to row values; the
            last one becomes the "greater than or equal" bound.

    Returns:
        The combined Q object, or None when ``list_x`` is empty.
    """
    # Removed a stray debug print; iterate rows directly instead of
    # indexing via range(len(...)).
    last = len(name_list) - 1
    q = None
    for row in list_x:
        row_q = None
        for i, name in enumerate(name_list):
            if i == last:
                # range filter: field >= row[i]
                field_q = Q('range', **{name: {"gte": row[i]}})
            else:
                field_q = Q('bool', filter=[Q('terms', **{name: [row[i]]})])
            row_q = field_q if row_q is None else row_q & field_q
        if row_q is not None:  # guard against empty name_list (was a NameError)
            q = row_q if q is None else q | row_q
    return q
使用Q方法,进行非常复杂的组合:
# Query for non-message objects.
name_list = ['crowd_type', 'crowd_id']
q_in = get_q_or_contain_and(crowd_allow_list, name_list)
q1 = Q("match", title=keyword)
q2 = Q("match", content=keyword)
q_one = q_in & (q1 | q2)
q_not_message = Q('bool', filter=[Q('terms', owner_type=['topic', 'file'])]) & q_one
# Query for message objects (owner_id additionally bounded with >=).
name_list_2 = ['crowd_type', 'crowd_id', 'owner_id']
q_in_gte = get_q_or_contain_and_gt(crowd_allow_list, name_list_2)
q1 = Q("match", title=keyword)
q2 = Q("match", content=keyword)
# q = q1
# q = q_in
q_two = q_in_gte & (q1 | q2)
q_message = Q('bool', filter=[Q('terms', owner_type=['message'])]) & q_two
# NOTE(review): if owner_type is truthy but not one of
# 'topic'/'file'/'message', neither branch assigns `q`, so later use of
# `q` raises NameError. The trailing `pass` statements are redundant.
if owner_type:
    if owner_type in ['topic', 'file']:
        q = Q('bool', filter=[Q('terms', owner_type=[owner_type])]) & q_one
    if owner_type in ['message', ]:
        q = q_message
        if message_type:
            q = Q('bool', filter=[Q('terms', message_type=[message_type])]) & q
            pass
        if creator_id:
            q = Q('bool', filter=[Q('terms', creator_id=[creator_id])]) & q
            pass
else:
    q = q_message | q_not_message
使用A方法,进行复杂的,聚合查询:
from elasticsearch_dsl import Search, Q, A
# Highlight title/content, then aggregate by crowd_type -> crowd_id and
# keep the first hit of each bucket.
s = s.query(q).highlight('title', fragment_size=50).highlight('content', fragment_size=50)
a = A('terms', field='crowd_type', size=10000)
a.bucket('crowd_id', 'terms', field='crowd_id', size=10000).bucket('first_hit', 'top_hits', size=1)
s.aggs.bucket('crowd_type', a)
r = s.execute()
# NOTE(review): to_dict() is called twice; converting once and reusing
# the dict would be cheaper.
hits = r.to_dict().get('hits').get('hits')
total = r.to_dict().get('hits').get('total').get('value')
if hits:
    # Reshape each hit to {<object_type>_id: ..., 'highlight': ...},
    # preferring the content highlight and falling back to the title one.
    # NOTE(review): i.get('highlight') returns None for hits without a
    # highlight section, which would raise AttributeError here - confirm
    # every hit is guaranteed a highlight.
    hits = [{
        object_type + '_id': i.get('_source').get('object_id'),
        'highlight': i.get('highlight').get('content')[0] if i.get('highlight').get('content') else
        i.get('highlight').get('title')[0]
    } for i in hits]
还有这样的A方法,聚合结果高亮:
# Aggregate by object_type; keep the top 3 hits per bucket, with
# highlighted title/content fragments attached to each hit.
object_type_agg = A('terms', field='object_type', size=10000)
object_type_agg.bucket(
    'first_hit', 'top_hits', size=3,
    highlight={'fields': {"content": {}, 'title': {}}},
)
s.aggs.bucket('object_type', object_type_agg)
r = s.execute()
总之就是,用kibana的控制台能实现的查询,都能用elasticsearch_dsl实现,而且语法更友好,还不容易出错。
多多查看源码,多尝试,多打印,就行了。
看过上面的例子,你应该明白了,如何进行复杂的elasticsearch查询,就是堆乐高积木那样堆起来就好了。
如果还不明白,可以点赞并留言,我如果有时间会看的。
最后,附上官方文档地址:
https://elasticsearch-dsl.readthedocs.io/