使用elasticsearch_dsl完成对ElasticSearch的复杂搜索

最新推荐文章于 2024-05-16 00:06:34 发布

天魔老师

最新推荐文章于 2024-05-16 00:06:34 发布

阅读量1k

点赞数 2

分类专栏： elasticsearch 文章标签： elasticsearch_dsl elasticsearch python 全文搜索 Q方法

本文链接：https://blog.csdn.net/weixin_39990025/article/details/99754989

版权

elasticsearch 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

先来看常用的、最基础的elasticsearch_dsl查询：

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

# Elasticsearch query setup
client = Elasticsearch()

# Match query
# s = Search(using=client, index='crowd_member', ).query("match", org_id=org_id). \
#     query("match", oid=oid).query("match", eid=eid)

# Multi-field match
# s = Search(using=client, index=index).query("multi_match", query='撒旦法撒sad富士达时代', \
#     fields=['title', 'content'])

# "in" query
# terms_dict = {'channelscopes.id': [3, 4, 5, 6]}
# s = Search(using=client, index=index).query('match', **query_dict).filter("terms", **terms_dict)

# Prefer filter over match whenever possible: match has to compute a
# relevance score, so it is slower.
# Filtering
s = (
    Search(using=client, index='crowd_member')
    .filter("term", org_id=org_id)
    .filter("term", oid=oid)
    .filter("term", eid=eid)
)

# Calls can be chained, which makes it easy to assemble a query piece by piece
if crowd_type:
    s = s.filter("term", crowd_type=crowd_type)
if crowd_id:
    s = s.filter("term", crowd_id=crowd_id)

# Aggregate (group) the results
s.aggs.bucket('name1', 'terms', field='_index')

# Nested object fields work too: group by 'channelscopes.target_type' and
# compute the average of "channelscopes.id" within each group
# s.aggs.bucket('name1', 'terms', field='channelscopes.target_type', ).\
#     metric("name2", "avg", field="channelscopes.id")

# Limit the number of hits returned
s = s.params(size=count_per_page)

# Pagination: fetch page page_no, with count_per_page hits per page
start_num = (int(page_no) - 1) * int(count_per_page)
end_num = int(page_no) * int(count_per_page)
s = s[start_num:end_num]

# Restrict _source to the listed fields, which speeds things up
s = s.source(['crowd_type', 'crowd_id', 'join_msg_id'])

# r = s.execute()            # execute; only ten hits are fetched by default
# print(r)
# print_json(r.to_dict())    # convert the result to a dict

# scan() iterates over every matching document. Its result is consumed
# directly instead of being assigned back to ``s``, so ``s`` stays a Search
# object that can still be refined with further query/aggs calls.
# get_result_list turns the hits into a list of per-hit value lists.
result_list = get_result_list(s.scan(), ['crowd_type', 'crowd_id'])

# A constructs the aggregation objects used below
from elasticsearch_dsl import A

# Highlight the title and content fields in the search hits
s = s.query(q).highlight('title', fragment_size=50).highlight('content', fragment_size=50)

# Group by object_type, keep the first three hits of every group, and
# highlight the title and content fields inside those hits
a = A('terms', field='object_type', size=10000)
a.bucket('first_hit', 'top_hits', size=3, highlight={'fields': {"content": {}, 'title': {}}})
s.aggs.bucket('object_type', a)

# Group by crowd_type and then by crowd_id, keep the first hit of every
# group, and highlight the title and content fields inside that hit
a = A('terms', field='crowd_type', size=10000)
a.bucket('crowd_id', 'terms', field='crowd_id', size=10000).bucket(
    'first_hit', 'top_hits', size=1, highlight={'fields': {"content": {}, 'title': {}}}
)
s.aggs.bucket('crowd_type', a)

def get_result_list(s, attr_list):
    """Convert an iterable of hits into a list of per-hit value lists.

    For every hit yielded by ``s``, the attributes named in ``attr_list`` are
    read in order. An attribute that the hit does not have is represented by
    an empty string.

    Args:
        s: Iterable of hit objects (for example the result of a scan).
        attr_list: Attribute names to extract from each hit.

    Returns:
        A list with one inner list per hit, holding the extracted values in
        the order given by ``attr_list``.
    """
    return [[getattr(hit, attr, '') for attr in attr_list] for hit in s]

前方高能：万能的Q方法

万能的Q方法，能用其他写法实现的，都可以使用Q方法实现，然后自由组合：

# Client and base search shared by the examples below
client = Elasticsearch()
s = Search(using=client, index='crowd_member')

# Query objects can be combined with logical operators. Any condition that Q
# can express can be composed freely out of the three forms shown here.
match_python = Q("match", title='python')
match_django = Q("match", title='django')

# or  -> {"bool": {"should": [...]}}
q = match_python | match_django
# and -> {"bool": {"must": [...]}}
q = match_python & match_django
# not -> {"bool": {"must_not": [...]}}
q = ~match_python

# Multi-field match, applied to the search
multi_match_fields = ['title', 'body']
q = Q("multi_match", query='python django', fields=multi_match_fields)
s = s.query(q)

进行条件嵌套时，需要使用字典解包（`**`），这样字段名就可以用变量传入。
下面是“or”里面嵌套“and”条件的写法：

def get_q_or_contain_and(list_x, name_list):
    """Build an "OR of ANDs" query from rows of field values.

    Every row of ``list_x`` becomes one AND group: for each position ``i`` in
    ``name_list`` a ``terms`` clause on field ``name_list[i]`` with the single
    value ``row[i]`` is wrapped in a ``bool`` filter, and the clauses of one
    row are combined with ``&``. The groups of all rows are then combined
    with ``|``.

    Args:
        list_x: Sequence of rows; each row must provide at least
            ``len(name_list)`` values, indexed by position.
        name_list: Field names, matched to the row values by position.

    Returns:
        The combined query object, or ``None`` when ``list_x`` or
        ``name_list`` is empty (there is nothing to match on).

    Raises:
        IndexError: If a row holds fewer values than ``name_list`` has names.
    """
    q = None
    for row in list_x:
        row_q = None
        for pos, name in enumerate(name_list):
            # Filter form; the match form would be Q("match", **{name: row[pos]})
            cond = Q('bool', filter=[Q('terms', **{name: [row[pos]]})])
            row_q = cond if row_q is None else row_q & cond

        if row_q is None:
            # No field names were given, so this row contributes no condition
            continue
        q = row_q if q is None else q | row_q
    return q

这是“or”里面嵌套“and”条件和“大于等于”条件的写法：

def get_q_or_contain_and_gt(list_x, name_list):
    """Build an "OR of ANDs" query whose last field is a ">=" range condition.

    Every row of ``list_x`` becomes one AND group. For each position ``i`` in
    ``name_list`` except the last, a ``terms`` clause on ``name_list[i]`` with
    the single value ``row[i]`` is wrapped in a ``bool`` filter. The last
    field name is instead turned into a ``range`` query with
    ``{"gte": row[i]}``. The clauses of one row are combined with ``&`` and
    the groups of all rows with ``|``.

    Args:
        list_x: Sequence of rows; each row must provide at least
            ``len(name_list)`` values, indexed by position.
        name_list: Field names, matched to the row values by position. The
            last name is the one compared with "greater than or equal".

    Returns:
        The combined query object, or ``None`` when ``list_x`` or
        ``name_list`` is empty (there is nothing to match on).

    Raises:
        IndexError: If a row holds fewer values than ``name_list`` has names.
    """
    q = None
    last_pos = len(name_list) - 1
    for row in list_x:
        row_q = None
        for pos, name in enumerate(name_list):
            if pos == last_pos:
                # Only the final field is a lower-bound range condition
                cond = Q('range', **{name: {"gte": row[pos]}})
            else:
                cond = Q('bool', filter=[Q('terms', **{name: [row[pos]]})])
            row_q = cond if row_q is None else row_q & cond

        if row_q is None:
            # No field names were given, so this row contributes no condition
            continue
        q = row_q if q is None else q | row_q
    return q

使用Q方法，进行非常复杂的组合：

    def _terms_filter(**field_values):
        # Wrap a terms clause in a bool query's filter list
        return Q('bool', filter=[Q('terms', **field_values)])

    # Query for everything that is not a message: the (crowd_type, crowd_id)
    # pair must be one of the allowed rows and the keyword must match the
    # title or the content
    name_list = ['crowd_type', 'crowd_id']
    q_in = get_q_or_contain_and(crowd_allow_list, name_list)
    q1 = Q("match", title=keyword)
    q2 = Q("match", content=keyword)
    keyword_q = q1 | q2
    q_one = q_in & keyword_q
    q_not_message = _terms_filter(owner_type=['topic', 'file']) & q_one

    # Query for messages: same idea, but the last field (owner_id) is a
    # "greater than or equal" condition
    name_list_2 = ['crowd_type', 'crowd_id', 'owner_id']
    q_in_gte = get_q_or_contain_and_gt(crowd_allow_list, name_list_2)
    q_two = q_in_gte & keyword_q
    q_message = _terms_filter(owner_type=['message']) & q_two

    if not owner_type:
        # No owner type requested: search messages and non-messages alike
        q = q_message | q_not_message
    elif owner_type in ['topic', 'file']:
        q = _terms_filter(owner_type=[owner_type]) & q_one
    elif owner_type in ['message', ]:
        q = q_message
        # Optional extra filters that only apply to messages
        if message_type:
            q = _terms_filter(message_type=[message_type]) & q
        if creator_id:
            q = _terms_filter(creator_id=[creator_id]) & q
    # NOTE(review): any other truthy owner_type leaves q unassigned by this
    # fragment — confirm whether that case can occur.

使用A方法，进行复杂的，聚合查询：

from elasticsearch_dsl import Search, Q, A

# Apply the query and request highlighting for title and content
s = s.query(q)
s = s.highlight('title', fragment_size=50)
s = s.highlight('content', fragment_size=50)

# Group by crowd_type, then by crowd_id, keeping the first hit of each group
a = A('terms', field='crowd_type', size=10000)
crowd_id_agg = a.bucket('crowd_id', 'terms', field='crowd_id', size=10000)
crowd_id_agg.bucket('first_hit', 'top_hits', size=1)
s.aggs.bucket('crowd_type', a)

r = s.execute()
hits_section = r.to_dict().get('hits')
hits = hits_section.get('hits')
total = hits_section.get('total').get('value')

if hits:
    # Reduce every hit to its object id plus one highlighted fragment,
    # preferring the content fragment over the title fragment.
    # NOTE(review): assumes each hit carries a 'highlight' entry with content
    # or title fragments — confirm against the real responses.
    id_key = object_type + '_id'
    summaries = []
    for hit in hits:
        object_id = hit.get('_source').get('object_id')
        highlight = hit.get('highlight')
        content_fragments = highlight.get('content')
        if content_fragments:
            fragment = content_fragments[0]
        else:
            fragment = highlight.get('title')[0]
        summaries.append({id_key: object_id, 'highlight': fragment})
    hits = summaries

还有这样的A方法，聚合结果高亮：

# Terms aggregation on object_type; every bucket keeps its first three hits,
# with the content and title fields highlighted inside those hits
highlight_spec = {'fields': {"content": {}, 'title': {}}}
a = A('terms', field='object_type', size=10000)
a.bucket('first_hit', 'top_hits', size=3, highlight=highlight_spec)
s.aggs.bucket('object_type', a)
r = s.execute()

总之就是，用kibana的控制台能实现的查询，都能用elasticsearch_dsl实现，而且语法更友好，还不容易出错。
多多查看源码，多尝试，多打印，就行了。
看过上面的例子，你应该已经明白如何进行复杂的elasticsearch查询了：就像堆乐高积木那样，一块一块堆起来就好了。
如果还不明白，可以点赞并留言，我如果有时间会看的。
最后，附上官方文档地址：
elasticsearch_dsl的官方文档地址：https://elasticsearch-dsl.readthedocs.io/

天魔老师

关注

2
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
使用elasticsearch_dsl完成对ElasticSearch的复杂搜索

这是一个进行后续处理的函数，后面会用到# 这是一个进行后续处理的函数，后面会用到def get_result_list(s, attr_list): result_list = [] for hit in s: little_list = [] for attr in attr_list: if hasattr(hit, ...
复制链接

扫一扫

专栏目录