ElasticSearch增删改查之python sort、scroll、scan_python elasticsearch scan-CSDN博客

本文链接：https://blog.csdn.net/u012089823/article/details/82258738

1、用python操作elasticsearch有两个库可以调用

# ElasticSearch不支持scroll（分页查询）查询
from pyelasticsearch import ElasticSearch
# Elasticsearch支持scroll查询，一般建议使用这个库
from elasticsearch import helpers,Elasticsearch

""" 注意：以上两个库各自在查询或更新传递的参数是不同的 """
# pyelasticsearch (`ElasticSearch`) usage: the query dict is passed as the
# first positional argument of search().
ES = ElasticSearch(URL)
res = ES.search(query, index=index, size=size)

# elasticsearch-py (`Elasticsearch`) usage: the query dict is passed through
# the `body` keyword argument of search(). The client must be built from the
# `Elasticsearch` class (lower-case "s"); `ElasticSearch` is the
# pyelasticsearch class, which takes the query positionally instead.
ES = Elasticsearch(URL)
res = ES.search(
    body=query,
    index=index,
    size=size,
)

ES中的高性能的部分大部分在helpers中实现

python2.7在使用helpers.scan时一定要注意，有个深坑：必须传递参数 preserve_order=True，不传递这个参数会报错TransportError(400, u'illegal_argument_exception')。这个参数表示查询时是否要保持排序；在查询会返回大量数据的情况下，对scroll返回的数据进行排序很耗费系统性能。不知为何python2.7下必须传递这个参数，而官方文档中这个参数是可选参数

如果要批量查询大量的数据，建议使用helpers.scan。helpers.scan返回的数据对象是迭代器，可以极大地节省内存空间，而且查询速度要远远快于search；search利用from、size参数控制返回数据的条数，配合scroll进行数据分页，也可以返回大量数据，但是search返回的数据是list的形式，如果一次需要返回的数据量比较大，则会十分耗费内存，而且数据传输速度也会比较慢
helpers使用列举样例

class MyConnection(RequestsHttpConnection):
    """Connection class that applies a ``proxies`` mapping to its session.

    Accepts the same arguments as ``RequestsHttpConnection`` plus an optional
    ``proxies`` keyword (defaults to an empty dict).
    """

    def __init__(self, *args, **kwargs):
        # Take `proxies` out of kwargs first so it is not forwarded to the
        # base-class constructor.
        proxy_map = kwargs.pop('proxies', {})
        super(MyConnection, self).__init__(*args, **kwargs)
        # `self.session` is presumably created by the base constructor
        # (a requests session) - confirm against the installed client version.
        self.session.proxies = proxy_map

# `proxies` is only needed when the cluster is reached through a proxy.
# According to the original author, using a socks5 proxy usually fails unless
# the SOCKS support package for `requests` is installed.
es_client = Elasticsearch(
    ["ip:port"],
    connection_class=MyConnection,
    proxies={'http': 'socks5://ip:port'},
    timeout=60,
    max_retries=10,
    retry_on_timeout=True,
)
# Python 3 form of the call. The original author reports that under Python 2
# preserve_order=True must also be passed, otherwise the call raises an error.
datas = helpers.scan(
    es_client,
    index=indexs,
    query=body,
    size=1000,
    request_timeout=100,
    _source=source,
)
datas为一个迭代器，会极大的节省内存空间
query：为查询条件
index： 可以为单个索引，也可以为索引的list列表
size： 每次迭代的数量
_source： 可以指定查询之后返回哪些字段，舍弃无用数据字段，可以加速数据返回的速度，
          处理数据时也可以节省内存，类型为list，如：source = ['name']

2、Elasticsearch中search scroll使用

scroll的优势：支持分页查询，自动排序，并把查询结果返回
scroll使用方式：每次查询获取下一次查询需要使用的scroll_id，查询时传递参数scroll='2m'，后台ES即可以将查询的结果保存2分钟
查询时常用技巧
1、将必须包含字段添加到 must中
2、将必须不包含字段添加到 must_not中
3、单一条件匹配选用 term，多个单一条件任何一个匹配选用 terms
4、from 指定从结果数据中的第多少条开始返回，from的最大值超不过2000，所以在使用大数据查询基本使用不上
5、size 指定结果数据中共返回多少条数据

# 在使用时一定要注意Elasticsearch与ElasticSearch还是有一定的区别的，传递参数不一样
from elasticsearch import Elasticsearch

# Address of the Elasticsearch endpoint (placeholder host) and the shared
# client built from it.
ES_SEARCH_HOSTURL = 'http://domain:9000/'
ES = Elasticsearch(ES_SEARCH_HOSTURL)

# Empty bool-query template: clauses are appended to "must" / "must_not"
# before the query is sent.
query = {
    "query": {
        "bool": {
            "must": [],
            "must_not": [],
        },
    },
}

# index可以为索引的列表或者单个索引，如果是索引的列表，则使用search时不能传递doc_type，也就是如果同时查询多个索引，不能指定文档的类型
def scroll_search(index, query, size, page):
    """Fetch one page of search results from Elasticsearch via the scroll API.

    The first page comes from the initial ``search`` call; each further page
    costs one ``scroll`` round trip, so page ``n`` needs ``n - 1`` extra
    requests. Scrolling stops early once a page comes back empty, and the
    scroll context is released before returning.

    :param index: str or list of str, the index name(s) to search
    :param query: dict, the request body holding the query
    :param size: int, number of hits per page (documented range: 1-100)
    :param page: int (> 0), 1-based number of the page to return; values
        below 1 return the first page
    :return: tuple ``(total, results)`` - ``hits.total`` exactly as reported
        by the server, and the list of ``_source`` documents of the requested
        page (empty when the page lies past the last hit)
    :raises Exception: errors raised by the client's ``search`` / ``scroll``
        calls propagate to the caller unchanged
    """
    import logging

    res = ES.search(
        index=index,
        scroll='5m',  # keep the scroll context alive for 5 minutes
        size=size,
        body=query,
        # Sort specification; several fields can be given comma-separated,
        # e.g. "modified:desc,_score:desc" ("modified" is a document field).
        # Per the original notes, sort="_doc" lets Elasticsearch pick the
        # cheapest order, and search_type='scan' was suggested when ordering
        # does not matter.
        sort="modified:desc",
    )
    sid = res['_scroll_id']  # handle needed to request the next page
    try:
        # NOTE(review): newer Elasticsearch versions may report hits.total as
        # an object rather than an int - the value is passed through as-is.
        total = res['hits']['total']
        hits = res["hits"]["hits"]  # the initial search already holds page 1

        current_page = 1
        # An empty page means the scroll is exhausted; further requests
        # would only return empty pages as well.
        while page > current_page and hits:
            res = ES.scroll(scroll_id=sid, scroll='2m')
            sid = res['_scroll_id']
            hits = res["hits"]["hits"]
            current_page += 1

        results = [hit["_source"] for hit in hits]
    finally:
        # Release the server-side scroll context instead of leaving it to
        # expire. This is best-effort: a cleanup failure must not mask the
        # query result or the original exception.
        try:
            ES.clear_scroll(scroll_id=sid)
        except Exception:
            logging.getLogger(__name__).warning(
                "Failed to clear scroll context", exc_info=True
            )

    return total, results

# `terms` clause: `categories` is a list, and a document matches when any one
# of the listed values matches. The clause is added to the "must" list of the
# bool query.
temp = {"terms": {"categories": categories}}
query["query"]["bool"]["must"].append(temp)

3、Elasticsearch中update局部更新

单条数据进行更新

""" 功能：从多个索引中查询需要更新的对应数据的id，再更新此数据 """
from elasticsearch import Elasticsearch

# Address of the Elasticsearch endpoint (placeholder host) and the client
# used for both the lookup and the update.
ES_SEARCH_HOSTURL = 'http://domain:9000/'
ES = Elasticsearch(ES_SEARCH_HOSTURL)

# Indices that are searched, in order, for the document to update.
indexs = [index1, index2]

# Bool-query template; the "must" clauses identifying the document are
# appended before searching.
query = {
    "query": {
        "bool": {
            "must": [],
        },
    },
}

for index in indexs:
	try:
		res = ES.search(body=query, index=index, doc_type='info')
	except Exception as e:
		print(e)
		# logger.error("Request search_indicator function error. Error: %s" % e)
		message = "Internal server error"
		results = data_formatter(message=message)
		return Response(results, status=500)
	else:
		if res["hits"]["total"] > 0:
			hits = res["hits"]["hits"][0]
			update_id = hits["_id"]

			try:
                                # 注意，如果ES为pyelasticsearch的对象，则需要更新的参数传递形式应该为doc= {"revoked": revoked}
				ES.update(index=index, doc_type='indicator_info', id=update_id, body={"doc": {"revoked": revoked}})
			except Exception as e:
				print(e)
				message = "Update {} failed".format(id)
				results = data_formatter(message=message)
				return Response(results, status=500)
			else:
				results = data_formatter()
				return Response(results, status=200)

数据批量更新

from datetime import datetime
def get_now_time():
    """Return the current UTC time as 'YYYY-MM-DDTHH:MM:SS.000Z'.

    The millisecond part is always the literal ``000``.
    """
    utc_now = datetime.utcnow()
    return utc_now.strftime('%Y-%m-%dT%H:%M:%S') + '.000Z'

# Update-by-query request: set `revok` to True on every document whose `date`
# is at or before the current time, and stamp `mod` with that time.
# Per the original note, the statements that update several fields must be
# separated by semicolons with no extra spaces in between.
now_time = get_now_time()
body = {
    "script": {
        "inline": "ctx._source.revok = params.revok;ctx._source.mod = params.mod;",
        # The keys must match the names the script reads (`params.revok`,
        # `params.mod`); a differently named key is never seen by the script.
        "params": {
            "revok": True,
            "mod": now_time,
        },
        "lang": "painless",
    },
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "date"}},
                {"range": {"date": {"lte": now_time}}},
            ],
        },
    },
}

# According to the original author, the Python 2 client has no
# update_by_query method, so the request has to be sent as a raw HTTP POST;
# under Python 3 the client's update_by_query method can be called directly.
url = 'http://ip:port/{}/{}/_update_by_query'.format(index, doc)  # arguments: index name, document type