es scroll 时间_es查询：scroll_scan使用方法-CSDN博客

本文链接：https://blog.csdn.net/weixin_39582724/article/details/113895441

需求大概是：需要实时地推送日志系统的日志消息，提供给其他同事查询使用。每次运行时查询当前时间之前一分钟的数据；因为数据量大，考虑使用 es 的 scroll_scan 方法。

代码：

#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
Demo: query data from Elasticsearch.

Because ES data arrives with some delay, the data from one minute ago is
queried rather than the most recent minute.
"""
import sys
import os
import requests

# Put the parent of this file's directory first on the import path.
setting_path = (os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
sys.path.insert(0, setting_path)

import json
import datetime
import logging

# Logger shared by the request helpers in this file.
logger = logging.getLogger('demo')

# URL prefix that every ES request in this file is built on.
# NOTE(review): the host looks redacted ("XXX") -- replace before running.
ES_URL_PRE = "https://XXX/es/"

def request_get(url, params, timeout=2):
    """Uniform wrapper around ``requests.get`` with error handling.

    Note: meant for CGI-style endpoints whose response body is a JSON string.

    Args:
        url: Endpoint to request.
        params: Query-string parameters forwarded to ``requests.get``.
        timeout: Timeout in seconds forwarded to ``requests.get``.

    Returns:
        Whatever ``r.json()`` decodes when the response status is OK;
        an empty dict when the request fails, the status is not OK, or the
        body cannot be decoded as JSON. Failures are logged, never raised.
    """
    try:
        r = requests.get(url=url, params=params, timeout=timeout)
        if r.ok:
            return r.json()
        # Only the first 200 characters of the body are logged.
        logger.error('{0} faild, code: {1}, cause: {2}'
                     .format(url, r.status_code, r.text[:200]))
    except requests.exceptions.ConnectionError:
        logger.exception('connection error: %s' % (url, ))
    except requests.exceptions.RequestException:
        logger.exception('request {0} error'.format(url))
    except ValueError:
        # r.json() signals an undecodable body with ValueError; without this
        # handler it would propagate to callers that expect a dict back.
        logger.exception('request {0} returned invalid json'.format(url))
    return {}

def requests_post(url, data, timeout=2):
    """Uniform wrapper around ``requests.post`` with error handling.

    Note: meant for CGI-style endpoints whose response body is a JSON string.

    Args:
        url: Endpoint to request.
        data: Request body forwarded to ``requests.post``.
        timeout: Timeout in seconds forwarded to ``requests.post``.

    Returns:
        Whatever ``r.json()`` decodes when the response status is OK;
        an empty dict on any failure. Failures are logged, never raised.
    """
    try:
        r = requests.post(url=url, data=data, timeout=timeout)
        if r.ok:
            return r.json()
        # Cap the logged body at 200 characters, as request_get does, so a
        # large error page does not flood the log.
        logger.error('{0} faild, code: {1}, cause: {2}'
                     .format(url, r.status_code, r.text[:200]))
    except requests.exceptions.ConnectionError:
        logger.exception('connection error: %s' % (url, ))
    except Exception:
        # Deliberately broad: this helper is best-effort and must hand back
        # a dict whatever goes wrong (including an undecodable body).
        logger.exception('request {0} error'.format(url))
    return {}

def gen_index(date, name="logstash"):
    """Build a daily index name of the form ``<name>-YYYY.MM.DD``.

    Args:
        date: Any object with ``strftime`` (``datetime.date`` or
            ``datetime.datetime``) giving the day of the index.
        name: Index name prefix.

    Returns:
        The index name as a string, e.g. ``logstash-2021.02.03``.
    """
    return '{0}-{1}'.format(name, date.strftime('%Y.%m.%d'))

def get_exact_index_name(from_time, to_time, name="logstash"):
    """Return the daily index name(s) covering the given time range.

    Both bounds are shifted back by 8 hours before the day is taken.
    NOTE(review): presumably the inputs are UTC+8 local times and the
    indices are named by UTC date -- confirm against the ES setup.

    Args:
        from_time: ``datetime.datetime`` start of the range.
        to_time: ``datetime.datetime`` end of the range.
        name: Index name prefix passed to ``gen_index``.

    Returns:
        A single index name when both shifted bounds fall on the same day
        (or ``to_time`` is earlier); otherwise a comma-separated list with
        one index per day from the first day through the last.
    """
    offset = datetime.timedelta(hours=8)
    shifted_from = from_time - offset
    shifted_to = to_time - offset

    # Compare calendar dates, not day-of-month numbers: subtracting ``.day``
    # values goes negative across a month boundary (31st -> 1st) and would
    # drop the earlier day's index.
    span_days = (shifted_to.date() - shifted_from.date()).days
    if span_days < 1:
        return gen_index(shifted_to, name)

    return ",".join(
        gen_index(shifted_from + datetime.timedelta(days=idx), name)
        for idx in range(span_days + 1)
    )

def get_query_data(from_time, to_time, should_terms):
    """Build the JSON request body for the time-range search.

    The body is a ``filtered`` query: a ``range`` query on ``@timestamp``
    (``from_time`` inclusive, ``to_time`` exclusive, interpreted with the
    ``+08:00`` time zone) combined with a ``bool``/``should`` filter made of
    one ``term`` clause per entry of ``should_terms``.

    Args:
        from_time: Lower bound of ``@timestamp`` (``gte``); must be JSON
            serializable, e.g. a formatted string.
        to_time: Upper bound of ``@timestamp`` (``lt``); same requirement.
        should_terms: Iterable of ``{field: value}`` dicts, each wrapped in
            a ``term`` clause.

    Returns:
        The query serialized as a JSON string.
    """
    should = [{"term": item} for item in should_terms]
    query_template = {
        "query": {
            "filtered": {
                "filter": {
                    "bool": {
                        "should": should,
                    },
                },
                "query": {
                    "range": {
                        "@timestamp": {
                            "gte": from_time,
                            "lt": to_time,
                            "time_zone": "+08:00",
                        },
                    },
                },
            },
        },
    }
    return json.dumps(query_template)

def get_type_data(from_time, to_time, type_name, size=500):
    """Fetch every matching document of one type via a scan/scroll search.

    An initial POST opens the scroll; only its ``_scroll_id`` and
    ``hits.total`` are used. The scroll endpoint is then polled until it
    returns no more hits.

    Args:
        from_time: ``datetime.datetime`` start of the range.
        to_time: ``datetime.datetime`` end of the range.
        type_name: Document type, used as a path segment of the search URL.
        size: Value of the ``size`` parameter sent with each request.

    Returns:
        A ``(messages, counts)`` tuple: ``messages`` is the list of
        ``_source`` dicts collected from all pages, ``counts`` is the total
        reported by the initial response (0 if the search failed).
    """
    index_name = get_exact_index_name(from_time, to_time)
    initial_url = ES_URL_PRE + "{0}/{1}/_search/?scroll=2m&size={2}&search_type=scan".format(
        index_name, type_name, size)
    messages, counts = [], 0

    # Fixed filter: documents whose "opt" is either of these values.
    should_terms = [{"opt": "1_1"}, {"opt": "4_12"}]
    time_format = "%Y-%m-%dT%H:%M:%S"
    data = get_query_data(from_time.strftime(time_format),
                          to_time.strftime(time_format),
                          should_terms)

    rets = requests_post(initial_url, data, timeout=2)
    if not rets:
        return messages, counts

    scroll_id = rets.get("_scroll_id", "")
    # The fallback must be a dict: a list has no .get() and would raise
    # AttributeError when the response carries no "hits" key.
    counts = rets.get("hits", {}).get("total", 0)
    if not counts:
        return messages, counts

    scroll_url = ES_URL_PRE + "_search/scroll?"
    # Stop as soon as no scroll id is available; a request without one
    # cannot return further pages.
    while scroll_id:
        params = {'scroll_id': scroll_id, "scroll": "2m", "size": size}
        res = request_get(scroll_url, params=params, timeout=1)
        hits = res.get("hits", {}).get("hits", [])
        if not hits:
            break
        messages.extend(hit.get("_source", {}) for hit in hits)
        # Each response may hand back a new id for the next page.
        scroll_id = res.get("_scroll_id", "")
    return messages, counts

def main(from_time, to_time):
    """Fetch the "bilog" documents between ``from_time`` and ``to_time``.

    Args:
        from_time: ``datetime.datetime`` start of the range.
        to_time: ``datetime.datetime`` end of the range.

    Returns:
        The ``(messages, counts)`` tuple produced by ``get_type_data`` for
        type ``"bilog"`` with a request size of 1000.
    """
    type_name = "bilog"
    size = 1000
    return get_type_data(from_time, to_time, type_name, size=size)

if __name__ == "__main__":
    start_time = datetime.datetime.now()
    # Query window: the one-minute span that ends one minute before the
    # current minute (seconds truncated), leaving room for data that has
    # not reached ES yet.
    to_time = (start_time.replace(second=0, microsecond=0)
               - datetime.timedelta(minutes=1))
    from_time = to_time - datetime.timedelta(minutes=1)
    messages, counts = main(from_time, to_time)
    end_time = datetime.datetime.now()
    # Elapsed wall-clock time of the whole run; the parenthesized form
    # is valid on both Python 2 and Python 3.
    print(end_time - start_time)