不受 Elasticsearch 查询上限（10000 条）的限制，可以最大限度地从数据库获取数据

最新推荐文章于 2024-07-12 12:16:53 发布

jia_xue

最新推荐文章于 2024-07-12 12:16:53 发布

阅读量388

点赞数 6

文章标签： python elasticsearch

本文链接：https://blog.csdn.net/jia_xue/article/details/135990204

版权

# -*- coding: utf-8 -*-

from elasticsearch import Elasticsearch
import requests
import os
import datetime

# Yesterday's date drives both the index name and the output folder name.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
# The index name uses dots as the date separator (YYYY.MM.DD), while the
# output folder below keeps the ISO form with dashes.
conv_time = str(yesterday).replace('-', '.')
conn = Elasticsearch([
    '192.168.10.147:9200',
    '192.168.10.148:9200',
    '192.168.10.149:9200',
    '192.168.10.150:9200',
    '192.168.10.141:9200',
])
es_index = 'cr-stat-{}'.format(conv_time)
# Per-day destination folder on the network share.
dstPath = r'\\192.168.10.82\昨天数据\{}'.format(yesterday)
# exist_ok=True replaces the former exists()-then-makedirs() pair, which could
# fail if another run created the folder between the check and the call.
os.makedirs(dstPath, exist_ok=True)

def _save_file(url):
    """Download ``url`` into ``dstPath``, printing (not raising) on failure.

    The target file name is the last path component of ``url``.
    """
    name = os.path.basename(url)
    file_path = os.path.join(dstPath, name)
    try:
        # (connect, read) timeouts so one stalled server cannot hang the run.
        response = requests.get(url, timeout=(10, 60))
        # Do not store an HTTP error page as if it were the requested file.
        response.raise_for_status()
        with open(file_path, 'wb') as ff:
            ff.write(response.content)
    except (requests.RequestException, OSError) as e:
        # Best effort: one failed file must not stop the remaining downloads.
        print(e)


def down_load(plat, lang):
    """Download every file referenced by the matching documents in ``es_index``.

    Documents are selected with three conditions that must all hold:
    ``languagename.keyword`` equals ``lang``, ``platformid.keyword`` equals
    ``plat`` and ``callsecs`` lies in the inclusive range 60..20000.  The
    result set is read page by page through the scroll API, and for each hit
    every value of its ``_source['files']`` mapping is fetched over HTTP and
    written to ``dstPath``.

    Args:
        plat: Value matched against ``platformid.keyword``.
        lang: Value matched against ``languagename.keyword``.

    Returns:
        None.  Errors raised while paging or downloading are printed rather
        than propagated; a failure of the initial search still raises.
    """
    page_size = 1000
    total_query = {
        "from": 0,
        "size": page_size,
        "query": {
            "bool": {
                "must": [
                    {"term": {"languagename.keyword": {"value": lang}}},
                    {"term": {"platformid.keyword": {"value": plat}}},
                    {"range": {"callsecs": {"gte": 60, "lte": 20000}}},
                ]
            }
        },
    }
    es_res = conn.search(index=es_index, body=total_query, scroll='10m',
                         size=page_size)
    scroll_id = es_res['_scroll_id']
    try:
        # The initial search response already carries the first page, so it is
        # consumed before asking for the next one.  Paging stops on the first
        # empty page instead of relying on hits.total, whose shape differs
        # between Elasticsearch versions (integer before 7.0, object after).
        hits = es_res['hits']['hits']
        while hits:
            for res in hits:
                # Assumes 'files' is a mapping whose values are download URLs;
                # hits without it are skipped instead of aborting the export.
                urls = res['_source'].get('files') or {}
                for value in urls.values():
                    _save_file(value)
            # NOTE(review): the 5m keep-alive must outlast the time needed to
            # download one whole page of files - confirm it is long enough.
            es_res = conn.scroll(scroll_id=scroll_id, scroll='5m')
            # Always continue from the most recently returned scroll id.
            scroll_id = es_res['_scroll_id']
            hits = es_res['hits']['hits']
    except Exception as e:
        # Top-level boundary kept from the original: report and return.
        print(e)
    finally:
        # Release the server-side search context instead of letting it linger
        # until the keep-alive expires.
        try:
            conn.clear_scroll(scroll_id=scroll_id)
        except Exception as e:
            print(e)

if __name__ == '__main__':
    # Script entry point: run the export for one fixed platform id / language.
    down_load(plat='600', lang='汉语')