Elasticsearch 中 scroll 的用法：突破查询数量 10000 的限制

jia_xue

于 2024-05-16 10:39:19 发布

阅读量212

点赞数 3

文章标签：运维 python

本文链接：https://blog.csdn.net/jia_xue/article/details/138951195

版权

# -*- coding: utf-8 -*-
# @Time    : 2024/5/14 10:40
# @Author  : hjcui
# @Site    : 
# @File    : 关键词对应的话单记录.py
# @Software: PyCharm
from elasticsearch import Elasticsearch
import os

# Keyword to look for.
kwd = "你好"


def _nested_term_clause(path, field, value):
    """Return a ``nested`` clause running an exact ``term`` match under *path*."""
    term_clause = {"term": {field: {"value": value}}}
    return {"nested": {"path": path, "query": term_clause}}


# Search body: a document matches when either the B-side or the A-side
# nested keyword results contain a term exactly equal to ``kwd``.
_b_side_clause = _nested_term_clause(
    "kwdresult.B.kwdresults",
    "kwdresult.B.kwdresults.keyword.keyword",
    kwd,
)
_a_side_clause = _nested_term_clause(
    "kwdresult.A.kwdresults",
    "kwdresult.A.kwdresults.keyword.keyword",
    kwd,
)
query = {"query": {"bool": {"should": [_b_side_clause, _a_side_clause]}}}
# Scroll-query helper: streams the callid of every matching hit to a text file.
def scroll_query(es, index, query, scroll_size=1000, scroll_time='2m',
                 keyword=None, output_path=None):
    """Export the ``callid`` of every document matching *query* to a text file.

    The first batch comes from ``es.search`` opened with a scroll context;
    the remaining batches are pulled with ``es.scroll`` until an empty batch
    is returned. The file starts with one header line holding the total hit
    count, followed by one ``callid`` per line.

    Args:
        es: Client exposing ``search``, ``scroll`` and ``clear_scroll``
            (an ``elasticsearch.Elasticsearch`` instance in this script).
        index: Name of the index to search.
        query: Request body handed to ``es.search``.
        scroll_size: Number of hits requested per batch.
        scroll_time: Keep-alive passed with the search and every scroll call.
        keyword: Label used in the header line and in the default file name.
            Defaults to the module-level ``kwd``.
        output_path: File to write. Defaults to ``./<keyword>.txt``.

    Raises:
        KeyError: If a hit's ``_source`` has no ``callid`` field.
        Any exception raised by the client or by file I/O propagates; the
        output file is closed and the scroll context is cleared first.
    """
    if keyword is None:
        keyword = kwd
    if output_path is None:
        output_path = f'./{keyword}.txt'

    result = es.search(index=index, body=query, size=scroll_size, scroll=scroll_time)
    scroll_id = result['_scroll_id']
    try:
        total_docs = result['hits']['total']
        # Elasticsearch 7+ reports the total as {"value": N, "relation": ...};
        # earlier versions return a bare integer.
        if isinstance(total_docs, dict):
            total_docs = total_docs['value']

        with open(output_path, 'w', encoding='utf-8') as res:
            res.write(f"包含关键词 {keyword} 的话单数是: {total_docs},以下是话单ID\n")
            hits = result['hits']['hits']
            # An empty batch marks the end of the scroll.
            while hits:
                for hit in hits:
                    # f-string formatting also copes with non-string ids.
                    res.write(f"{hit['_source']['callid']}\n")
                result = es.scroll(scroll_id=scroll_id, scroll=scroll_time)
                # Always continue with the id from the latest response.
                scroll_id = result['_scroll_id']
                hits = result['hits']['hits']
    finally:
        # Release the server-side scroll context even when the export fails.
        es.clear_scroll(scroll_id=scroll_id)
    print("The file saved sucessfully.")

if __name__ == '__main__':
    # Script entry point: connect to the cluster, then export the call ids
    # matching the module-level ``query`` from one daily index.
    es_conn = Elasticsearch([
        '192.168.0.147:9200', '192.168.0.148:9200', '192.168.0.149:9200',
        '192.168.0.150:9200', '192.168.0.141:9200',
    ])
    if not es_conn.ping():
        print("Could not connect to Elasticsearch.")
        # Stop here: running the export without a reachable cluster would
        # only fail later with a less helpful error.
        raise SystemExit(1)
    print("connected to Elasticsearch.")
    index = 'cr-all-2024.05.15'
    scroll_query(es_conn, index, query, scroll_size=1000, scroll_time='2m')