Elasticsearch 中 scroll 的用法:突破单次查询最多返回 10000 条结果的限制

# -*- coding: utf-8 -*-
# @Time    : 2024/5/14 10:40
# @Author  : hjcui
# @Site    : 
# @File    : 关键词对应的话单记录.py
# @Software: PyCharm
from elasticsearch import Elasticsearch
import os

# Keyword to search for.
kwd = "你好"

# Search request body: a document matches when the keyword equals the
# `keyword.keyword` term under either the `kwdresult.B.kwdresults` or the
# `kwdresult.A.kwdresults` nested path. The two clauses differ only in the
# path segment, so they are generated rather than spelled out twice.
query = {
    "query": {
        "bool": {
            "should": [
                {
                    "nested": {
                        "path": f"kwdresult.{side}.kwdresults",
                        "query": {
                            "term": {
                                f"kwdresult.{side}.kwdresults.keyword.keyword": {
                                    "value": kwd,
                                },
                            },
                        },
                    },
                }
                for side in ("B", "A")
            ],
        },
    },
}
# Run a scroll query and export the matching call ids to a text file.
def scroll_query(es, index, query, scroll_size=1000, scroll_time='2m',
                 keyword=None, output_path=None):
    """Write the ``callid`` of every document matching *query* to a text file.

    The first page is fetched with ``es.search(..., scroll=scroll_time)`` and
    the following pages with ``es.scroll`` until a page with no hits comes
    back. The file starts with a header line holding the keyword and the
    reported total, followed by one ``callid`` per line.

    Args:
        es: Client object providing ``search``, ``scroll`` and ``clear_scroll``.
        index: Name of the index to search.
        query: Search request body passed to ``es.search`` as ``body``.
        scroll_size: Number of hits requested per page.
        scroll_time: Keep-alive passed as ``scroll`` on every request.
        keyword: Keyword shown in the header line. Defaults to the
            module-level ``kwd``.
        output_path: Destination file. Defaults to ``./<keyword>.txt``.

    Raises:
        KeyError: If a response lacks ``_scroll_id``/``hits`` or a hit has no
            ``callid`` in its ``_source``.
    """
    if keyword is None:
        keyword = kwd  # historical behaviour: use the module-level keyword
    if output_path is None:
        output_path = f'./{keyword}.txt'

    result = es.search(index=index, body=query, size=scroll_size, scroll=scroll_time)
    scroll_id = result['_scroll_id']
    try:
        total_docs = result['hits']['total']
        # Newer servers report the total as {"value": N, "relation": ...}
        # instead of a plain integer; write the number, not the dict.
        # NOTE(review): the value may be a lower bound when relation is
        # "gte" - confirm whether track_total_hits is needed here.
        if isinstance(total_docs, dict):
            total_docs = total_docs.get('value', total_docs)

        # `with` guarantees the file is closed even if a later request fails.
        with open(output_path, 'w', encoding='utf-8') as res:
            res.write(f"包含关键词 {keyword} 的话单数是: {total_docs},以下是话单ID\n")
            hits = result['hits']['hits']
            while hits:
                # f-string formatting also accepts non-string call ids.
                res.writelines(f"{hit['_source']['callid']}\n" for hit in hits)
                result = es.scroll(scroll_id=scroll_id, scroll=scroll_time)
                scroll_id = result['_scroll_id']
                hits = result['hits']['hits']
    finally:
        # Always release the scroll context, including on errors.
        es.clear_scroll(scroll_id=scroll_id)
    print("The file saved sucessfully.")

if __name__ == '__main__':
    # Addresses handed to the Elasticsearch client.
    es_conn = Elasticsearch([
        '192.168.0.147:9200',
        '192.168.0.148:9200',
        '192.168.0.149:9200',
        '192.168.0.150:9200',
        '192.168.0.141:9200',
    ])
    # Stop here when the ping fails instead of running the export anyway;
    # SystemExit prints the message to stderr and exits with status 1.
    if not es_conn.ping():
        raise SystemExit("Could not connect to Elasticsearch.")
    print("connected to Elasticsearch.")

    index = 'cr-all-2024.05.15'
    scroll_query(es_conn, index, query, scroll_size=1000, scroll_time='2m')

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值