# Python: iterate over an entire ES index with the scroll_id cursor and write every hit to a txt file
#download all hits from index of es
#use scroll_id
from elasticsearch import Elasticsearch
import json
# Shared ES client (default local node) and the default query body:
# match_all, with _source filtered down to the three fields the export writes.
es = Elasticsearch(["localhost:9200"])
body = {
    "_source": ["fileName", "fullPath", "HashFeature"],
    "query": {
        "match_all": {}
    },
}
def get_search_result(es, index, doc_type, scroll='5m', timeout='1m', size=1000, body=None):
    """Fetch ALL hits from an ES index/doc_type using the scroll API.

    Parameters:
        es: Elasticsearch client instance (anything exposing .search/.scroll).
        index: name of the index to query.
        doc_type: mapping type to query.
        scroll: how long ES keeps the scroll context alive between requests.
        timeout: per-request search timeout.
        size: page size of each scroll batch (was previously hard-coded to
            1000 inside the loop, which broke pagination for any other size).
        body: query body; when omitted, a match_all restricted to the three
            fields this module exports (same default as before, but built
            per call instead of sharing a module-level mutable dict).

    Returns:
        list of hit dicts, each carrying a '_source' sub-dict.
    """
    if body is None:
        body = {
            "_source": ["fileName", "fullPath", "HashFeature"],
            "query": {"match_all": {}},
        }
    queryData = es.search(
        index=index,
        doc_type=doc_type,
        scroll=scroll,
        timeout=timeout,
        size=size,
        body=body,
    )
    mdata = queryData.get("hits").get("hits")
    if not mdata:
        print('empty')
    scroll_id = queryData["_scroll_id"]
    # Keep scrolling until ES returns an empty page.  The original code
    # looped int(total/1000) times — wrong whenever size != 1000, one
    # wasted request on exact multiples — and never refreshed scroll_id
    # from the responses.
    while True:
        res = es.scroll(scroll_id=scroll_id, scroll=scroll)
        page = res["hits"]["hits"]
        if not page:
            break
        mdata = mdata + page
        # ES may rotate the cursor between batches; always use the latest.
        scroll_id = res.get("_scroll_id", scroll_id)
    return mdata
if __name__ == "__main__":
    result = get_search_result(es, 'dh02_20180227', 'cc06_design')
    # Write one comma-separated line per hit: fullPath,HashFeature,fileName.
    # The original code called `f.close` without parentheses, so the file
    # handle was never closed/flushed; `with` guarantees cleanup.
    with open('G:/lab614/12000.txt', 'w') as f:
        for item in result:
            src = item['_source']
            f.write(src['fullPath'] + ',' + src['HashFeature'] + ',' + src['fileName'])
            f.write('\n')