Python 分页读取全量 Elasticsearch 数据并导出为 CSV 文件，支持 Hive 分区表查询

最新推荐文章于 2021-07-06 13:25:06 发布

逮皮皮虾户

最新推荐文章于 2021-07-06 13:25:06 发布

阅读量420

点赞数

本文链接：https://blog.csdn.net/zxt880610/article/details/106946897

版权

import csv
import os
import subprocess
from datetime import timedelta, datetime

from elasticsearch import Elasticsearch

# Export yesterday's `monitor-YYYY.MM.DD` Elasticsearch index to a local
# '#'-delimited text file, then move that file into the matching Hive
# partition directory on HDFS.

# `_source` fields written after the document id, in output column order.
EXPORT_FIELDS = (
    "sponsor",
    "traceLogId",
    "offset",
    "level",
    "txnSubType",
    "input_type",
    "requestIp",
    "txnType",
    "source",
    "type",
    "resultDesc",
    "result",
    "timeLength",
    "@timestamp",
)


def format_row(hit):
    """Return one output line for an Elasticsearch hit.

    The line is the document ``_id`` followed by every field listed in
    ``EXPORT_FIELDS``, joined with ``#``.  Documents do not all carry the
    same fields, so a field that is absent (or ``None``) becomes an empty
    string.  Every other value is converted with ``str()``, which keeps
    legitimate falsy values such as an ``offset`` of ``0`` and avoids a
    ``TypeError`` when a field is not a string.
    """
    source = hit['_source']
    columns = [str(hit['_id'])]
    for field in EXPORT_FIELDS:
        value = source.get(field)
        columns.append("" if value is None else str(value))
    return '#'.join(columns)


def iter_hits(es, index, page_size=1000, keep_alive='5m'):
    """Yield every hit of a ``match_all`` query on ``index`` via the scroll API.

    Pages are fetched until one comes back empty, so the loop does not
    depend on the shape of ``hits.total``.  The scroll id is refreshed from
    each response because the server is allowed to return a new one.  The
    scroll context is cleared when iteration ends or is abandoned.
    """
    page = es.search(
        index=index,
        body={"query": {"match_all": {}}},
        scroll=keep_alive,
        size=page_size,
    )
    scroll_id = page.get('_scroll_id')
    try:
        while True:
            hits = page['hits']['hits']
            if not hits:
                break
            yield from hits
            # The scroll keep-alive must be passed again on every request.
            page = es.scroll(scroll_id=scroll_id, scroll=keep_alive)
            scroll_id = page.get('_scroll_id', scroll_id)
    finally:
        if scroll_id:
            es.clear_scroll(scroll_id=scroll_id)


def partition_dir(day):
    """Return the HDFS directory of the Hive partition for ``day``."""
    return (
        "hdfs://ns1/user/hive/warehouse/test_es.db/es_monitor/"
        "pk_year=%s/pk_month=%s/pk_day=%s"
        % (day.strftime('%Y'), day.strftime('%Y-%m'), day.strftime('%Y-%m-%d'))
    )


def main():
    """Dump yesterday's index to /tmp and move the file into HDFS.

    Raises:
        subprocess.CalledProcessError: if the ``hadoop fs`` command fails.
    """
    starttime = datetime.now()
    yesterday = datetime.today() + timedelta(-1)
    index_name = yesterday.strftime('monitor-%Y.%m.%d')
    local_csv = '/tmp/' + index_name + '.csv'

    es = Elasticsearch(hosts="http://localhost:9200/", http_auth=('big_data_query','big_data_query'))

    # Rows are written as pages arrive, so the whole index is never held in
    # memory at once.
    with open(local_csv, 'w', newline='', encoding='utf-8') as flow:
        # NOTE(review): each row is written as a single CSV column, so a
        # field containing ',' or '"' makes csv quote the whole line.
        # Confirm how the Hive table is expected to handle that.
        csv_writer = csv.writer(flow)
        for hit in iter_hits(es, index_name):
            csv_writer.writerow([format_row(hit)])

    # Argument list instead of a shell string; check=True makes a failed
    # upload raise instead of being reported as success.
    subprocess.run(
        ['hadoop', 'fs', '-moveFromLocal', local_csv, partition_dir(yesterday)],
        check=True,
    )

    endtime = datetime.now()
    print('done!'+ str((endtime-starttime).seconds))


if __name__ == "__main__":
    main()

逮皮皮虾户

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Python 分页读取全量 Elasticsearch 数据并导出为 CSV 文件，支持 Hive 分区表查询

import csv; from elasticsearch import Elasticsearch; es = Elasticsearch(hosts="http://localhost:9200/", http_auth=('big_data_query','big_data_query')); query_json = {"query":{"match_all":{}}}; query = es.search(index='test',body=query_json,scroll='5m',size=10…
复制链接

扫一扫