python分页读取全量Elasticsearch数据导出CSV文件支持hive分区表查询

import csv
import os
from elasticsearch import Elasticsearch
from datetime import timedelta, datetime

yesterday = datetime.today() + timedelta(-1)
index_name = yesterday.strftime('monitor-%Y.%m.%d')
starttime=datetime.now()

es = Elasticsearch(hosts="http://localhost:9200/", http_auth=('big_data_query','big_data_query'))
query_json = {"query":{"match_all":{}}}
query = es.search(index=index_name,body=query_json,scroll='5m',size=1000)

results = query['hits']['hits'] # es查询出的结果第一页
total = query['hits']['total']  # es查询出的结果总量
scroll_id = query['_scroll_id'] # 游标用于输出es查询出的所有结果

for i in range(0, int(total/1000)+1):
    # scroll参数必须指定否则会报错
    query_scroll = es.scroll(scroll_id=scroll_id,scroll='5m')['hits']['hits']
    results += query_scroll


with open('/tmp/'+index_name+'.csv','w',newline='',encoding='utf-8') as flow:
    csv_writer = csv.writer(flow)
    for res in results:
                #由于ES字段个数不固定,此处判断字段是否存在,不存在默认为空

                sponsor= "" if not res['_source'].get("sponsor") else res['_source']['sponsor']
                traceLogId= "" if not res['_source'].get("traceLogId") else res['_source']['traceLogId']
                offset= "" if not res['_source'].get("offset") else res['_source']['offset']
                level= "" if not res['_source'].get("level") else res['_source']['level']
                txnSubType= "" if not res['_source'].get("txnSubType") else res['_source']['txnSubType']
                input_type= "" if not res['_source'].get("input_type") else res['_source']['input_type']
                requestIp= "" if not res['_source'].get("requestIp") else res['_source']['requestIp']
                txnType= "" if not res['_source'].get("txnType") else res['_source']['txnType']
                source= "" if not res['_source'].get("source") else res['_source']['source']
                type= "" if not res['_source'].get("type") else res['_source']['type']
                resultDesc= "" if not res['_source'].get("resultDesc") else res['_source']['resultDesc']
                result= "" if not res['_source'].get("result") else res['_source']['result']
                timeLength= "" if not res['_source'].get("timeLength") else res['_source']['timeLength']
                _timestamp= "" if not res['_source'].get("@timestamp") else res['_source']['@timestamp']
                csv_writer.writerow([res['_id']+'#'+sponsor+'#'+traceLogId+'#'+str(offset)+'#'+ level+'#'+txnSubType+'#'+input_type+'#'+requestIp+'#'+txnType+'#'+source+'#'+type+'#'+resultDesc+'#'+result+'#'+ str(timeLength)+'#'+_timestamp])

os.system("hadoop fs -moveFromLocal /tmp/%s.csv hdfs://ns1/user/hive/warehouse/test_es.db/es_monitor/pk_year=%s/pk_month=%s/pk_day=%s"%(index_name,yesterday.strftime('%Y'),yesterday.strftime('%Y-%m'),yesterday.strftime('%Y-%m-%d')))                
                
endtime=datetime.now()            
print('done!'+  str((endtime-starttime).seconds))

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值