适用于需要跑多日数据及临时数据需求的时候;Hql代码及目录路径可以随时按需更换;为提高效率采用了并行的方式。
跑出的多个数据文件可以考虑压缩后再传输,以提高效率。
脚本路径:/Users/nisj/PycharmProjects/BiDataProc/love/HiveRunData2LocalFile.py
# -*- coding=utf-8 -*-
import os
import datetime
import warnings
import time
import threadpool
warnings.filterwarnings("ignore")
def dateRange(beginDate, endDate):
    """Enumerate calendar days from beginDate through endDate, inclusive.

    Both arguments are 'YYYY-MM-DD' strings; the loop compares them
    lexicographically, which matches chronological order for this
    zero-padded format.

    Returns:
        list of 'YYYY-MM-DD' strings; empty when endDate < beginDate.
    """
    day_strings = []
    cursor = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
    day = beginDate
    while day <= endDate:
        day_strings.append(day)
        cursor = cursor + datetime.timedelta(days=1)
        day = cursor.strftime("%Y-%m-%d")
    return day_strings
def hiveRunData2localFile(pt_day):
    """Run the per-day retention/payment Hive query and dump it to a local file.

    Parameters:
        pt_day (str): partition day, 'YYYY-MM-DD'; also names the output file.

    Side effects: shells out to the Hive CLI via os.system and redirects the
    query result to /home/hadoop/nisj/love/hive2Local/{pt_day}.txt.  The exit
    status of the hive command is ignored.

    NOTE(review): the age expression hard-codes the year 2018
    (2018-substr(id_card_num,7,4)) — confirm before reusing in other years.
    """
    # {pt_day7after} = pt_day + 6 days, so the payment window spans 7 calendar
    # days starting at pt_day.  The SQL text lines below are part of the
    # triple-quoted command string and must stay byte-identical.
    os.system("""/usr/lib/hive-current/bin/hive -e " \
with tab_pay_info as( \
select '{pt_day}' runday,x.uid,sum(x.pay_amount) total_pay_amount,max(x.pay_amount) max_pay_amount \
from (select pt_day,uid,amount pay_amount from oss_bi_all_chushou_pay_info where pt_day between '{pt_day}' and '{pt_day7after}' and state=0 \
union all \
select pt_day,uid,amount pay_amount from oss_bi_all_open_point_record where pt_day between '{pt_day}' and '{pt_day7after}' and state=0 and (key='alipay_cp' or key='tmall_pay') \
) x \
group by x.uid \
), \
tab_all_device as( \
select pt_day,parms['uid'] uid,max(device_info['device_model']) device_model,max(parms['ip']) ip \
from oss_bi_all_device_log \
where pt_day='{pt_day}' \
group by pt_day,parms['uid']) \
select '{pt_day}' pt_day,a1.appkey,a1.appsource,a1.uid,a2.device_model,a2.ip, \
substr(a3.id_card_num,7,8) birthday,2018-substr(a3.id_card_num,7,4) age,case when substr(a3.id_card_num,-2,1)%2=1 then '男' when substr(a3.id_card_num,-2,1)%2=0 then '女' end sex, \
case when a4.uid is null then 0 else 1 end is_pay,a4.max_pay_amount,a4.total_pay_amount \
from oss_bi_type_of_retention_user a1 \
left join tab_all_device a2 on a1.uid=a2.uid and a1.pt_day=a2.pt_day \
left join xxx_user_id_card a3 on a1.uid=a3.uid \
left join tab_pay_info a4 on a1.uid=a4.uid and a1.pt_day=a4.runday \
where a1.pt_day='{pt_day}' and a1.remain_type=1 and a1.remain_day_num=0 \
; \
">/home/hadoop/nisj/love/hive2Local/{pt_day}.txt """.format(pt_day=pt_day, pt_day7after=(datetime.datetime.strptime(pt_day, "%Y-%m-%d") + datetime.timedelta(6)).strftime('%Y-%m-%d')));
# # run serial Batch
# for ptDay in dateRange(beginDate='2018-03-01', endDate='2018-03-01'):
# print ptDay
# hiveRunData2localFile(pt_day=ptDay)
# run parallel Batch
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
runDay_list = dateRange(beginDate='2018-03-01', endDate='2018-03-31')
requests = []
request_hiveRunData2localFile_batchCtl = threadpool.makeRequests(hiveRunData2localFile, runDay_list)
requests.extend(request_hiveRunData2localFile_batchCtl)
main_pool = threadpool.ThreadPool(16)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(30)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
用并行跑批的时候,要么安装threadpool模块,要么将threadpool代码脚本拷到跑批目录下。跑出的多个数据文件可以考虑压缩后再传输,以提高效率。
cd love/
cd hive2Local/
zip 201803.zip 2018*
sz 201803.zip