三个版本,最老的版本将数据从Hive上取出来,然后用Python进行处理,现在看来这显然是不合适的,放着强大的Hadoop不用,而使用单机多线程;较老版本用手工生成的dict代码清单进行周的控制,且是从前往后计算,主要计算过程在Hadoop上;新版本充分利用了Python的datetime函数等,根据时段自动生成日期的列表,然后进行周的控制,从后往前计算,便于代码的复用,数据运算主要在Hadoop上运行。总体来看,
新版本比老版本更容易理解、更清晰、更充分利用了大数据平台的计算能力、复用性更强。
1、最早的版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_group_byWeek_thread.py
2、老版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_group_byWeek_thread_0103.py
3、新版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_pay_byWeek_thread.py
结果数据的查看
一些想法
对于用Python进行大数据平台下的跑批,Python的作用最好只限制在参数的生成、调用及总体流程的控制;而耗时的、具体的运算还是让Hive调用MapReduce去完成。Python的多线程最好也是用在调用Hive的MapReduce上;而不是数据的具体运算。
1、最早的版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_group_byWeek_thread.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import time
import os
import re
import threadpool
from itertools import groupby
from operator import itemgetter
warnings.filterwarnings("ignore")
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
def getWeekFristday(weekflag):
yearnum = weekflag[0:4] # 取到年份
weeknum = weekflag[5:7] # 取到周
stryearstart = yearnum + '0101' # 当年第一天
yearstart = datetime.datetime.strptime(stryearstart, '%Y%m%d') # 格式化为日期格式
yearstartcalendarmsg = yearstart.isocalendar() # 当年第一天的周信息
yearstartweekday = yearstartcalendarmsg[2]
yearstartyear = yearstartcalendarmsg[0]
if yearstartyear < int(yearnum):
daydelat = (8 - int(yearstartweekday)) + (int(weeknum) - 1) * 7
else:
daydelat = (8 - int(yearstartweekday)) + (int(weeknum) - 2) * 7
week1day = (yearstart + datetime.timedelta(days=daydelat)).date()
return week1day
def user_remain_proc(batch_week):
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
delete from bi_user_remain_byweek_static where data_week='%s'; \
" """ % (batch_week))
week1day = getWeekFristday(batch_week)
newuser_data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
select a1.appsource,a1.appkey,a1.identifier from ( \
select appsource,appkey,identifier \
from bi_all_access_log \
where concat(year(pt_day),'#',weekofyear(pt_day)) = '%s' \
group by appsource,appkey,identifier) a1 \
left join \
(select appsource,appkey,identifier \
from bi_all_access_log \
where pt_day < '%s' ) a2 on a1.appkey=a2.appkey and a1.identifier=a2.identifier and a1.appsource=a2.appsource \
where a2.identifier is null \
;" \
"""
% (batch_week, week1day)).readlines();
week_dict = {"1": "2015#27", "2": "2015#28", "3": "2015#29", "4": "2015#30", "5": "2015#31", "6": "2015#32",
"7": "2015#33", "8": "2015#34", "9": "2015#35", "10": "2015#36", "11": "2015#37", "12": "2015#38",
"13": "2015#39", "14": "2015#40", "15": "2015#41", "16": "2015#42", "17": "2015#43", "18": "2015#44",
"19": "2015#45", "20": "2015#46", "21": "2015#47", "22": "2015#48", "23": "2015#49", "24": "2015#50",
"25": "2015#51", "26": "2015#52", "27": "2015#53", "28": "2016#1", "29": "2016#2", "30": "2016#3",
"31": "2016#4", "32": "2016#5", "33": "2016#6", "34": "2016#7", "35": "2016#8", "36": "2016#9",
"37": "2016#10", "38": "2016#11", "39": "2016#12", "40": "2016#13", "41": "2016#14", "42": "2016#15",
"43": "2016#16", "44": "2016#17", "45": "2016#18", "46": "2016#19", "47": "2016#20", "48": "2016#21",
"49": "2016#22", "50": "2016#23", "51": "2016#24", "52": "2016#25", "53": "2016#26", "54": "2016#27",
"55": "2016#28", "56": "2016#29", "57": "2016#30", "58": "2016#31", "59": "2016#32", "60": "2016#33",
"61": "2016#34", "62": "2016#35", "63": "2016#36", "64": "2016#37", "65": "2016#38", "66": "2016#39",
"67": "2016#40", "68": "2016#41", "69": "2016#42", "70": "2016#43", "71": "2016#44", "72": "2016#45",
"73": "2016#46", "74": "2016#47", "75": "2016#48", "76": "2016#49", "77": "2016#50", "78": "2016#51",
"79": "2016#52"}
for (k, week_value) in week_dict.items():
if week_value == batch_week:
for (kx, week_valuex) in week_dict.items():
if int(kx) >= int(k):
print k,kx,week_valuex
sql_text = "select appsource,appkey,identifier from bi_all_access_log where concat(year(pt_day),'#',weekofyear(pt_day)) = '%s' group by appsource,appkey,identifier;"%(week_valuex)
# print sql_text
week2user_data = os.popen("""/usr/lib/hive-current/bin/hive -e "%s" """%(sql_text)).readlines();
week2remain_data = set(newuser_data) & set(week2user_data)
week2remain_data = list(week2remain_data)
week2remain_list = []
for w2rm_list in week2remain_data:
w2l = re.split('\t',w2rm_list.replace('\n',''))
week2remain_list.append(w2l)
week2remain_data_sorted = sorted(week2remain_list, key=itemgetter(0, 1))
groupby_week2remain_data = groupby(week2remain_data_sorted, key=itemgetter(0, 1))
rl = []
for key, item in groupby_week2remain_data:
item_cnt = 0
for jtem in item:
item_cnt += 1
groupby_week2remain_list = (key, item_cnt)
rl.append(groupby_week2remain_list)
for x in rl:
# print x[0][0], x[0][1], x[1]
appsource = x[0][0]
appkey = x[0][1]
data_type = "%s-remain_cnt" % (week_valuex)
data_cnt = x[1]
etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
insert into bi_user_remain_byweek_static(data_week,appsource,appkey,data_type,data_cnt,etl_time) \
select '%s','%s','%s','%s','%s','%s'; \
" """ % (batch_week, appsource, appkey, data_type, data_cnt, etl_time))
def dateRange(beginDate, endDate):
dates = []
dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
date = beginDate[:]
while date <= endDate:
dates.append(date)
dt = dt + datetime.timedelta(1)
date = dt.strftime("%Y-%m-%d")
return dates
batch_week_list = ["2015#27","2015#28","2015#29","2015#30","2015#31","2015#32","2015#33","2015#34","2015#35","2015#36","2015#37","2015#38","2015#39","2015#40","2015#41","2015#42","2015#43","2015#44","2015#45","2015#46","2015#47","2015#48","2015#49","2015#50","2015#51","2015#52","2015#53","2016#1","2016#2","2016#3","2016#4","2016#5","2016#6","2016#7","2016#8","2016#9","2016#10","2016#11","2016#12","2016#13","2016#14","2016#15","2016#16","2016#17","2016#18","2016#19","2016#20","2016#21","2016#22","2016#23","2016#24","2016#25","2016#26","2016#27","2016#28","2016#29","2016#30","2016#31","2016#32","2016#33","2016#34","2016#35","2016#36","2016#37","2016#38","2016#39","2016#40","2016#41","2016#42","2016#43","2016#44","2016#45","2016#46","2016#47","2016#48","2016#49","2016#50","2016#51","2016#52"]
# batch_week_list = ["2016#49","2016#50"]
requests = threadpool.makeRequests(user_remain_proc, batch_week_list)
main_pool = threadpool.ThreadPool(40)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(3)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
2、老版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_group_byWeek_thread_0103.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import time
import os
import re
import threadpool
warnings.filterwarnings("ignore")
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
def user_remain_proc(batch_week):
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
delete from bi_user_remain_pay_byweek_static where data_week='%s'; \
" """ % (batch_week))
week_dict = {"1": "2015#27", "2": "2015#28", "3": "2015#29", "4": "2015#30", "5": "2015#31", "6": "2015#32",
"7": "2015#33", "8": "2015#34", "9": "2015#35", "10": "2015#36", "11": "2015#37", "12": "2015#38",
"13": "2015#39", "14": "2015#40", "15": "2015#41", "16": "2015#42", "17": "2015#43", "18": "2015#44",
"19": "2015#45", "20": "2015#46", "21": "2015#47", "22": "2015#48", "23": "2015#49", "24": "2015#50",
"25": "2015#51", "26": "2015#52", "27": "2015#53", "28": "2016#1", "29": "2016#2", "30": "2016#3",
"31": "2016#4", "32": "2016#5", "33": "2016#6", "34": "2016#7", "35": "2016#8", "36": "2016#9",
"37": "2016#10", "38": "2016#11", "39": "2016#12", "40": "2016#13", "41": "2016#14", "42": "2016#15",
"43": "2016#16", "44": "2016#17", "45": "2016#18", "46": "2016#19", "47": "2016#20", "48": "2016#21",
"49": "2016#22", "50": "2016#23", "51": "2016#24", "52": "2016#25", "53": "2016#26", "54": "2016#27",
"55": "2016#28", "56": "2016#29", "57": "2016#30", "58": "2016#31", "59": "2016#32", "60": "2016#33",
"61": "2016#34", "62": "2016#35", "63": "2016#36", "64": "2016#37", "65": "2016#38", "66": "2016#39",
"67": "2016#40", "68": "2016#41", "69": "2016#42", "70": "2016#43", "71": "2016#44", "72": "2016#45",
"73": "2016#46", "74": "2016#47", "75": "2016#48", "76": "2016#49", "77": "2016#50", "78": "2016#51",
"79": "2016#52"}
for (k, week_value) in week_dict.items():
if week_value == batch_week:
for (kx, compare_week) in week_dict.items():
if int(kx) >= int(k):
print batch_week,compare_week
newuser_remain_pay_data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
add jar /home/hadoop/nisj/udf-jar/hadoop_udf_radixChange.jar; \
create temporary function RadixChange as 'com.kascend.hadoop.RadixChange'; \
with new_user as (select a1.appsource,a1.appkey,a1.identifier,a1.uid from ( \
select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by appsource,appkey,identifier,RadixChange(uid,16,10)) a1 \
left join \
(select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log \
where year(pt_day)<int(substring('%s',1,4)) or (year(pt_day)=int(substring('%s',1,4)) and weekofyear(pt_day)<int(substring('%s',6)))) a2 \
on a1.appkey=a2.appkey and a1.identifier=a2.identifier and a1.appsource=a2.appsource and a1.uid=a2.uid \
where a2.identifier is null \
), \
x_day_data as (select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by appsource,appkey,identifier,RadixChange(uid,16,10)), \
x_pay as (select uid,sum(amount) amount \
from data_chushou_pay_info \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by uid) \
select a1.appkey,a1.appsource,count(a1.identifier) remain_cnt,sum(a3.amount) pay_amount \
from new_user a1 \
inner join x_day_data a2 on a1.appkey=a2.appkey and a1.identifier=a2.identifier and a1.appsource=a2.appsource \
left join x_pay a3 on a1.uid=a3.uid \
group by a1.appkey,a1.appsource \
;" \
""" % (batch_week, batch_week, batch_week, batch_week, compare_week, compare_week)).readlines();
nrpd_list = []
for nrp_list in newuser_remain_pay_data:
nrp = re.split('\t', nrp_list.replace('\n', ''))
nrpd_list.append(nrp)
for nrpd in nrpd_list:
# print nrpd[0],nrpd[1],nrpd[2],nrpd[3]
remain_week = compare_week
appkey = nrpd[0]
appsource = nrpd[1]
remain_cnt = nrpd[2]
pay_amount = nrpd[3]
etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
insert into bi_user_remain_pay_byweek_static(data_week,appsource,appkey,remain_week,remain_cnt,pay_amount,etl_time) \
select '%s','%s','%s','%s','%s','%s','%s'; \
" """ % (batch_week, appsource, appkey, remain_week, remain_cnt, pay_amount, etl_time))
# batch_week_list = ["2015#27","2015#28","2015#29","2015#30","2015#31","2015#32","2015#33","2015#34","2015#35","2015#36","2015#37","2015#38","2015#39","2015#40","2015#41","2015#42","2015#43","2015#44","2015#45","2015#46","2015#47","2015#48","2015#49","2015#50","2015#51","2015#52","2015#53","2016#1","2016#2","2016#3","2016#4","2016#5","2016#6","2016#7","2016#8","2016#9","2016#10","2016#11","2016#12","2016#13","2016#14","2016#15","2016#16","2016#17","2016#18","2016#19","2016#20","2016#21","2016#22","2016#23","2016#24","2016#25","2016#26","2016#27","2016#28","2016#29","2016#30","2016#31","2016#32","2016#33","2016#34","2016#35","2016#36","2016#37","2016#38","2016#39","2016#40","2016#41","2016#42","2016#43","2016#44","2016#45","2016#46","2016#47","2016#48","2016#49","2016#50","2016#51","2016#52"]
batch_week_list = ["2016#1","2016#2","2016#3","2016#4","2016#5","2016#6","2016#7","2016#8","2016#9","2016#10","2016#11","2016#12","2016#13","2016#14","2016#15","2016#16","2016#17","2016#18","2016#19","2016#20","2016#21","2016#22","2016#23","2016#24","2016#25","2016#26","2016#27","2016#28","2016#29","2016#30","2016#31","2016#32","2016#33","2016#34","2016#35","2016#36","2016#37","2016#38","2016#39","2016#40","2016#41","2016#42","2016#43","2016#44","2016#45","2016#46","2016#47","2016#48","2016#49","2016#50","2016#51","2016#52"]
requests = threadpool.makeRequests(user_remain_proc, batch_week_list)
main_pool = threadpool.ThreadPool(8)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(3)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
3、新版本
/Users/nisj/PycharmProjects/EsDataProc/bi-static/Hive_remain_pay_byWeek_thread.py
# -*- coding=utf-8 -*-
import warnings
import datetime
import time
import os
import re
import threadpool
warnings.filterwarnings("ignore")
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)
tomorrow = today + datetime.timedelta(days=1)
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
def dateRange(beginDate, endDate):
dates = []
dt = datetime.datetime.strptime(beginDate, "%Y-%m-%d")
date = beginDate[:]
while date <= endDate:
dates.append(date)
dt = dt + datetime.timedelta(1)
date = dt.strftime("%Y-%m-%d")
return dates
def weekRang(beginDate, endDate):
week = set()
for date in dateRange(beginDate, endDate):
week.add(datetime.date(int(date[0:4]), int(date[5:7]), int(date[8:10])).isocalendar()[0:2])
wk_l = []
for wl in sorted(list(week)):
wk_l.append(str(wl[0])+'#'+str(wl[1]))
return wk_l
def hisWeekList(curr_week):
last_wk = datetime.datetime.now() - datetime.timedelta(days=7)
end_day = str(last_wk)[0:10]
his_week_list = []
for week in weekRang('2015-07-01', end_day):
if (int(week[0:4]) == int(curr_week[0:4]) and int(week[5:]) <= int(curr_week[5:])) or (int(week[0:4]) < int(curr_week[0:4])):
his_week_list.append(week)
return his_week_list
def user_remain_proc(curr_week):
for his_week in hisWeekList(curr_week):
if (int(his_week[0:4]) == int(curr_week[0:4]) and int(his_week[5:]) <= int(curr_week[5:])) or (int(his_week[0:4]) < int(curr_week[0:4])):
# print his_week,curr_week
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
delete from bi_user_remain_pay_byweek where data_week='%s' and remain_week='%s'; \
" """ % (his_week, curr_week))
newuser_remain_pay_data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
add jar /home/hadoop/nisj/udf-jar/hadoop_udf_radixChange.jar; \
create temporary function RadixChange as 'com.kascend.hadoop.RadixChange'; \
with his_new_user as (select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log_of_new \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by appsource,appkey,identifier,RadixChange(uid,16,10)), \
curr_week_data as (select appsource,appkey,identifier,RadixChange(uid,16,10) uid \
from bi_all_access_log \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by appsource,appkey,identifier,RadixChange(uid,16,10)), \
curr_week_pay as (select uid,sum(amount) amount \
from data_chushou_pay_info \
where case when weekofyear(pt_day)>=52 and month(pt_day)=1 then concat(year(pt_day)-1,'#',weekofyear(pt_day)) else concat(year(pt_day),'#',weekofyear(pt_day)) end = '%s' \
group by uid) \
select a1.appkey,a1.appsource,count(distinct a2.identifier) remain_cnt,sum(a3.amount) pay_amount \
from his_new_user a1 \
inner join curr_week_data a2 on a1.appkey=a2.appkey and a1.identifier=a2.identifier and a1.appsource=a2.appsource \
left join curr_week_pay a3 on a1.uid=a3.uid \
group by a1.appkey,a1.appsource \
;" \
""" % (his_week, curr_week, curr_week)).readlines();
nrpd_list = []
for nrp_list in newuser_remain_pay_data:
nrp = re.split('\t', nrp_list.replace('\n', ''))
nrpd_list.append(nrp)
for nrpd in nrpd_list:
remain_week = curr_week
appkey = nrpd[0]
appsource = nrpd[1]
remain_cnt = nrpd[2]
pay_amount = nrpd[3]
etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
os.system("""/usr/bin/mysql -hHostIp -P6603 -uhadoop -pMysqlPass -e "use funnyai_data; \
insert into bi_user_remain_pay_byweek(data_week,appsource,appkey,remain_week,remain_cnt,pay_amount,etl_time) \
select '%s','%s','%s','%s','%s','%s','%s'; \
" """ % (his_week, appsource, appkey, remain_week, remain_cnt, pay_amount, etl_time))
batch_week_list = weekRang('2015-07-01', '2016-01-03')
requests = threadpool.makeRequests(user_remain_proc, batch_week_list)
main_pool = threadpool.ThreadPool(12)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(3)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time = time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
结果数据的查看
select data_week,remain_week,count(*)
from funnyai_data.bi_user_remain_pay_byweek
group by data_week,remain_week
order by substring(remain_week,1,4)+0,substring(remain_week,6)+0,substring(data_week,1,4)+0,substring(data_week,6)+0
;
一些想法
对于用Python进行大数据平台下的跑批,Python的作用最好只限制在参数的生成、调用及总体流程的控制;而耗时的、具体的运算还是让Hive调用MapReduce去完成。Python的多线程最好也是用在调用Hive的MapReduce上;而不是数据的具体运算。