此代码通过将需要的数据分别从多个mysql数据源及hive数据源分别获取后,在python中进行整合合并,然后插入到时mysql数据库中。
hive库中本有这些mysql的表,但由于只是用到了hive中的部分数据,在hive巨量数据中查询效率极低;而在mysql中进行其中一小部分的数据查询则能极大的提高查询效率。所以,代码涉及的数据源等处理相对比较复杂。
数据本可以在hive中一个查询脚本跑完;也可以将mysql数据分别获取后插入到mysql临时表,然后关联插入到mysql目标表中。通过python脚本,后者是一个比较迅捷一些的方案。但本文中,采用的是所有数据在python内存中处理,运行速度上得来了提升,但脚本写的也确实够烦;且不具备很好的复用性。
学习点:
1、Python的多个数据结果左关联处理方法
2、mysql批量数据的插入
3、python中数据的多参数并行跑批方法
/Users/nisj/PycharmProjects/BiDataProc/love/moreSrcDataCalc.py
hive库中本有这些mysql的表,但由于只是用到了hive中的部分数据,在hive巨量数据中查询效率极低;而在mysql中进行其中一小部分的数据查询则能极大的提高查询效率。所以,代码涉及的数据源等处理相对比较复杂。
数据本可以在hive中一个查询脚本跑完;也可以将mysql数据分别获取后插入到mysql临时表,然后关联插入到mysql目标表中。通过python脚本,后者是一个比较迅捷一些的方案。但本文中,采用的是所有数据在python内存中处理,运行速度上得来了提升,但脚本写的也确实够烦;且不具备很好的复用性。
学习点:
1、Python的多个数据结果左关联处理方法
2、mysql批量数据的插入
3、python中数据的多参数并行跑批方法
/Users/nisj/PycharmProjects/BiDataProc/love/moreSrcDataCalc.py
# -*- coding=utf-8 -*-
import os
import re
import time
import datetime
import warnings
import threadpool
warnings.filterwarnings("ignore")
srcMysqlConfig_jellyfish_hadoop_stat={
'host': 'MysqlHost',
# 'host': 'MysqlHost',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 6605,
'db': 'jellyfish_hadoop_stat'
}
srcMysqlConfig_jellyfish_stat={
'host': 'MysqlHost',
# 'host': 'MysqlHost',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 50515,
'db': 'jellyfish_stat'
}
srcMysqlConfig_jellyfish_wealth = {
'host': 'MysqlHost',
# 'host': 'MysqlHost',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 10019,
'db': 'jellyfish_wealth'
}
srcMysqlConfig_jellyfish_server = {
'host': 'MysqlHost',
# 'host': 'MysqlHost',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 50506,
'db': 'jellyfish_server'
}
def dateRange(beginDate, endDate):
dates=[]
dt=datetime.datetime.strptime(endDate, "%Y-%m-%d")
date=endDate[:]
while date >= beginDate:
dates.append(date)
dt=dt - datetime.timedelta(1)
date=dt.strftime("%Y-%m-%d")
return dates
def hiveDataCalc(runDay,roomIdList):
messageSend_Data=os.popen("""/usr/lib/hive-current/bin/hive -e " \
select \
parms['roomId'] roomid,count(1) as mess_num, \
count(distinct parms['uid']) as mess_user, \
count(distinct parms['ip']) as mess_ip \
from oss_bi_all_message_send_log \
where pt_day='{runDay}' and parms['roomId'] in ({roomIdStr}) \
group by parms['roomId'] \
" """.format(runDay=runDay, roomIdStr=str(roomIdList)[1:-1])).readlines();
messageSend_Data_list=[]
for ms_list in messageSend_Data:
ms=re.split('\t', ms_list.replace('\n', ''))
messageSend_Data_list.append(ms)
return messageSend_Data_list
# for messageSendData in messageSend_Data_list:
# print messageSendData[0],messageSendData[1],messageSendData[2],messageSendData[3]
def mysqlDataCalc(runDay,roomIdList):
onlineCount_DataList = []
for roomId in roomIdList:
onlineCount_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \
select room_id, max(online_real_count) as online_real_count \
from room_online_stat_{tabSuffix} \
where substr(created_time,1,10)='{runDay}' and room_id ={roomId} \
group by room_id \
; \
" """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256),
host=srcMysqlConfig_jellyfish_stat['host'],
port=srcMysqlConfig_jellyfish_stat['port'],
user=srcMysqlConfig_jellyfish_stat['user'],
passwd=srcMysqlConfig_jellyfish_stat['passwd'],
db=srcMysqlConfig_jellyfish_stat['db'])).readlines();
onlineCount_Data_list = []
for oc_list in onlineCount_Data:
oc = re.split('\t', oc_list.replace('\n', ''))
onlineCount_Data_list.append(oc)
onlineCount_DataList.append(onlineCount_Data_list)
# print onlineCount_DataList
giftPoint_DataList = []
for roomId in roomIdList:
giftPoint_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \
select room_id,sum(point) as gift_point, \
count(distinct uid) as point_user \
from room_point_detail_{tabSuffix} \
where substr(created_time,1,10)='{runDay}' and room_id ={roomId} \
group by room_id \
; \
" """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256),
host=srcMysqlConfig_jellyfish_wealth['host'], port=srcMysqlConfig_jellyfish_wealth['port'],
user=srcMysqlConfig_jellyfish_wealth['user'],
passwd=srcMysqlConfig_jellyfish_wealth['passwd'],
db=srcMysqlConfig_jellyfish_wealth['db'])).readlines();
giftPoint_Data_list = []
for gp_list in giftPoint_Data:
gp = re.split('\t', gp_list.replace('\n', ''))
giftPoint_Data_list.append(gp)
giftPoint_DataList.append(giftPoint_Data_list)
# print giftPoint_DataList
roomSubscr_DataList = []
for roomId in roomIdList:
roomSubscr_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \
select room_id, count(distinct uid) cnt
from room_subscriber_uid_{tabSuffix}
where substr(created_time,1,10)='{runDay}' and state=0 and room_id ={roomId}
group by room_id; \
" """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256),
host=srcMysqlConfig_jellyfish_server['host'], port=srcMysqlConfig_jellyfish_server['port'],
user=srcMysqlConfig_jellyfish_server['user'],
passwd=srcMysqlConfig_jellyfish_server['passwd'],
db=srcMysqlConfig_jellyfish_server['db'])).readlines();
roomSubscr_Data_list = []
for rs_list in roomSubscr_Data:
rs = re.split('\t', rs_list.replace('\n', ''))
roomSubscr_Data_list.append(rs)
roomSubscr_DataList.append(roomSubscr_Data_list)
# print roomSubscr_DataList
roomRank_DataList = []
for roomId in roomIdList:
roomRank_Data = os.popen("""mysql -h{host} -P{port} -u{user} -p{passwd} -N -e "set names utf8;use {db}; \
select room_id,min(rank) rank,sum(duration) duration
from room_category_rank
where substr(created_time,1,10)='{runDay}' and category_id=0 and room_id ={roomId}
group by room_id; \
" """.format(runDay=runDay, roomId=roomId, tabSuffix=str(roomId % 256),
host=srcMysqlConfig_jellyfish_server['host'], port=srcMysqlConfig_jellyfish_server['port'],
user=srcMysqlConfig_jellyfish_server['user'],
passwd=srcMysqlConfig_jellyfish_server['passwd'],
db=srcMysqlConfig_jellyfish_server['db'])).readlines();
roomRank_Data_list = []
for rr_list in roomRank_Data:
rr = re.split('\t', rr_list.replace('\n', ''))
roomRank_Data_list.append(rr)
roomRank_DataList.append(roomRank_Data_list)
# print roomRank_DataList
return onlineCount_DataList,giftPoint_DataList,roomSubscr_DataList,roomRank_DataList
def DataCom2mysql(runDay, roomIdList):
# 参数初始化赋值
host=srcMysqlConfig_jellyfish_hadoop_stat['host']
port=srcMysqlConfig_jellyfish_hadoop_stat['port']
user=srcMysqlConfig_jellyfish_hadoop_stat['user']
passwd=srcMysqlConfig_jellyfish_hadoop_stat['passwd']
db=srcMysqlConfig_jellyfish_hadoop_stat['db']
os.system("""source /etc/profile; \
mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
delete from report_room_live_detail_daily_stat where \`date\`='{runDay}' and room_id in({roomIdStr}); " """.format(host=host, port=port, user=user, passwd=passwd, db=db, runDay=runDay, roomIdStr=str(roomIdList)[1:-1]))
onlineCount_DataList, giftPoint_DataList, roomSubscr_DataList, roomRank_DataList = mysqlDataCalc(runDay, roomIdList)
# print onlineCount_DataList,giftPoint_DataList,roomSubscr_DataList,roomRank_DataList
lineCountcomList=[]
roomIdonlineCountHave = []
for roomId in roomIdList:
for onlineCount in onlineCount_DataList:
if len(onlineCount)!= 0 and int(roomId) == int(onlineCount[0][0]):
roomIdonlineCountHave.append(roomId)
lineCountcomList.append([roomId,int(onlineCount[0][1])])
for roomId in roomIdList:
if roomId not in roomIdonlineCountHave:
lineCountcomList.append([roomId,0])
# print lineCountcomList
giftPointcomList=[]
roomIdgiftPointHave = []
for roomId in roomIdList:
for giftPoint in giftPoint_DataList:
if len(giftPoint)!= 0 and int(roomId) == int(giftPoint[0][0]):
roomIdgiftPointHave.append(roomId)
giftPointcomList.append([roomId,int(giftPoint[0][1]),int(giftPoint[0][2])])
for roomId in roomIdList:
if roomId not in roomIdgiftPointHave:
giftPointcomList.append([roomId,0,0])
# print giftPointcomList
roomSubscrcomList=[]
roomIdroomSubscrHave = []
for roomId in roomIdList:
for roomSubscr in roomSubscr_DataList:
if len(roomSubscr)!= 0 and int(roomId) == int(roomSubscr[0][0]):
roomIdroomSubscrHave.append(roomId)
roomSubscrcomList.append([roomId,int(roomSubscr[0][1])])
for roomId in roomIdList:
if roomId not in roomIdroomSubscrHave:
roomSubscrcomList.append([roomId,0])
# print roomSubscrcomList
roomRankcomList=[]
roomIdroomRankHave = []
for roomId in roomIdList:
for roomRank in roomRank_DataList:
if len(roomRank)!= 0 and int(roomId) == int(roomRank[0][0]):
roomIdroomRankHave.append(roomId)
roomRankcomList.append([roomId,int(roomRank[0][1]),int(roomRank[0][2])])
for roomId in roomIdList:
if roomId not in roomIdroomRankHave:
roomRankcomList.append([roomId,0,0])
# print roomRankcomList
messageSend_Data_list=hiveDataCalc(runDay, roomIdList)
messageSendcomList=[]
roomIdmessageSendHave = []
for roomId in roomIdList:
for messageSend in messageSend_Data_list:
if len(messageSend)!= 0 and int(roomId) == int(messageSend[0]):
roomIdmessageSendHave.append(roomId)
messageSendcomList.append([roomId,int(messageSend[1]),int(messageSend[2]),int(messageSend[3])])
for roomId in roomIdList:
if roomId not in roomIdmessageSendHave:
messageSendcomList.append([roomId,0,0,0])
# print messageSendcomList
Sum_Data_list=[]
for msd in messageSendcomList:
for old in lineCountcomList:
for gpd in giftPointcomList:
for rsd in roomSubscrcomList:
for rrd in roomRankcomList:
if msd[0] == old[0] and msd[0] == gpd[0] and msd[0] == rsd[0] and msd[0] == rrd[0]:
Sum_Data_list.append([msd[0], msd[1], msd[2], msd[3], 0, 0, old[1], gpd[1], gpd[2], rrd[2], rsd[1], rrd[1]])
# for sumd in Sum_Data_list:
# print sumd
# result insert table
i=0
insert_mysql_sql="""/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
insert into report_room_live_detail_daily_stat(\`date\`,room_id,barrage_count,barrage_user_count,barrage_ip_count,view_user,view_duration,view_peak_user,gift_num,gift_user,anchor_duration,subscriber_count,rank) \
values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
for sumd in Sum_Data_list:
room_id=sumd[0]
barrage_count=sumd[1]
barrage_user_count=sumd[2]
barrage_ip_count=sumd[3]
view_user=sumd[4]
view_duration=sumd[5]
view_peak_user=sumd[6]
gift_num=sumd[7]
gift_user=sumd[8]
anchor_duration=sumd[9]
subscriber_count=sumd[10]
rank=sumd[11]
etl_time=time.strftime('%Y-%m-%d %X', time.localtime())
i += 1
insert_mysql_sql=insert_mysql_sql + """('{date}','{room_id}','{barrage_count}','{barrage_user_count}','{barrage_ip_count}','{view_user}','{view_duration}','{view_peak_user}','{gift_num}','{gift_user}','{anchor_duration}','{subscriber_count}','{rank}'),""".format(host=host, port=port, user=user, passwd=passwd, db=db, date='{runDay}'.format(runDay=runDay),room_id=room_id, barrage_count=barrage_count, barrage_user_count=barrage_user_count, barrage_ip_count=barrage_ip_count,view_user=view_user,view_duration=view_duration,view_peak_user=view_peak_user,gift_num=gift_num,gift_user=gift_user,anchor_duration=anchor_duration,subscriber_count=subscriber_count,rank=rank)
if (i % 1000 == 0):
insert_mysql_sql=insert_mysql_sql.rstrip(',') + """ ;" """
os.system(insert_mysql_sql)
insert_mysql_sql="""/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
insert into report_room_live_detail_daily_stat(date,room_id,barrage_count,barrage_user_count,barrage_ip_count,view_user,view_duration,view_peak_user,gift_num,gift_user,anchor_duration,subscriber_count,rank) \
values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
insert_mysql_sql=insert_mysql_sql.rstrip(',') + """ ;" """
os.system(insert_mysql_sql)
# run parallel Batch
roomIdList=[19467,2469625,3755657,96969,95277]
runDayRoomIdList=[]
for runDay in dateRange(beginDate='2015-11-27', endDate='2018-04-11'):
runDayRoomIdList.append(([runDay, roomIdList], None))
# print runDayRoomIdList
now_time=time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
requests=[]
request_DataCom2mysql_batchCtl=threadpool.makeRequests(DataCom2mysql, runDayRoomIdList)
requests.extend(request_DataCom2mysql_batchCtl)
main_pool=threadpool.ThreadPool(38)
[main_pool.putRequest(req) for req in requests]
if __name__ == '__main__':
while True:
try:
time.sleep(30)
main_pool.poll()
except KeyboardInterrupt:
print("**** Interrupted!")
break
except threadpool.NoResultsPending:
break
if main_pool.dismissedWorkers:
print("Joining all dismissed worker threads...")
main_pool.joinAllDismissedWorkers()
now_time=time.strftime('%Y-%m-%d %X', time.localtime())
print "当前时间是:",now_time
# run serial Batch
# roomIdList=[19467,2469625,3755657,96969,95277]
# for runDay in dateRange(beginDate='2016-10-01', endDate='2016-10-08'):
# DataCom2mysql(runDay=runDay, roomIdList=roomIdList)