数据插入及处理的方式其实并没有什么新花样,主要是hive处理后的数据表,相同的表,不同的字段插入到两个不同的mysql表中。
/Users/nisj/PycharmProjects/BiDataProc/love/HiveDataSum2Mysql-0710forZw.py
/Users/nisj/PycharmProjects/BiDataProc/love/HiveDataSum2Mysql-0710forZw.py
# -*- coding=utf-8 -*-
import os
import re
import time
import datetime
import warnings
import sys
reload(sys)
sys.setdefaultencoding('utf8')
warnings.filterwarnings("ignore")
srcMysqlConfig_jellyfish_hadoop_stat = {
'host': 'MysqlHost',
# 'host': 'MysqlHost',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': 6605,
'db': 'jellyfish_hadoop_stat'
}
def dateRange(beginDate, endDate):
dates = []
dt = datetime.datetime.strptime(endDate, "%Y-%m-%d")
date = endDate[:]
while date >= beginDate:
dates.append(date)
dt = dt - datetime.timedelta(1)
date = dt.strftime("%Y-%m-%d")
return dates
def dataProcPre(runDay):
os.system("""source /etc/profile; \
/usr/lib/hive-current/bin/hive -e " \
drop table if exists xxxhuangxt_user_room_info; \
create table xxxhuangxt_user_room_info as \
select a1.nickname,a2.room_id,a1.uid \
from oss_bi_all_user_profile a1 \
left join (select a1.room_id,a1.creator_uid \
from (select a1.id room_id,a1.creator_uid,row_number()over(partition by a1.creator_uid order by a1.updated_time desc) rn \
from oss_bi_all_room a1 \
where a1.pt_day='{runDay}' and a1.state=0) a1 \
where a1.rn=1) a2 on a1.uid=a2.creator_uid \
where a1.pt_day='{runDay}'; \
drop table if exists xxxhuangxt_bigfans_live; \
create table xxxhuangxt_bigfans_live as \
with tab_big_fans_relation as ( \
select a1.room_id,a1.uid,a1.intimacy,case when a1.intimacy between 0 and 40 then 1 \
when a1.intimacy between 41 and 408-1 then 2 \
when a1.intimacy between 408 and 1020-1 then 3 \
when a1.intimacy between 1020 and 3061-1 then 4 \
when a1.intimacy between 3061 and 6122-1 then 5 \
when a1.intimacy between 6122 and 10204-1 then 6 \
when a1.intimacy between 10204 and 18367-1 then 7 \
when a1.intimacy between 18367 and 30612-1 then 8 \
when a1.intimacy between 30612 and 44898-1 then 9 \
when a1.intimacy between 44898 and 61224-1 then 10 \
when a1.intimacy between 61224 and 102041-1 then 11 \
when a1.intimacy between 102041 and 163265-1 then 12 \
when a1.intimacy between 163265 and 244898-1 then 13 \
when a1.intimacy between 244898 and 367347-1 then 14 \
when a1.intimacy between 367347 and 510204-1 then 15 \
when a1.intimacy between 510204 and 714286-1 then 16 \
when a1.intimacy between 714286 and 979592-1 then 17 \
when a1.intimacy between 979592 and 1326531-1 then 18 \
when a1.intimacy between 1326531 and 1734694-1 then 19 \
when a1.intimacy between 1734694 and 2244898-1 then 20 \
when a1.intimacy between 2244898 and 3265306-1 then 21 \
when a1.intimacy between 3265306 and 4693878-1 then 22 \
when a1.intimacy between 4693878 and 6530612-1 then 23 \
when a1.intimacy between 6530612 and 9183673-1 then 24 \
when a1.intimacy between 9183673 and 12244898-1 then 25 \
when a1.intimacy between 12244898 and 16326531-1 then 26 \
when a1.intimacy between 16326531 and 20408163-1 then 27 \
when a1.intimacy between 20408163 and 24489796-1 then 28 \
when a1.intimacy between 24489796 and 28571429-1 then 29 \
when a1.intimacy between 28571429 and 32653061-1 then 30 \
when a1.intimacy >= 32653061 then 30 \
end bigfans_level,a1.state,row_number()over(partition by a1.uid order by a1.intimacy desc) user_grade_rn \
from fans_all_big_fans_relation_total a1 \
where a1.pt_day='{runDay}' and a1.state=-1), \
tab_live_room as( \
select a1.room_id \
from honeycomb_all_live_history_status a1 \
inner join oss_bi_all_room a2 on a1.room_id=a2.id \
where a1.pt_day between date_sub('{runDay}',30) and '{runDay}' and a2.pt_day='{runDay}' and a2.state=0 \
group by a1.room_id), \
tab_phone_num as ( \
select uid,phone_num \
from oss_bi_all_user_phone_num \
where pt_day='{runDay}' and state=0) \
select a1.uid,a1.bigfans_level,a1.intimacy,a3.phone_num,a1.room_id \
from tab_big_fans_relation a1 \
inner join tab_live_room a2 on a1.room_id=a2.room_id \
left join tab_phone_num a3 on a1.uid=a3.uid \
where a1.user_grade_rn=1; \
drop table if exists xxxhuangxt_bigfans_view; \
create table xxxhuangxt_bigfans_view as \
select a1.uid,a1.roomid room_id,sum(a1.view_time) view_time \
from recommend_data_view a1 \
inner join xxxhuangxt_bigfans_live a2 on a1.uid=a2.uid and a1.roomid=a2.room_id \
where a1.pt_day between '2018-01-01' and '2018-07-06' \
group by a1.uid,a1.roomid; \
drop table if exists xxxhuangxt_bigfans_subscriber; \
create table xxxhuangxt_bigfans_subscriber as \
select a1.uid,a1.room_id,min(a1.created_time) subscriber_time \
from oss_bi_all_room_subscriber_roomid a1 \
inner join xxxhuangxt_bigfans_live a2 on a1.uid=a2.uid and a1.room_id=a2.room_id \
where a1.pt_day='{runDay}' \
group by a1.uid,a1.room_id;" """.format(runDay=runDay))
def hiveDataSum2Mysql_jiazw(runDay):
# 参数初始化赋值
host = srcMysqlConfig_jellyfish_hadoop_stat['host']
port = srcMysqlConfig_jellyfish_hadoop_stat['port']
user = srcMysqlConfig_jellyfish_hadoop_stat['user']
passwd = srcMysqlConfig_jellyfish_hadoop_stat['passwd']
db = srcMysqlConfig_jellyfish_hadoop_stat['db']
os.system("""source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
delete from bigfans_recall_record where \`date\`='{runDay}'; \
delete from bigfans_recall_info where \`date\`='{runDay}'; " """.format(host=host, port=port, user=user,
passwd=passwd, db=db,
runDay=runDay))
Sum_Data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
select '{runDay}' data_date,a1.uid,a1.bigfans_level,a1.intimacy,a1.phone_num,a1.room_id,a2.nickname fans_nickname,a3.nickname anchor_nickname,a4.view_time,a5.subscriber_time, \
-1 state,from_unixtime(unix_timestamp(),'yyyy-MM-dd HH:mm:ss') created_time,from_unixtime(unix_timestamp(),'yyyy-MM-dd HH:mm:ss') updated_time \
from xxxhuangxt_bigfans_live a1 \
left join xxxhuangxt_user_room_info a2 on a1.uid=a2.uid \
left join xxxhuangxt_user_room_info a3 on a1.room_id=a3.room_id \
left join xxxhuangxt_bigfans_view a4 on a1.uid=a4.uid and a1.room_id=a4.room_id \
left join xxxhuangxt_bigfans_subscriber a5 on a1.uid=a5.uid and a1.room_id=a5.room_id; \
" """.format(runDay=runDay)).readlines();
Sum_Data_list = []
for sum_list in Sum_Data:
sum = re.split('\t', sum_list.replace('\n', ''))
Sum_Data_list.append(sum)
i = 0
insert_mysql_sql = """/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
insert into bigfans_recall_record(\`date\`,uid,phone_num,room_id,state,created_time,updated_time,nickname,anchorName) \
values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
insert_mysql_sql_recallinfo = """/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
insert into bigfans_recall_info(uid,level,room_id,duration,follow_time,intimacy,state,\`date\`,created_time,updated_time) \
values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
for sumd in Sum_Data_list:
data_date = sumd[0]
uid = sumd[1]
bigfans_level = sumd[2]
intimacy = sumd[3]
phone_num = sumd[4]
room_id = sumd[5]
fans_nickname = sumd[6].replace(chr(10), '').replace(chr(39), '').replace('(', '').replace(')', '').replace(
chr(96), '').replace(chr(34), '').replace(chr(92), '')
anchor_nickname = sumd[7].replace(chr(10), '').replace(chr(39), '').replace('(', '').replace(')', '').replace(
chr(96), '').replace(chr(34), '').replace(chr(92), '')
view_time = sumd[8]
subscriber_time = sumd[9]
state = sumd[10]
etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
i += 1
insert_mysql_sql = insert_mysql_sql + """('{data_date}','{uid}','{phone_num}','{room_id}','{state}','{etl_time}','{etl_time}','{fans_nickname}','{anchor_nickname}'),""".format(
host=host, port=port, user=user, passwd=passwd, db=db,
data_date=data_date, uid=uid, phone_num=phone_num, room_id=room_id, state=state, etl_time=etl_time,
fans_nickname=fans_nickname, anchor_nickname=anchor_nickname)
insert_mysql_sql_recallinfo = insert_mysql_sql_recallinfo + """('{uid}','{bigfans_level}','{room_id}','{view_time}','{subscriber_time}','{intimacy}','{state}','{data_date}','{etl_time}','{etl_time}'),""".format(
host=host, port=port, user=user, passwd=passwd, db=db,
uid=uid, bigfans_level=bigfans_level, room_id=room_id,
view_time=view_time, subscriber_time=subscriber_time, intimacy=intimacy, state=state, data_date=data_date,
etl_time=etl_time)
if (i % 600 == 0):
insert_mysql_sql = insert_mysql_sql.rstrip(',') + """ ;" """
insert_mysql_sql_recallinfo = insert_mysql_sql_recallinfo.rstrip(',') + """ ;" """
os.system(insert_mysql_sql)
os.system(insert_mysql_sql_recallinfo)
insert_mysql_sql = """/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
insert into bigfans_recall_record(\`date\`,uid,phone_num,room_id,state,created_time,updated_time,nickname,anchorName) \
values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
insert_mysql_sql_recallinfo = """/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
insert into bigfans_recall_info(uid,level,room_id,duration,follow_time,intimacy,state,\`date\`,created_time,updated_time) \
values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
insert_mysql_sql = insert_mysql_sql.rstrip(',') + """ ;" """
insert_mysql_sql_recallinfo = insert_mysql_sql_recallinfo.rstrip(',') + """ ;" """
os.system(insert_mysql_sql)
os.system(insert_mysql_sql_recallinfo)
# run serial Batch
runDay = '2018-07-09'
# dataProcPre(runDay=runDay)
hiveDataSum2Mysql_jiazw(runDay=runDay)