需求描述:
1、对应专区排名有20次及以上保持前20名记录的主播roomid、uid、昵称
2、上述拉出的各个专区的主播,在2017-12-08至2018-01-08期间,如果同一个时段被覆盖40分钟超过20次以上,记录为该时段主播
问题分析:
需求的第一部分可以很方便的拉出,关键是第二部分比较麻烦。
涉及到直播时间跨小时及跨天的处理;计算 时段内的直播时长,有四个逻辑情况处理; 针对跨天的数据情况,将跨天的直播分成两段,一段是开播时间到当天的[23:59:59],一段是从第二天的[00:00:00]到时关播时间。
数据是先在Hive里计算出第一步的结果,然后将直播时长覆盖按每个时段分别在Hive计算后装载到Mysql上,最后再进行汇总处理,满足最终的需要。
处理整个需求,分别使用了SQL、HQL及Python进行了处理,以及常用的Hive汇总数据向Mysql传输处理。
1、Mysql目标表建表
/Users/nisj/PycharmProjects/BiDataProc/love/HiveDataSum2Mysql.py
1、对应专区排名有20次及以上保持前20名记录的主播roomid、uid、昵称
2、上述拉出的各个专区的主播,在2017-12-08至2018-01-08期间,如果同一个时段被覆盖40分钟超过20次以上,记录为该时段主播
问题分析:
需求的第一部分可以很方便的拉出,关键是第二部分比较麻烦。
涉及到直播时间跨小时及跨天的处理;计算 时段内的直播时长,有四个逻辑情况处理; 针对跨天的数据情况,将跨天的直播分成两段,一段是开播时间到当天的[23:59:59],一段是从第二天的[00:00:00]到时关播时间。
数据是先在Hive里计算出第一步的结果,然后将直播时长覆盖按每个时段分别在Hive计算后装载到Mysql上,最后再进行汇总处理,满足最终的需要。
处理整个需求,分别使用了SQL、HQL及Python进行了处理,以及常用的Hive汇总数据向Mysql传输处理。
1、Mysql目标表建表
show create table xxxxxx_liuyl;
CREATE TABLE `xxxxxx_liuyl` (
`roomid` int(11) DEFAULT NULL,
`gameid` int(11) DEFAULT NULL,
`gamename` varchar(255) DEFAULT NULL,
`live_date` datetime DEFAULT NULL,
`live_minutes` decimal(22,3) DEFAULT NULL,
`hourtype` varchar(80) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2、第一部分的数据获取sqldrop table if exists xxxx_liuyl_first;
create table xxxx_liuyl_first as
with tab_user_room_info as(
select a2.nickname,a1.id room_id,a2.uid
from oss_room_v2 a1
inner join oss_bi_all_user_profile a2 on a1.creator_uid=a2.uid
where a1.pt_day='2018-01-09' and a2.pt_day='2018-01-09'),
tab_game_info as(
select id game_id,name game_name,category_id
from xxx_game
where name in('王者荣耀','球球大作战','穿越火线:荒岛特训','迷你世界','我的世界','全民枪战','天天酷跑','一起来飞车','火影忍者','QQ飞车手游','荒野行动','终结者2:审判日')
),
tab_room_rank as(
select room_id,category_id,count(distinct pt_day) top20_cnt
from (select last_time pt_day,room_id,rank,category_id
from data_chushou_room_category_rank
where pt_day between '2017-12-08' and '2018-01-08' and rank<=20
group by last_time,room_id,rank,category_id)x
group by room_id,category_id
having count(distinct pt_day)>=20)
select a1.category_id,a1.game_id,a1.game_name,a2.room_id,a3.uid,a3.nickname,a2.top20_cnt
from tab_game_info a1
inner join tab_room_rank a2 on a1.category_id=a2.category_id
left join tab_user_room_info a3 on a2.room_id=a3.room_id
order by a1.category_id,a2.top20_cnt desc
;
3、直播时长时段覆盖的样例sqlwith tab_live_time as(
select room_id,game_id,game_name,substr(updated_time,1,10) pt_day,
case when substr(switch_time,12,8) between '00:00:00' and '01:00:00' and substr(updated_time,12,8)>='01:00:00' then (unix_timestamp(concat(substr(updated_time,1,11),'01:00:00'))-unix_timestamp(switch_time))/60
when substr(switch_time,12,8) between '00:00:00' and '01:00:00' and substr(updated_time,12,8) between '00:00:00' and '01:00:00' then (unix_timestamp(updated_time)-unix_timestamp(switch_time))/60
when substr(switch_time,12,8) <= '00:00:00' and substr(updated_time,12,8) between '00:00:00' and '01:00:00' then (unix_timestamp(updated_time)-unix_timestamp(concat(substr(switch_time,1,11),'00:00:00')))/60
when substr(switch_time,12,8) >= '00:00:00' and substr(updated_time,12,8) <= '01:00:00' then (unix_timestamp(concat(substr(updated_time,1,11),'01:00:00'))-unix_timestamp(concat(substr(switch_time,1,11),'00:00:00')))/60
else 0 end live_minutes
from (select room_id,game_id,game_name,switch_time,concat(substr(switch_time,1,10),' 23:59:59') updated_time
from xxx_live_history_status
where substr(switch_time,1,10)<>substr(updated_time,1,10)
union all
select room_id,game_id,game_name,concat(substr(updated_time,1,10),' 00:00:00') switch_time,updated_time
from xxx_live_history_status
where substr(switch_time,1,10)<>substr(updated_time,1,10)
union all
select room_id,game_id,game_name,switch_time,updated_time
from xxx_live_history_status
where substr(switch_time,1,10)=substr(updated_time,1,10)) x
where substr(updated_time,1,10) between '2017-12-08' and '2018-01-08'
)
select a1.room_id,a1.game_id,a1.game_name,a1.pt_day live_date,a1.live_minutes,'rangetime(0~1)' hourtype
from tab_live_time a1
inner join xxxx_liuyl_first a2 on a1.room_id=a2.room_id and a1.game_id=a2.game_id;
4、具体跑出时段覆盖的python脚本/Users/nisj/PycharmProjects/BiDataProc/love/HiveDataSum2Mysql.py
# -*- coding=utf-8 -*-
import os
import re
import time
import warnings
warnings.filterwarnings("ignore")
srcMysqlConfig_jellyfish_hadoop_stat = {
'host': 'MysqlHost',
'user': 'MysqlUser',
'passwd': 'MysqlPass',
'port': MysqlPort,
'db': 'funnyai_data'
}
def hiveDataSum2Mysql_liuyl(time_start, time_end):
# 参数初始化赋值
host = srcMysqlConfig_jellyfish_hadoop_stat['host']
port = srcMysqlConfig_jellyfish_hadoop_stat['port']
user = srcMysqlConfig_jellyfish_hadoop_stat['user']
passwd = srcMysqlConfig_jellyfish_hadoop_stat['passwd']
db = srcMysqlConfig_jellyfish_hadoop_stat['db']
os.system("""source /etc/profile; \
/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
delete from xxxxxx_liuyl where hourtype='{hourtype}'; " """.format(host=host, port=port, user=user, passwd=passwd, db=db, hourtype='rangetime({time_start}~{time_end})'.format(time_start=time_start, time_end=time_end)))
Sum_Data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
with tab_live_time as( \
select room_id,game_id,game_name,substr(updated_time,1,10) pt_day, \
case when substr(switch_time,12,8) between '{time_start}' and '{time_end}' and substr(updated_time,12,8)>='{time_end}' then (unix_timestamp(concat(substr(updated_time,1,11),'{time_end}'))-unix_timestamp(switch_time))/60 \
when substr(switch_time,12,8) between '{time_start}' and '{time_end}' and substr(updated_time,12,8) between '{time_start}' and '{time_end}' then (unix_timestamp(updated_time)-unix_timestamp(switch_time))/60 \
when substr(switch_time,12,8) <= '{time_start}' and substr(updated_time,12,8) between '{time_start}' and '{time_end}' then (unix_timestamp(updated_time)-unix_timestamp(concat(substr(switch_time,1,11),'{time_start}')))/60 \
when substr(switch_time,12,8) >= '{time_start}' and substr(updated_time,12,8) <= '{time_end}' then (unix_timestamp(concat(substr(updated_time,1,11),'{time_end}'))-unix_timestamp(concat(substr(switch_time,1,11),'{time_start}')))/60 \
else 0 end live_minutes \
from (select room_id,game_id,game_name,switch_time,concat(substr(switch_time,1,10),' 23:59:59') updated_time \
from xxx_live_history_status \
where substr(switch_time,1,10)<>substr(updated_time,1,10) \
union all \
select room_id,game_id,game_name,concat(substr(updated_time,1,10),' 00:00:00') switch_time,updated_time \
from xxx_live_history_status \
where substr(switch_time,1,10)<>substr(updated_time,1,10) \
union all \
select room_id,game_id,game_name,switch_time,updated_time \
from xxx_live_history_status \
where substr(switch_time,1,10)=substr(updated_time,1,10)) x \
where substr(updated_time,1,10) between '2017-12-08' and '2018-01-08' \
) \
select a1.room_id,a1.game_id,a1.game_name,a1.pt_day live_date,a1.live_minutes,'rangetime({time_start}~{time_end})' hourtype \
from tab_live_time a1 \
inner join xxxx_liuyl_first a2 on a1.room_id=a2.room_id and a1.game_id=a2.game_id; \
" """.format(time_start=time_start, time_end=time_end)).readlines();
Sum_Data_list = []
for sum_list in Sum_Data:
sum = re.split('\t', sum_list.replace('\n', ''))
Sum_Data_list.append(sum)
i = 0
insert_mysql_sql = """/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
insert into xxxxxx_liuyl(roomid,gameid,gamename,live_date,live_minutes,hourtype) \
values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
for sumd in Sum_Data_list:
roomid = sumd[0]
gameid = sumd[1]
gamename = sumd[2]
live_date = sumd[3]
live_minutes = sumd[4]
hourtype = sumd[5]
etl_time = time.strftime('%Y-%m-%d %X', time.localtime())
i += 1
insert_mysql_sql = insert_mysql_sql + """('{roomid}','{gameid}','{gamename}','{live_date}','{live_minutes}','{hourtype}'),""".format(host=host, port=port, user=user, passwd=passwd, db=db, roomid=roomid, gameid=gameid, gamename=gamename, live_date=live_date, live_minutes=live_minutes, hourtype=hourtype, etl_time=etl_time)
if (i % 1000 == 0):
insert_mysql_sql = insert_mysql_sql.rstrip(',') + """ ;" """
os.system(insert_mysql_sql)
insert_mysql_sql = """/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
insert into xxxxxx_liuyl(roomid,gameid,gamename,live_date,live_minutes,hourtype) \
values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
insert_mysql_sql = insert_mysql_sql.rstrip(',') + """ ;" """
os.system(insert_mysql_sql)
hourTypeList = [('00:00:00', '00:59:59'),
('01:00:00', '01:59:59'),
('02:00:00', '02:59:59'),
('03:00:00', '03:59:59'),
('04:00:00', '04:59:59'),
('05:00:00', '05:59:59'),
('06:00:00', '06:59:59'),
('07:00:00', '07:59:59'),
('08:00:00', '08:59:59'),
('09:00:00', '09:59:59'),
('10:00:00', '10:59:59'),
('11:00:00', '11:59:59'),
('12:00:00', '12:59:59'),
('13:00:00', '13:59:59'),
('14:00:00', '14:59:59'),
('15:00:00', '15:59:59'),
('16:00:00', '16:59:59'),
('17:00:00', '17:59:59'),
('18:00:00', '18:59:59'),
('19:00:00', '19:59:59'),
('20:00:00', '20:59:59'),
('21:00:00', '21:59:59'),
('22:00:00', '22:59:59'),
('23:00:00', '23:59:59')]
for hourType in hourTypeList:
print hourType[0], hourType[1]
hiveDataSum2Mysql_liuyl(time_start=hourType[0], time_end=hourType[1])
5、第二步的结果查询脚本
--时段内单次直播过40分钟
select gameid,gamename,roomid,hourtype,count(*) cnt
from xxxxxx_liuyl
where live_minutes>=40
group by gameid,gamename,roomid,hourtype
having count(*)>=20
order by gameid,hourtype;
--时段内总直播过40分钟
select gameid,gamename,roomid,hourtype,count(*) cnt
from (select gameid,gamename,roomid,live_date,hourtype,sum(live_minutes) live_minutes
from xxxxxx_liuyl
group by gameid,gamename,roomid,live_date,hourtype
having sum(live_minutes)>=40) x
group by gameid,gamename,roomid,hourtype
having count(*)>=20
order by gameid,hourtype;