主播游戏专区开播时段范围统计

需求描述:
1、对应专区排名有20次及以上保持前20名记录的主播roomid、uid、昵称
2、上述拉出的各个专区的主播,在2017-12-08至2018-01-08期间,如果同一个时段被覆盖40分钟超过20次以上,记录为该时段主播
问题分析:
需求的第一部分可以很方便的拉出,关键是第二部分比较麻烦。
涉及到直播时间跨小时及跨天的处理;计算 时段内的直播时长,有四个逻辑情况处理; 针对跨天的数据情况,将跨天的直播分成两段,一段是开播时间到当天的[23:59:59],一段是从第二天的[00:00:00]到时关播时间
数据是先在Hive里计算出第一步的结果,然后将直播时长覆盖按每个时段分别在Hive计算后装载到Mysql上,最后再进行汇总处理,满足最终的需要。
处理整个需求,分别使用了SQL、HQL及Python进行了处理,以及常用的Hive汇总数据向Mysql传输处理。
1、Mysql目标表建表
show create table xxxxxx_liuyl;
CREATE TABLE `xxxxxx_liuyl` (
  `roomid` int(11) DEFAULT NULL,
  `gameid` int(11) DEFAULT NULL,
  `gamename` varchar(255) DEFAULT NULL,
  `live_date` datetime DEFAULT NULL,
  `live_minutes` decimal(22,3) DEFAULT NULL,
  `hourtype` varchar(80) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
2、第一部分的数据获取sql
drop table if exists xxxx_liuyl_first;
create table xxxx_liuyl_first as
with tab_user_room_info as(
select a2.nickname,a1.id room_id,a2.uid
from oss_room_v2 a1
inner join oss_bi_all_user_profile a2 on a1.creator_uid=a2.uid
where a1.pt_day='2018-01-09' and a2.pt_day='2018-01-09'),
tab_game_info as(
select id game_id,name game_name,category_id
from xxx_game 
where name in('王者荣耀','球球大作战','穿越火线:荒岛特训','迷你世界','我的世界','全民枪战','天天酷跑','一起来飞车','火影忍者','QQ飞车手游','荒野行动','终结者2:审判日')
),
tab_room_rank as(
select room_id,category_id,count(distinct pt_day) top20_cnt
from (select last_time pt_day,room_id,rank,category_id
from data_chushou_room_category_rank
where pt_day between '2017-12-08' and '2018-01-08' and rank<=20
group by last_time,room_id,rank,category_id)x
group by room_id,category_id
having count(distinct pt_day)>=20)
select a1.category_id,a1.game_id,a1.game_name,a2.room_id,a3.uid,a3.nickname,a2.top20_cnt
from tab_game_info a1
inner join tab_room_rank a2 on a1.category_id=a2.category_id
left join tab_user_room_info a3 on a2.room_id=a3.room_id
order by a1.category_id,a2.top20_cnt desc
;
3、直播时长时段覆盖的样例sql
with tab_live_time as(
select room_id,game_id,game_name,substr(updated_time,1,10) pt_day,
case when substr(switch_time,12,8) between '00:00:00' and '01:00:00' and substr(updated_time,12,8)>='01:00:00' then (unix_timestamp(concat(substr(updated_time,1,11),'01:00:00'))-unix_timestamp(switch_time))/60
     when substr(switch_time,12,8) between '00:00:00' and '01:00:00' and substr(updated_time,12,8) between '00:00:00' and '01:00:00' then (unix_timestamp(updated_time)-unix_timestamp(switch_time))/60
     when substr(switch_time,12,8) <= '00:00:00' and substr(updated_time,12,8) between '00:00:00' and '01:00:00' then (unix_timestamp(updated_time)-unix_timestamp(concat(substr(switch_time,1,11),'00:00:00')))/60
     when substr(switch_time,12,8) >= '00:00:00' and substr(updated_time,12,8) <= '01:00:00' then (unix_timestamp(concat(substr(updated_time,1,11),'01:00:00'))-unix_timestamp(concat(substr(switch_time,1,11),'00:00:00')))/60
else 0 end live_minutes
from (select room_id,game_id,game_name,switch_time,concat(substr(switch_time,1,10),' 23:59:59') updated_time
from xxx_live_history_status
where substr(switch_time,1,10)<>substr(updated_time,1,10)
union all
select room_id,game_id,game_name,concat(substr(updated_time,1,10),' 00:00:00') switch_time,updated_time
from xxx_live_history_status
where substr(switch_time,1,10)<>substr(updated_time,1,10)
union all
select room_id,game_id,game_name,switch_time,updated_time
from xxx_live_history_status
where substr(switch_time,1,10)=substr(updated_time,1,10)) x
where substr(updated_time,1,10) between '2017-12-08' and '2018-01-08'
)
select a1.room_id,a1.game_id,a1.game_name,a1.pt_day live_date,a1.live_minutes,'rangetime(0~1)' hourtype
from tab_live_time a1
inner join xxxx_liuyl_first a2 on a1.room_id=a2.room_id and a1.game_id=a2.game_id;
4、具体跑出时段覆盖的python脚本
/Users/nisj/PycharmProjects/BiDataProc/love/HiveDataSum2Mysql.py
# -*- coding=utf-8 -*-
import os
import re
import time
import warnings

warnings.filterwarnings("ignore")

srcMysqlConfig_jellyfish_hadoop_stat = {
    'host': 'MysqlHost',
    'user': 'MysqlUser',
    'passwd': 'MysqlPass',
    'port': MysqlPort,
    'db': 'funnyai_data'
}

def hiveDataSum2Mysql_liuyl(time_start, time_end):
    # 参数初始化赋值
    host = srcMysqlConfig_jellyfish_hadoop_stat['host']
    port = srcMysqlConfig_jellyfish_hadoop_stat['port']
    user = srcMysqlConfig_jellyfish_hadoop_stat['user']
    passwd = srcMysqlConfig_jellyfish_hadoop_stat['passwd']
    db = srcMysqlConfig_jellyfish_hadoop_stat['db']

    os.system("""source /etc/profile; \
             /usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
             delete from xxxxxx_liuyl where hourtype='{hourtype}'; " """.format(host=host, port=port, user=user, passwd=passwd, db=db, hourtype='rangetime({time_start}~{time_end})'.format(time_start=time_start, time_end=time_end)))

    Sum_Data = os.popen("""/usr/lib/hive-current/bin/hive -e " \
            with tab_live_time as( \
            select room_id,game_id,game_name,substr(updated_time,1,10) pt_day, \
            case when substr(switch_time,12,8) between '{time_start}' and '{time_end}' and substr(updated_time,12,8)>='{time_end}' then (unix_timestamp(concat(substr(updated_time,1,11),'{time_end}'))-unix_timestamp(switch_time))/60 \
                 when substr(switch_time,12,8) between '{time_start}' and '{time_end}' and substr(updated_time,12,8) between '{time_start}' and '{time_end}' then (unix_timestamp(updated_time)-unix_timestamp(switch_time))/60 \
                 when substr(switch_time,12,8) <= '{time_start}' and substr(updated_time,12,8) between '{time_start}' and '{time_end}' then (unix_timestamp(updated_time)-unix_timestamp(concat(substr(switch_time,1,11),'{time_start}')))/60 \
                 when substr(switch_time,12,8) >= '{time_start}' and substr(updated_time,12,8) <= '{time_end}' then (unix_timestamp(concat(substr(updated_time,1,11),'{time_end}'))-unix_timestamp(concat(substr(switch_time,1,11),'{time_start}')))/60 \
            else 0 end live_minutes \
            from (select room_id,game_id,game_name,switch_time,concat(substr(switch_time,1,10),' 23:59:59') updated_time \
            from xxx_live_history_status \
            where substr(switch_time,1,10)<>substr(updated_time,1,10) \
            union all \
            select room_id,game_id,game_name,concat(substr(updated_time,1,10),' 00:00:00') switch_time,updated_time \
            from xxx_live_history_status \
            where substr(switch_time,1,10)<>substr(updated_time,1,10) \
            union all \
            select room_id,game_id,game_name,switch_time,updated_time \
            from xxx_live_history_status \
            where substr(switch_time,1,10)=substr(updated_time,1,10)) x \
            where substr(updated_time,1,10) between '2017-12-08' and '2018-01-08' \
            ) \
            select a1.room_id,a1.game_id,a1.game_name,a1.pt_day live_date,a1.live_minutes,'rangetime({time_start}~{time_end})' hourtype \
            from tab_live_time a1 \
            inner join xxxx_liuyl_first a2 on a1.room_id=a2.room_id and a1.game_id=a2.game_id; \
            " """.format(time_start=time_start, time_end=time_end)).readlines();

    Sum_Data_list = []
    for sum_list in Sum_Data:
        sum = re.split('\t', sum_list.replace('\n', ''))
        Sum_Data_list.append(sum)

    i = 0
    insert_mysql_sql = """/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
           insert into xxxxxx_liuyl(roomid,gameid,gamename,live_date,live_minutes,hourtype) \
           values """.format(host=host, port=port, user=user, passwd=passwd, db=db)
    for sumd in Sum_Data_list:
        roomid = sumd[0]
        gameid = sumd[1]
        gamename = sumd[2]
        live_date = sumd[3]
        live_minutes = sumd[4]
        hourtype = sumd[5]
        etl_time = time.strftime('%Y-%m-%d %X', time.localtime())

        i += 1
        insert_mysql_sql = insert_mysql_sql + """('{roomid}','{gameid}','{gamename}','{live_date}','{live_minutes}','{hourtype}'),""".format(host=host, port=port, user=user, passwd=passwd, db=db, roomid=roomid, gameid=gameid, gamename=gamename, live_date=live_date, live_minutes=live_minutes, hourtype=hourtype, etl_time=etl_time)
        if (i % 1000 == 0):
            insert_mysql_sql = insert_mysql_sql.rstrip(',') + """ ;" """
            os.system(insert_mysql_sql)

            insert_mysql_sql = """/usr/bin/mysql -h{host} -P{port} -u{user} -p{passwd} -e "set names utf8;use {db}; \
                   insert into xxxxxx_liuyl(roomid,gameid,gamename,live_date,live_minutes,hourtype) \
                   values """.format(host=host, port=port, user=user, passwd=passwd, db=db)

    insert_mysql_sql = insert_mysql_sql.rstrip(',') + """ ;" """
    os.system(insert_mysql_sql)

hourTypeList = [('00:00:00', '00:59:59'),
                ('01:00:00', '01:59:59'),
                ('02:00:00', '02:59:59'),
                ('03:00:00', '03:59:59'),
                ('04:00:00', '04:59:59'),
                ('05:00:00', '05:59:59'),
                ('06:00:00', '06:59:59'),
                ('07:00:00', '07:59:59'),
                ('08:00:00', '08:59:59'),
                ('09:00:00', '09:59:59'),
                ('10:00:00', '10:59:59'),
                ('11:00:00', '11:59:59'),
                ('12:00:00', '12:59:59'),
                ('13:00:00', '13:59:59'),
                ('14:00:00', '14:59:59'),
                ('15:00:00', '15:59:59'),
                ('16:00:00', '16:59:59'),
                ('17:00:00', '17:59:59'),
                ('18:00:00', '18:59:59'),
                ('19:00:00', '19:59:59'),
                ('20:00:00', '20:59:59'),
                ('21:00:00', '21:59:59'),
                ('22:00:00', '22:59:59'),
                ('23:00:00', '23:59:59')]
for hourType in hourTypeList:
    print hourType[0], hourType[1]
    hiveDataSum2Mysql_liuyl(time_start=hourType[0], time_end=hourType[1])
5、第二步的结果查询脚本
--时段内单次直播过40分钟
select gameid,gamename,roomid,hourtype,count(*) cnt
from xxxxxx_liuyl 
where live_minutes>=40
group by gameid,gamename,roomid,hourtype
having count(*)>=20
order by gameid,hourtype;
--时段内总直播过40分钟
select gameid,gamename,roomid,hourtype,count(*) cnt
from (select gameid,gamename,roomid,live_date,hourtype,sum(live_minutes) live_minutes
from xxxxxx_liuyl 
group by gameid,gamename,roomid,live_date,hourtype
having sum(live_minutes)>=40) x
group by gameid,gamename,roomid,hourtype
having count(*)>=20
order by gameid,hourtype;



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值