今天做项目,要计算用户的标签覆盖率,就是实现这样一个表格:
根据用户的最近访问时间,计算这些用户含有某个标签属性的占比。
其实做起来不难,但是如何实现需要设计一个思路。
Sql思路:
1、先根据日期筛选用户
2、计算每个标签不为空的数量/该日期内的用户总数
其实sql做起来很简单,但是我们进行简单的优化
1、将该日期内的用户总数通过with加载到内存
2、通过脚本对各个日期维度进行计算,避免写重复sql,最后再对表进行合并
代码:
1、根据日期筛选用户
DROP TABLE IF EXISTS `profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}`;
CREATE TABLE `profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}` as
select user_id
,registration_date
,gender
,age
,birthday
,last_order_date
,device_id
,operator
,pushtype
,nearby_city_code
from profile_v2.user_member_stat_v1
where dt='${hivevar:yesterday}'
and nearby_update_time is not null
and nearby_update_time>='${hivevar:scope_date}';
2、计算标签覆盖率(通过with加载总数到内存)
DROP TABLE IF EXISTS `profile_v2.tmp_lable_coverage_rate_result_${hivevar:scope}`;
CREATE TABLE `profile_v2.tmp_lable_coverage_rate_result_${hivevar:scope}` as
with
cnts as (
select count(1) as cnt from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
),
result as (
select t_user_id.user_id_cnt
,t_registration_date.registration_date_cnt
,t_gender.gender_cnt
,t_age.age_cnt
,t_birthday.birthday_cnt
,t_last_order_date.last_order_date_cnt
,t_device_id.device_id_cnt
,t_operator.operator_cnt
,t_pushtype.pushtype_cnt
,t_nearby_city_code.nearby_city_code_cnt
from
(select count(*) as user_id_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where user_id!='-999' and user_id is not null
) t_user_id
join
(select count(*) as registration_date_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where registration_date!='-999' and registration_date is not null
) t_registration_date
join
(select count(*) as gender_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where gender!='-999' and gender is not null and gender!=0
) t_gender
join
(select count(*) as age_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where age!='-999' and age is not null
) t_age
join
(select count(*) as birthday_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where birthday!='-999' and birthday is not null
) t_birthday
join
(select count(*) as last_order_date_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where last_order_date!='-999' and last_order_date is not null
) t_last_order_date
join
(select count(*) as device_id_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where device_id!='-999' and device_id is not null
) t_device_id
join
(select count(*) as operator_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where operator!='-999' and operator is not null
) t_operator
join
(select count(*) as pushtype_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where pushtype!='-999' and pushtype is not null
) t_pushtype
join
(select count(*) as nearby_city_code_cnt
from profile_v2.tmp_lable_coverage_rate_table_${hivevar:scope}
where nearby_city_code!='-999' and nearby_city_code is not null
) t_nearby_city_code
)
select cnts.cnt as table_count${hivevar:scope}
,result.user_id_cnt as user_id_cnt${hivevar:scope}
,round(result.user_id_cnt/cnts.cnt,2) as user_id_rate${hivevar:scope}
,result.registration_date_cnt as registration_date_cnt${hivevar:scope}
,round(result.registration_date_cnt/cnts.cnt,2) as registration_date_rate${hivevar:scope}
,result.gender_cnt as gender_cnt${hivevar:scope}
,round(result.gender_cnt/cnts.cnt,2) as gender_rate${hivevar:scope}
,result.age_cnt as age_cnt${hivevar:scope}
,round(result.age_cnt/cnts.cnt,2) as age_rate${hivevar:scope}
,result.birthday_cnt as birthday_cnt${hivevar:scope}
,round(result.birthday_cnt/cnts.cnt,2) as birthday_rate${hivevar:scope}
,result.last_order_date_cnt as last_order_date_cnt${hivevar:scope}
,round(result.last_order_date_cnt/cnts.cnt,2) as last_order_date_rate${hivevar:scope}
,result.device_id_cnt as device_id_cnt${hivevar:scope}
,round(result.device_id_cnt/cnts.cnt,2) as device_id_rate${hivevar:scope}
,result.operator_cnt as operator_cnt${hivevar:scope}
,round(result.operator_cnt/cnts.cnt,2) as operator_rate${hivevar:scope}
,result.pushtype_cnt as pushtype_cnt${hivevar:scope}
,round(result.pushtype_cnt/cnts.cnt,2) as pushtype_rate${hivevar:scope}
,result.nearby_city_code_cnt as nearby_city_code_cnt${hivevar:scope}
,round(result.nearby_city_code_cnt/cnts.cnt,2) as nearby_city_code_rate${hivevar:scope}
from cnts,result;
3、编写执行脚本
#coding:utf-8
#@auth: 小菜菜1223
import os
import datetime
FLAGS = [3, 7, 15, 30, 90, 180, 360, 720]
#获取之后日期
def get_day(st,dt,format="%Y%m%d"):
day = st + datetime.timedelta(days=dt)
return day.strftime(format)
#日期格式转换
def format_date(t):
return datetime.date(int(t[0:4]),int(t[4:6]),int(t[6:]))
#执行shell命令
def execute_shell(shell_cmd):
result = os.system(shell_cmd)
if result != 0:
print("%s cmd execute failed!"%shell_cmd)
raise Exception("%s cmd execute failed!"%shell_cmd)
print("%s cmd execute success!"%shell_cmd)
#计算两个日期之间的时间差
def calculate_delta_t(st, et):
st = datetime.date(int(st[0:4]),int(st[4:6]),int(st[6:]))
et = datetime.date(int(et[0:4]),int(et[4:6]),int(et[6:]))
return (et - st).days
if __name__ == '__main__':
today = datetime.datetime.today()
yesterday = get_day(today, -1)
for i in FLAGS:
scope = i
scope_date = get_day(format_date(yesterday), -i, "%Y-%m-%d")
print(scope_date)
shell = "hive --hivevar yesterday=%s --hivevar scope=%s --hivevar scope_date=%s -f "%(yesterday, scope, scope_date)
execute_shell(shell)
print("success!")
4、数据合并
DROP TABLE IF EXISTS `profile_v2.tmp_lable_coverage_rate_result_all`;
CREATE TABLE `profile_v2.tmp_lable_coverage_rate_result_all` as
select * from(
(select * from profile_v2.tmp_lable_coverage_rate_result_3) a1
join
(select * from profile_v2.tmp_lable_coverage_rate_result_7) a2
join
(select * from profile_v2.tmp_lable_coverage_rate_result_15) a3
join
(select * from profile_v2.tmp_lable_coverage_rate_result_30) a4
join
(select * from profile_v2.tmp_lable_coverage_rate_result_90) a5
join
(select * from profile_v2.tmp_lable_coverage_rate_result_180) a6
join
(select * from profile_v2.tmp_lable_coverage_rate_result_360) a7
join
(select * from profile_v2.tmp_lable_coverage_rate_result_720) a8
)
;