一、为什么要做渠道质量评价体系:
结合推广运营目的,评估用户渠道的好坏,好的渠道要拓量,较差的渠道要控制;
二、如何做:
1.根据项目所处阶段构建渠道质量评价指标体系:
新项目-拉新:此阶段尽可能多的获取新用户,新增80%,活跃20%,注意砍掉大量作弊用户,可允许少部分;
下载量、登录量、注册量、登录率、注册率。。。
发展中项目-活跃:新增40%+活跃40%+营收20%;
日活、月活、次留、7留、30留、人均使用时长。。。
成熟项目-营收:新增20%-活跃40%+营收40%;
成本、收入、ROI、创收人数占比。。。
2.数据准备:
1)搜集第三方数据(主要是下载量及部分收入等)及自己数据库数据;
2)计算指标并归一化(Min-Max归一化);
3)按权重计算总得分;
三、效果:
市场部快速决策,T+3(可T+1)确定优劣渠道
"""
date:2020-11-03
@author: zhuweisj
theme:Browser_app_channels_evaluation
coding:utf-8
"""
# 导入相关包
import numpy as np
from numpy import float64
import pandas as pd
import lightgbm as lgbm
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import timedelta
import os
import time
from datetime import datetime
import warnings
# 忽略warning
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.2f}'.format
#更改当前工作目录路径
# os.chdir(r'your path')
def channel_evaluation_algo(project,calc_date):
# temp_result_tb = 'dp_dm.dw_channel_evaluation_{}'.format(project)
# result_tb = 'dp_dm.dw_channel_evaluation_{}_all'.format(project)
date = (calc_date - timedelta(days=4)).strftime(f"%Y-%m-%d")
print("channel_evaluation_algo开始计算:项目:{}, 日期:{}".format(project, date))
spark = SparkSession.builder.master("yarn").enableHiveSupport().getOrCreate()
sql = '''select s1.p_dt as p_dt
,s1.attribution_channel_id as attribution_channel_id
,attribution_new_uv
,retain_1d
,retain_2d
,retain_3d
,income
,renjunshichang
,avg_launch_time
,attrib_single_cost
,cast(sousuocishu/sousuo as decimal(5,2)) as sousuocishu
,cast(yueducishu/yuedu as decimal(5,2)) as yueducishu
,cast(xiaoshipincishu/xiaoshipin as decimal(5,2)) as xiaoshipincishu
,cast(xiaoshuocishu/xiaoshuo as decimal(5,2)) as xiaoshuocishu
,cast(lingxianjincishu/lingxianjin as decimal(5,2)) as lingxianjincishu
,cast(duanshipincishu/duanshipin as decimal(5,2)) as duanshipincishu
from
(select p_dt,attribution_channel_id,attribution_new_uv
,cast(retain_1d_uv/attribution_new_uv as decimal(5,2)) as retain_1d
,cast(retain_2d_uv/attribution_new_uv as decimal(5,2)) as retain_2d
,cast(retain_3d_uv/attribution_new_uv as decimal(5,2)) as retain_3d
from
(select p_dt,attribution_channel_id
,sum((case when is_black_attribution_channel=0 then attribution_new_uv end )) as attribution_new_uv
,sum(retain_1d_uv) as retain_1d_uv
,sum(retain_2d_uv) as retain_2d_uv
,sum(retain_3d_uv) as retain_3d_uv
from dm_db.dm_common_mobile_device_retains_attribution_aggr_d
where p_product = 'browser_app'
and p_dt = ''' + "'" + date + "'" + '''
and attribution_new_uv is not NULL
and attribution_new_uv <> 0
and attribution_channel_id not in
(select distinct channel from dw_db.dw_browser_app_mb_brush_cheat_blacklist
where p_dt = '2020-09-21')
group by p_dt,attribution_channel_id)t)s1
left join
(select t3.p_dt,t3.attribution_channel_id,count(t3.device_id) as renshu,sum(t3.use_time) as shichang
,cast(sum(t3.use_time)/count(t3.device_id)as decimal(10,2)) as renjunshichang
from
(select t2.p_dt,t1.attribution_channel_id,t1.device_id,t2.use_time
from
(select p_dt,attribution_channel_id,device_id
from dw_db.dw_common_mobile_device_user_mapping
where p_product = 'browser_app'
and first_date = p_dt
and p_dt = ''' + "'" + date + "'" + '''
group by p_dt,attribution_channel_id,device_id)t1
left join
(select
a.p_dt,
a.device_id,
sum(cast(coalesce(get_json_object(a.json, '$.duration'), 0) as bigint))as use_time
from dw_db.dw_common_mobile_use_log as a
where a.p_product='browser_app'
and a.p_dt = ''' + "'" + date + "'" + '''
--and cast(coalesce(get_json_object(a.json, '$.duration'), 0) as bigint)>900
and cast(coalesce(get_json_object(a.json, '$.duration'), 0) as bigint)<7200
and a.device_id<>''
group by a.p_dt,a.device_id)t2
on t1.device_id = t2.device_id and t1.p_dt = t2.p_dt
where t2.device_id is not null)t3
group by t3.p_dt,t3.attribution_channel_id)s2
on s1.p_dt = s2.p_dt and s1.attribution_channel_id = s2.attribution_channel_id
left join
(select
t1.p_dt
,t1.attribution_channel_id
,count(case when t2.event_name = 'toptitlebar_search' then t1.device_id else null end) sousuo
,count(case when t2.event_name in ('news_item_click','newssdk_all_news_item_click') then t1.device_id else null end) yuedu
,count(case when t2.event_name = 'navbar_click_smallvideo' then t1.device_id else null end) xiaoshipin
,count(case when t2.event_name = 'navbar_click_h5_http://t.mokayd.com/s/it8ym6y0qo' then t1.device_id else null end) xiaoshuo
,count(case when t2.event_name = 'navbar_click_taskpage' then t1.device_id else null end) lingxianjin
,count(case when t2.event_name = 'navbar_click_briefvideo' then t1.device_id else null end) duanshipin
,sum(case when t2.event_name = 'toptitlebar_search' then t2.event_counts else 0 end) sousuocishu
,sum(case when t2.event_name in ('news_item_click','newssdk_all_news_item_click') then t2.event_counts else 0 end) yueducishu
,sum(case when t2.event_name = 'navbar_click_smallvideo' then t2.event_counts else 0 end) xiaoshipincishu
,sum(case when t2.event_name = 'navbar_click_h5_http://t.mokayd.com/s/it8ym6y0qo' then t2.event_counts else 0 end) xiaoshuocishu
,sum(case when t2.event_name = 'navbar_click_taskpage' then t2.event_counts else 0 end) lingxianjincishu
,sum(case when t2.event_name = 'navbar_click_briefvideo' then t2.event_counts else 0 end) duanshipincishu
from
(select p_dt,device_id,attribution_channel_id
from dw_db.dw_common_mobile_device_user_mapping
where p_product = 'browser_app'
and p_dt = first_date
and p_dt = ''' + "'" + date + "'" + ''')t1
left join
(select p_dt,device_id,event_name,sum(event_counts) event_counts
from dw_db.dw_common_mobile_click_log
where p_product = 'browser_app'
and p_dt = ''' + "'" + date + "'" + '''
and event_name in(
'toptitlebar_search'
,'newssdk_all_news_item_click'
,'news_item_click'
,'navbar_click_smallvideo'
,'navbar_click_h5_http://t.mokayd.com/s/it8ym6y0qo'
,'navbar_click_taskpage'
,'navbar_click_briefvideo'
)
and event_counts > 0
group by p_dt,device_id,event_name)t2
on t1.device_id = t2.device_id and t1.p_dt = t2.p_dt
group by t1.p_dt,t1.attribution_channel_id)s3
on s1.p_dt = s3.p_dt and s1.attribution_channel_id = s3.attribution_channel_id
left join
(select attribution_channel_id
,attribution_channel_type
,avg(attrib_single_cost) as attrib_single_cost
,p_dt
from dm_db.dm_common_appunion_info_export2ck
where p_product = 'browser_app'
and attribution_channel_cooperate_status_name in ('正式','测试')
and p_dt = ''' + "'" + date + "'" + '''
group by attribution_channel_id
,attribution_channel_type
,p_dt)s4
on s1.p_dt = s4.p_dt and s1.attribution_channel_id = s4.attribution_channel_id
left join
(select p_dt
,attribution_channel_id
,count(distinct device_id) as num
,sum(launch_num) as launch_num
,cast(sum(launch_num)/count(distinct device_id) as decimal(5,1)) as avg_launch_time
from dw_db.dw_m_common_mobile_device_devices
where p_product = 'browser_app'
and is_new = 1
and p_dt = ''' + "'" + date + "'" + '''
group by p_dt,attribution_channel_id)s5
on s1.p_dt = s5.p_dt and s1.attribution_channel_id = s5.attribution_channel_id
left join
(select p_dt,attribution_channel_id,income
from dm_db.dm_common_roi_attribute_roi_export2ck
where p_product = 'browser_app'
and p_dt = ''' + "'" + date + "'" + ''')s6
on s1.p_dt = s6.p_dt and s1.attribution_channel_id = s6.attribution_channel_id
'''
predict_data = spark.sql(sql).toPandas()
new_uv_data = pd.read_table("all_data_right20201106.txt",index_col=[0],parse_dates=[0], delimiter=',')
# predict_data = pd.read_csv("1010~1109.csv",index_col=[0],parse_dates=[0])
# 筛选渠道日均新增50以上
channel_select = pd.DataFrame(new_uv_data.groupby('attribution_channel_id')['attribution_new_uv'].mean())
select_channel_01 = list(channel_select.iloc[(np.where(channel_select.attribution_new_uv >= 50))].index)
# 筛选渠道时间窗在2020-09-01~2020-09-21之间
temp_date_min = pd.DataFrame(new_uv_data.reset_index().groupby('attribution_channel_id')['p_dt'].min()).rename(columns={'p_dt':'min_p_dt'})
temp_date_max = pd.DataFrame(new_uv_data.reset_index().groupby('attribution_channel_id')['p_dt'].max()).rename(columns={'p_dt':'max_p_dt'})
temp_date_min_max = pd.concat([temp_date_min, temp_date_max], join='inner', axis = 1)
select_channel_02 = []
for channel_id in select_channel_01:
if (temp_date_min_max.loc[channel_id]['min_p_dt'] == datetime(2020,9,1)) & (temp_date_min_max.loc[channel_id]['max_p_dt'] == datetime(2020,9,21)):
select_channel_02.append(channel_id)
channel_data = pd.DataFrame(select_channel_02,columns = ['channel_id'])
# 根据选定的渠道,筛选相关渠道指标数据
all_data = new_uv_data[new_uv_data['attribution_channel_id'].isin(x for x in select_channel_02)]
all_data.fillna(0, inplace=True)
all_data_select = all_data[['attribution_channel_id','attribution_new_uv', 'retain_1d',
'retain_2d', 'retain_3d', 'income', 'renjunshichang', 'avg_launch_time',
'attrib_single_cost', 'sousuocishu', 'yueducishu', 'xiaoshipincishu',
'xiaoshuocishu', 'lingxianjincishu', 'duanshipincishu', 'score']]
predict_data = predict_data.fillna(0)
predict_data1 = predict_data.iloc[(np.where(predict_data.attribution_new_uv >= 50))].sort_values(['p_dt'])
predict_data1 = pd.DataFrame(predict_data1)
predict_data2 = predict_data1.iloc[:, 2:15]
predict_data2= pd.DataFrame(predict_data2,dtype=np.float64)
predict_data1.set_index(['attribution_channel_id'],inplace = True)
errors = []
# best_params = pd.DataFrame()
result = pd.DataFrame({'project': list(), 'channel_id': list(), 'score': list(),
'p_dt': list()}).set_index('p_dt', inplace=True)
pred_score = []
channel_id = []
p_dt = list(predict_data1['p_dt'])
for i in range(len(predict_data2)):
channel_data_select = all_data_select
X_train_valid = channel_data_select.iloc[:, 1:14]
y_train_valid = channel_data_select['score']
X_test=X_train_valid
y_test = y_train_valid
x_pre = predict_data2.iloc[[i]]
lgb_train = lgbm.Dataset(X_train_valid, y_train_valid)
lgb_eval = lgbm.Dataset(X_test, y_test, reference=lgb_train)
real_param = {
'boosting_type': 'gbdt',
'objective': 'regression',
'metric': {'l2', 'l1'},
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
}
gbm = lgbm.train(real_param,
lgb_train,
num_boost_round=20,
valid_sets=lgb_eval,
early_stopping_rounds=5)
print("gbm_train has been done!!!")
y_pred = gbm.predict(x_pre, num_iteration=gbm.best_iteration)[0]
pred_score.append(y_pred)
print("00001")
print(predict_data1.index[i])
channel_id.append(predict_data1.index[i])
result=pd.DataFrame({'project':'browser_app','channel_id': channel_id, 'score':pred_score,
'p_dt':p_dt})
# 将数据转成spark dataframe格式
schema = StructType([
StructField("project", StringType(), True),
StructField("channel_id", StringType(), True),
StructField("score", DoubleType(), True),
StructField("p_dt", StringType(), True)])
df = spark.createDataFrame(result, schema=schema)
first_run = "2"
if first_run != "1":
df.write.format("orc").mode("overwrite").saveAsTable('dp_dm.dw_channel_evaluation_browser_app')
df.write.format("orc").mode("append").partitionBy("p_dt").saveAsTable('dp_dm.dw_channel_evaluation_browser_app_all')
else:
df.write.format("orc").mode("overwrite").saveAsTable('dp_dm.dw_channel_evaluation_browser_app')
df.write.format("orc").mode("overwrite").partitionBy("p_dt").saveAsTable('dp_dm.dw_channel_evaluation_browser_app_all')
if __name__ == '__main__':
# 读取渠道新增及留存数据文件
time_now = datetime.now()
project = 'browser_app'
channel_evaluation_algo(project,time_now)