特征工程中用到了tf-idf,数据是游戏内行为数据、event_name为用户行为、
select
A.advertising_id,
CONCAT(A.event_name,'_tfidf'),
tf*idf as tf_idf
from
(
select
t1.advertising_id,
event_name,
tf_a,
tf_b,
if(tf_b=0,0,tf_a/tf_b) as tf
from
(
select
advertising_id,
event_name,
count(*) as tf_a
from
yotta.data_src_clean_android
where
event_date > '2019-06-14'
and
event_date <='2019-06-27'
group by advertising_id,event_name
)t1
join
(
select
advertising_id,
count(*) as tf_b
from
yotta.data_src_clean_android
where
event_date > '2019-06-14'
and
event_date <='2019-06-27'
group by advertising_id
)t2
on
(t1.advertising_id = t2.advertising_id)
)A
join
(
select
event_name,
idf_a,
idf_b,
log2(idf_a/(idf_b+1)) as idf
from
(
select
event_name,
count(distinct advertising_id) as idf_b,
1 as tmp
from
yotta.data_src_clean_android
where
event_date > '2019-06-14'
and
event_date <='2019-06-27'
group by
event_name
)t1
join
(
select
count(distinct advertising_id) as idf_a,
1 as tmp
from
yotta.data_src_clean_android
where
event_date > '2019-06-14'
and
event_date <='2019-06-27'
)t2
on
(t1.tmp = t2.tmp)
)B
on
(A.event_name = B.event_name)
统计每个用户的关键event_name即每个用户在游戏内的关键行为。
参考: