-- (x - min) / (max - min) 归一化
create temporary macro percentageValue(value double, min_value double, max_value double)
case
when value is null or min_value = max_value then 0.5
else (value-min_value)/(max_value-min_value)
end;
desc user_friend_event;
create table train_date
row format delimited
fields terminated by ","
stored as textfile
as
with pre_tain_data as
(
select min(user_age) as user_min_age,
max(user_age) as user_max_age,
min(user_member_days) as user_min_member_days,
max(user_member_days) as user_max_member_days,
min(user_friend_count) as user_min_friend_count,
max(user_friend_count) as user_max_friend_count,
min(user_invited_event_count) as user_min_invited_event_count,
max(user_invited_event_count) as user_max_invited_event_count,
min(invite_ahead_days) as invited_min_ahead_days,
max(invite_ahead_days) as invited_max_ahead_days,
min(event_city_level) as event_min_city_level,
max(event_city_level) as event_max_city_level,
min(event_country_level) as event_min_country_level,
max(event_country_level) as event_max_country_level,
min(event_lat) as event_min_lat,
max(event_lat) as event_max_lat,
min(event_lng) as event_min_lng,
max(event_lng) as event_max_lng,
min(event_start_month) as event_min_start_month,
max(event_start_month) as event_max_start_month,
min(event_start_dayofweek) as event_min_start_dayofweek,
max(event_start_dayofweek) as event_max_start_dayofweek,
min(event_start_hour) as event_min_start_hour,
max(event_start_hour) as event_max_start_hour,
min(start_ahead_days) as start_min_ahead_days,
max(start_ahead_days) as start_max_ahead_days,
min(invited_friends_count) as invited_min_friends_count,
max(invited_friends_count) as invited_max_friends_count,
min(maybe_friends_count) as maybe_min_friends_count,
max(maybe_friends_count) as maybe_max_friends_count,
min(yes_friends_count) as yes_min_friends_count,
max(yes_friends_count) as yes_max_friends_count,
min(no_friends_count) as no_min_friends_count,
max(no_friends_count) as no_max_friends_count,
min(user_had_event_count) as user_min_had_event_count,
max(user_had_event_count) as user_max_had_event_count,
min(user_maybe_attend_event_count) as user_min_maybe_attend_event_count,
max(user_maybe_attend_event_count) as user_max_maybe_attend_event_count,
min(user_no_attend_event_count) as user_min_no_attend_event_count,
max(user_no_attend_event_count) as user_max_no_attend_event_count,
min(user_yes_attend_event_count) as user_min_yes_attend_event_count,
max(user_yes_attend_event_count) as user_max_yes_attend_event_count,
min(user_locale) as user_min_locale,
max(user_locale) as user_max_locale,
min(user_time_zone) as user_min_time_zone,
max(user_time_zone) as user_max_time_zone,
min(event_invited_user_count) as event_min_invited_user_count,
max(event_invited_user_count) as event_max_invited_user_count,
min(event_yes_user_count) as event_min_yes_user_count,
max(event_yes_user_count) as event_max_yes_user_count,
min(event_no_user_count) as event_min_no_user_count,
max(event_no_user_count) as event_max_no_user_count,
min(event_maybe_user_count) as event_min_maybe_user_count,
max(event_maybe_user_count) as event_max_maybe_user_count
from user_friend_event
)
select
t.userid,
t.eventid,
t.user_interested,
t.user_invited,
percentageValue(t.event_city_level, s.event_min_city_level, s.event_max_city_level) as event_city_level,
percentageValue(t.event_country_level, s.event_min_country_level, s.event_max_country_level) as event_country_level,
percentageValue(t.event_lat, s.event_min_lat, s.event_max_lat) as event_lat,
percentageValue(t.event_lng, s.event_min_lng, s.event_max_lng) as event_lng,
percentageValue(t.invite_ahead_days, s.invited_min_ahead_days, s.invited_max_ahead_days) as invite_ahead_days,
percentageValue(t.event_start_month, s.event_max_start_month, s.event_min_start_month) as event_start_month,
percentageValue(t.event_start_dayofweek, s.event_max_start_dayofweek, s.event_min_start_dayofweek) as event_start_dayofweek,
percentageValue(t.event_start_hour, s.event_max_start_hour, s.event_min_start_hour) as event_start_hour,
percentageValue(t.start_ahead_days, s.start_min_ahead_days, s.start_max_ahead_days) as start_ahead_days,
percentageValue(t.user_friend_count, s.user_min_friend_count, s.user_max_friend_count) as user_friend_count,
t.event_creator_is_friend,
percentageValue(t.invited_friends_count, s.invited_min_friends_count, s.invited_max_friends_count) as invited_friends_count,
percentageValue(t.maybe_friends_count, s.maybe_min_friends_count, s.maybe_max_friends_count) as maybe_friends_count,
percentageValue(t.yes_friends_count, s.yes_min_friends_count, s.yes_max_friends_count) as yes_friends_count,
percentageValue(t.no_friends_count, s.no_min_friends_count, s.no_max_friends_count) as no_friends_count,
t.invited_friends_percentage,
t.yes_friends_percentage,
t.no_friends_percentage,
t.maybe_friends_percentage,
percentageValue(t.user_age, s.user_min_age, s.user_max_age) as user_age,
t.user_gender,
percentageValue(t.user_member_days, s.user_min_member_days, s.user_max_member_days) as user_member_days,
t.location_similar,
percentageValue(t.user_had_event_count, s.user_min_had_event_count, s.user_max_had_event_count) as user_had_event_count,
percentageValue(t.user_invited_event_count, s.user_min_invited_event_count, s.user_max_invited_event_count) as user_invited_event_count,
percentageValue(t.user_maybe_attend_event_count, s.user_min_maybe_attend_event_count, s.user_max_maybe_attend_event_count) as user_maybe_attend_event_count,
percentageValue(t.user_no_attend_event_count, s.user_min_no_attend_event_count, s.user_max_no_attend_event_count) as user_no_attend_event_count,
percentageValue(t.user_yes_attend_event_count, s.user_min_yes_attend_event_count, s.user_max_yes_attend_event_count) as user_yes_attend_event_count,
percentageValue(t.user_locale, s.user_min_locale, s.user_max_locale) as user_locale,
percentageValue(t.user_time_zone, s.user_min_time_zone, s.user_max_time_zone) as user_time_zone,
percentageValue(t.event_invited_user_count, s.event_min_invited_user_count, s.event_max_invited_user_count) as event_invited_user_count,
percentageValue(t.event_yes_user_count, s.event_min_yes_user_count, s.event_max_yes_user_count) as event_yes_user_count,
percentageValue(t.event_no_user_count, s.event_min_no_user_count, s.event_max_no_user_count) as event_no_user_count,
percentageValue(t.event_maybe_user_count, s.event_min_maybe_user_count, s.event_max_maybe_user_count) as event_maybe_user_count
from dm_attendee.user_friend_event t cross join pre_tain_data s;
select count(*) from train_date;
select * from user_friend_event where user_locale is null ;
离线Hive数据仓库学习 归一化分析
于 2022-01-17 22:19:20 首次发布