离线Hive数据仓库学习 归一化分析

--  (x - min) / (max - min)  归一化
create temporary macro percentageValue(value double, min_value double, max_value double)
    case
        when value is null or min_value = max_value then 0.5
        else (value-min_value)/(max_value-min_value)
        end;

desc user_friend_event;

create table train_date
    row format delimited
fields terminated by ","
stored as textfile
as
with pre_tain_data as
         (
             select min(user_age)                      as user_min_age,
                    max(user_age)                      as user_max_age,
                    min(user_member_days)              as user_min_member_days,
                    max(user_member_days)              as user_max_member_days,
                    min(user_friend_count)             as user_min_friend_count,
                    max(user_friend_count)             as user_max_friend_count,
                    min(user_invited_event_count)      as user_min_invited_event_count,
                    max(user_invited_event_count)      as user_max_invited_event_count,
                    min(invite_ahead_days)             as invited_min_ahead_days,
                    max(invite_ahead_days)             as invited_max_ahead_days,
                    min(event_city_level)              as event_min_city_level,
                    max(event_city_level)              as event_max_city_level,
                    min(event_country_level)           as event_min_country_level,
                    max(event_country_level)           as event_max_country_level,
                    min(event_lat)                     as event_min_lat,
                    max(event_lat)                     as event_max_lat,
                    min(event_lng)                     as event_min_lng,
                    max(event_lng)                     as event_max_lng,
                    min(event_start_month)             as event_min_start_month,
                    max(event_start_month)             as event_max_start_month,
                    min(event_start_dayofweek)         as event_min_start_dayofweek,
                    max(event_start_dayofweek)         as event_max_start_dayofweek,
                    min(event_start_hour)              as event_min_start_hour,
                    max(event_start_hour)              as event_max_start_hour,
                    min(start_ahead_days)              as start_min_ahead_days,
                    max(start_ahead_days)              as start_max_ahead_days,
                    min(invited_friends_count)         as invited_min_friends_count,
                    max(invited_friends_count)         as invited_max_friends_count,
                    min(maybe_friends_count)           as maybe_min_friends_count,
                    max(maybe_friends_count)           as maybe_max_friends_count,
                    min(yes_friends_count)             as yes_min_friends_count,
                    max(yes_friends_count)             as yes_max_friends_count,
                    min(no_friends_count)              as no_min_friends_count,
                    max(no_friends_count)              as no_max_friends_count,
                    min(user_had_event_count)          as user_min_had_event_count,
                    max(user_had_event_count)          as user_max_had_event_count,
                    min(user_maybe_attend_event_count) as user_min_maybe_attend_event_count,
                    max(user_maybe_attend_event_count) as user_max_maybe_attend_event_count,
                    min(user_no_attend_event_count)    as user_min_no_attend_event_count,
                    max(user_no_attend_event_count)    as user_max_no_attend_event_count,
                    min(user_yes_attend_event_count)   as user_min_yes_attend_event_count,
                    max(user_yes_attend_event_count)   as user_max_yes_attend_event_count,
                    min(user_locale)                   as user_min_locale,
                    max(user_locale)                   as user_max_locale,
                    min(user_time_zone)                as user_min_time_zone,
                    max(user_time_zone)                as user_max_time_zone,
                    min(event_invited_user_count)      as event_min_invited_user_count,
                    max(event_invited_user_count)      as event_max_invited_user_count,
                    min(event_yes_user_count)          as event_min_yes_user_count,
                    max(event_yes_user_count)          as event_max_yes_user_count,
                    min(event_no_user_count)           as event_min_no_user_count,
                    max(event_no_user_count)          as event_max_no_user_count,
                    min(event_maybe_user_count)        as event_min_maybe_user_count,
                    max(event_maybe_user_count)          as event_max_maybe_user_count
             from user_friend_event
         )
select
    t.userid,
    t.eventid,
    t.user_interested,
    t.user_invited,
    percentageValue(t.event_city_level, s.event_min_city_level, s.event_max_city_level) as event_city_level,
    percentageValue(t.event_country_level, s.event_min_country_level, s.event_max_country_level) as event_country_level,
    percentageValue(t.event_lat, s.event_min_lat, s.event_max_lat) as event_lat,
    percentageValue(t.event_lng, s.event_min_lng, s.event_max_lng) as event_lng,
    percentageValue(t.invite_ahead_days, s.invited_min_ahead_days, s.invited_max_ahead_days) as invite_ahead_days,
    percentageValue(t.event_start_month, s.event_max_start_month, s.event_min_start_month) as event_start_month,
    percentageValue(t.event_start_dayofweek, s.event_max_start_dayofweek, s.event_min_start_dayofweek) as event_start_dayofweek,
    percentageValue(t.event_start_hour, s.event_max_start_hour, s.event_min_start_hour) as event_start_hour,
    percentageValue(t.start_ahead_days, s.start_min_ahead_days, s.start_max_ahead_days) as start_ahead_days,
    percentageValue(t.user_friend_count, s.user_min_friend_count, s.user_max_friend_count) as user_friend_count,
    t.event_creator_is_friend,
    percentageValue(t.invited_friends_count, s.invited_min_friends_count, s.invited_max_friends_count) as invited_friends_count,
    percentageValue(t.maybe_friends_count, s.maybe_min_friends_count, s.maybe_max_friends_count) as maybe_friends_count,
    percentageValue(t.yes_friends_count, s.yes_min_friends_count, s.yes_max_friends_count) as yes_friends_count,
    percentageValue(t.no_friends_count, s.no_min_friends_count, s.no_max_friends_count) as no_friends_count,
    t.invited_friends_percentage,
    t.yes_friends_percentage,
    t.no_friends_percentage,
    t.maybe_friends_percentage,
    percentageValue(t.user_age, s.user_min_age, s.user_max_age) as user_age,
    t.user_gender,
    percentageValue(t.user_member_days, s.user_min_member_days, s.user_max_member_days) as user_member_days,
    t.location_similar,
    percentageValue(t.user_had_event_count, s.user_min_had_event_count, s.user_max_had_event_count) as user_had_event_count,
    percentageValue(t.user_invited_event_count, s.user_min_invited_event_count, s.user_max_invited_event_count) as user_invited_event_count,
    percentageValue(t.user_maybe_attend_event_count, s.user_min_maybe_attend_event_count, s.user_max_maybe_attend_event_count) as user_maybe_attend_event_count,
    percentageValue(t.user_no_attend_event_count, s.user_min_no_attend_event_count, s.user_max_no_attend_event_count) as user_no_attend_event_count,
    percentageValue(t.user_yes_attend_event_count, s.user_min_yes_attend_event_count, s.user_max_yes_attend_event_count) as user_yes_attend_event_count,
    percentageValue(t.user_locale, s.user_min_locale, s.user_max_locale) as user_locale,
    percentageValue(t.user_time_zone, s.user_min_time_zone, s.user_max_time_zone) as user_time_zone,
    percentageValue(t.event_invited_user_count, s.event_min_invited_user_count, s.event_max_invited_user_count) as event_invited_user_count,
    percentageValue(t.event_yes_user_count, s.event_min_yes_user_count, s.event_max_yes_user_count) as event_yes_user_count,
    percentageValue(t.event_no_user_count, s.event_min_no_user_count, s.event_max_no_user_count) as event_no_user_count,
    percentageValue(t.event_maybe_user_count, s.event_min_maybe_user_count, s.event_max_maybe_user_count) as event_maybe_user_count
from dm_attendee.user_friend_event t cross join pre_tain_data s;


select count(*) from train_date;
select * from user_friend_event where user_locale is null ;

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
足球比赛基于Hive数据仓库的数据分析是指通过使用Hive这个分布式数据仓库工具来进行足球比赛相关数据的分析和处理。 首先,Hive作为一个基于Hadoop数据仓库工具,可以存储和处理大量的结构化和半结构化数据。足球比赛的数据通常包括球员的统计数据、比赛结果、比赛事件等。这些数据可以直接导入Hive中进行存储和管理。 然后,通过使用Hive提供的SQL-like查询语言HQL,可以对足球比赛数据进行灵活的查询和分析。可以通过编写HQL查询语句,按照不同的维度和指标对比赛数据进行筛选和聚合。例如,可以根据球员的表现数据进行数据分析,比如评估球员的得分、助攻和射门次数等指标,从而得出球员的表现情况,为球队调整和战术安排提供参考。 此外,Hive还支持数据的可视化和报告生成。可以使用Hive提供的可视化工具或将Hive导出的数据传递给其他数据可视化工具(如Tableau或Power BI)来进行更加直观和易于理解的数据展示。这样可以方便足球分析师或球队管理者对比赛数据进行整体和细分的分析,更好地了解球队和球员的状态和特点。 总之,足球比赛基于Hive数据仓库的数据分析是通过使用Hive这个强大的数据仓库工具,对足球比赛相关数据进行存储、管理、查询和分析,从而为球队和分析师提供决策参考,优化球队的表现和战术安排。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值