接1.12日的博客:
create external table locale(
locale_id int,
locale string
)
row format delimited
fields terminated by '\t'
location '/user/event/data/locale';
create external table time_zone(
time_zone_id int,
time_zone string
)
row format delimited
fields terminated by ','
location '/user/event/data/time_zone';
每个用户有多少个朋友?
user_friend
注意点,在重做的时候,记得user_friend表的数据是从hb_user_friend中导入的
create table user_friend
stored as ORC AS
select * from hb_user_friend
但是,bh_user_friend表的数据不是直接通过flume导入Kafka的user_friend_raw
而是通过Kafka stream处理后的数据!!!!
2020.12.08课堂笔记(从flume读取数据连接到Kafka中) 文末附有代码
否则user_friend_raw的形式是event,yes,maybe,invited,no而不是3列
第二个问题:
由于处理字段的时候 invited 的 attend_type 是 invite 而不是 invited
所以和课上的代码有出入,但效果是一样的,只是在类型的时候需要使用 invite
同样的 event_attendee 的数据也是经过Kafka stream 处理过的。
create table user_friend_count stored as orc as
select user_id,count(1) friend_count from user_friend group by user_id ;
这个事件event_id有多少个朋友user_id邀请了 来,不来,可能来
create table event_attendee_count stored as orc as
select event_id,attend_type,count(1) attend_count
from event_attendee
group by event_id,attend_type;
select event_id,user_id,
if(attend_type="invite",1,0) invited,
if(attend_type="yes",1,0) yes,
if(attend_type="no",1,0) no,
if(attend_type="maybe",1,0) maybe
from event_attendee
limit 10;
select event_id,user_id,
sum(if(attend_type="invite",1,0)) as invited,
sum(if(attend_type="maybe",1,0)) as maybe,
sum(if(attend_type="yes",1,0)) as yes,
sum(if(attend_type="no",1,0)) as no
from event_attendee
group by user_id,event_id;
第一个阶段:一条数据里面只可能有一个是1
select event_id,user_id,
case when attend_type="invite" then 1 else 0 end as invited,
case when attend_type="yes" then 1 else 0 end as attend,
case when attend_type="no" then 1 else 0 end as not_attend,
case when attend_type="maybe" then 1 else 0 end as maybe_attend
from event_attendee limit 10;
第二阶段:如果有两个相同的event_id和user_id就整到一起
create table user_attend_status
stored as orc as
select tmp.event_id,tmp.user_id as attend_user_id,
max(tmp.invited) as invited,
max(tmp.attended) as attended,
max(tmp.not_attended) as not_attended,
max(tmp.maybe_attended) as maybe_attended
from
(select event_id,user_id,
case when attend_type='invite' then 1 else 0 end as invited,
case when attend_type='yes' then 1 else 0 end as attended,
case when attend_type='no' then 1 else 0 end as not_attended,
case when attend_type='maybe' then 1 else 0 end as maybe_attended
from event_attendee) tmp
group by tmp.event_id,tmp.user_id;
第三阶段:分组求和生成一张新的表
create table user_attend_event_count
stored as orc as
select attend_user_id as user_id,
sum(invited) as invited_count,
sum(attended) as attended_count,
sum(not_attended) as not_attended,
sum(maybe_attended) as maybe_attended
from user_attend_status group by attend_user_id;
更改表名:
alter table user_attend_count rename to user_event_attend_count;
梳理一下现在有哪几张表了:
hive> desc user_friend_count;
OK
user_id string 用户
friend_count bigint 用户朋友的数量
hive> desc event_attendee_count;
OK
event_id string 事件ID
attend_type string 参与类型 4种情况
attend_count bigint 四种情况中 每种情况的数量
hive> desc user_attend_event_count;
OK
user_id string 被邀请人的用户id
invited_count bigint 被邀请的次数
attended_count bigint 参加的次数
not_attended bigint 不参加的次数
maybe_attended bigint 可能参加的次数
hive> desc user_attend_status;
OK
event_id string 事件ID
attend_user_id string 被邀请的用户ID
invited int 邀请用户的参与状态(邀请)
attended int 邀请用户的参与状态(参加)
not_attended int 邀请用户的参与状态(不参加)
maybe_attended int 邀请用户的参与状态(可能邀请)