内容目录
Hive实战小项目
需求分析:
统计硅谷影音视频网站的常规指标,各种TopN指标:
– 统计视频观看数Top10
– 统计视频类别热度Top10
– 统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
– 统计视频观看数Top50所关联视频的所属类别Rank
– 统计每个类别中的视频热度Top10,以Music为例
– 统计每个类别视频观看数Top10
– 统计上传视频最多的用户Top10以及他们上传的视频观看次数在前20的视频
数据结构:
1、视频表
字段 | 备注 | 详细描述 |
---|---|---|
videoId | 视频唯一id(String) | 11位字符串 |
uploader | 视频上传者(String) | 上传视频的用户名String |
age | 视频年龄(int) | 视频在平台上的整数天 |
category | 视频类别(Array) | 上传视频指定的视频分类 |
length | 视频长度(Int) | 整形数字标识的视频长度 |
views | 观看次数(Int) | 视频被浏览的次数 |
rate | 视频评分(Double) | 满分5分 |
Ratings | 流量(Int) | 视频的流量,整型数字 |
conments | 评论数(Int) | 一个视频的整数评论数 |
relatedId | 相关视频id(Array) | 相关视频的id,最多20个 |
2、用户表
字段 | 备注 | 字段类型 |
---|---|---|
uploader | 上传者用户名 | string |
videos | 上传视频数 | int |
friends | 朋友数量 | int |
1、数据准备
①创建原始表
视频表
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile;
用户表
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile;
②创建orc存储格式带snappy压缩的表
视频表
create table gulivideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
用户表
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
③向ori表中插入数据
load data local inpath "/opt/module/hive/datas/video" into table gulivideo_ori;
load data local inpath "/opt/module/hive/datas/guliVideo/user/user.txt" into table gulivideo_user_ori;
④向orc表中插入数据
insert into table gulivideo_orc select * from gulivideo_ori;
insert into table gulivideo_user_orc select * from gulivideo_user_ori;
2、业务分析
①统计视频观看数Top10
使用order by按照views字段做一个全局排序即可,同时我们设置只显示前10条。
select
videoId,
`views`
from gulivideo_orc
order by `views` desc
limit 10;
②统计视频类别热度Top10
思路:就是统计出每个视频类别中包含的视频数,将数量最多的十个输出
因为一个视频对应一个或者多个类别,所以还要来一个列转行
1)首先进行列转行
select videoId, category_col from gulivideo_ori
lateral view explode(category) t as category_col
limit 10;
2)然后按照类别进行分组,然后计算count()即可,排序输出前10
select
tmp01.category_col,
count(tmp01.videoId) nums
from
(
select videoId, category_col from gulivideo_ori
lateral view explode(category) t as category_col
) tmp01
group by tmp01.category_col
order by nums desc
limit 10;
③统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
1)首先,视频观看数最高的20个视频所属的类别
select
videoId,
views,
category
from gulivideo_ori
order by views desc
limit 20;
2)通过上述的结果,将其列转行
select
category_col,
count(tmp01.videoId) nums
from(
select
videoId,
views,
category
from gulivideo_ori
order by views desc
limit 20
) tmp01
lateral view
explode(tmp01.category) t as category_col
group by category_col
order by nums
limit 10;
④统计视频观看数Top50所关联视频的所属类别排序
1)首先查看视频观看数Top50的视频
select
videoId,
relatedId,
`views`
from gulivideo_orc
order by `views` desc
limit 50
2)将查询结果的列转为行
select
realte_id
from (
select
videoId,
relatedId,
`views`
from gulivideo_orc
order by `views` desc
limit 50
) t1
lateral view
explode(t1.relatedId) tmp as realte_id;
3)上述就得到了所有的视频类型结果,然后找到这些视频的关联类别
select
t2.realte_id,
t2.cate
from (
select
realte_id,
t1.category cate
from (
select
videoId,
relatedId,
category,
`views`
from gulivideo_orc
order by `views` desc
limit 50
) t1
lateral view
explode(t1.relatedId) tmp as realte_id
) t2 join gulivideo_orc g on t2.realte_id = g.videoId;
4)找到了视频对应的类型,还要将相应的视频类型列转行
select
t3.realte_id,
category_name
from (
select
t2.realte_id,
g.category
from (
select
realte_id
from (
select
videoId,
relatedId,
`views`
from gulivideo_orc
order by `views` desc
limit 50
) t1
lateral view
explode(t1.relatedId) tmp as realte_id
) t2 join gulivideo_orc g on t2.realte_id = g.videoId
) t3
lateral view
explode(t3.category) tmp as category_name;
5)按照类别名进行分组,统计每组的数量,进行排序
select
t5.category_name,
t5.num,
rank() over(order by t5.num desc ) rk
from (
select
t4.category_name,
count(t4.realte_id) num
from (
select
t3.realte_id,
category_name
from (
select
t2.realte_id,
g.category
from (
select
realte_id
from (
select
videoId,
relatedId,
`views`
from gulivideo_orc
order by `views` desc
limit 50
) t1
lateral view
explode(t1.relatedId) tmp as realte_id
) t2 join gulivideo_orc g on t2.realte_id = g.videoId
) t3
lateral view
explode(t3.category) tmp as category_name
) t4
group by t4.category_name
) t5 ;
⑤统计类别视频观看数Top10
1)查找出视频的观看数以及所属于的类别
select
videoId,
views,
category
from gulivideo_orc;
2)将类别进行分行
select
videoId,
views,
category_name
from gulivideo_orc
lateral view
explode(category) tmp as category_name;
3)按照类别分组,求和观看数
select
sum(t1.views)
sum_views,
t1.category_name
from (
select
views,
category_name
from gulivideo_orc
lateral view explode(category) tmp as category_name) t1
group by t1.category_name limit 10;
4)排序得到Top10
select
t2.sum_views,
t2.category_name
from(
select
sum(t1.views) sum_views,
t1.category_name
from (
select
views,
category_name
from gulivideo_orc
lateral view explode(category) tmp as category_name) t1
group by t1.category_name
) t2
order by t2.sum_views desc limit 10;
⑥统计每个类别视频观看数Top10
注意,这个和上题的区别是统计每个类别中视频观看数的Top10
使用窗口函数,按照视频类别进行开窗,每个分区内按照视频播放量进行排序,取Top10
1)将视频类型的列转为行
select
views,
category_name
from gulivideo_orc
lateral view explode(category) tmp as category_name;
2)开窗,按照视频类别进行分区,视频播放量进行排序,取Top10
select
t2.views,
t2.category_name,
t2.rk
from(
select
t1.views,
t1.category_name,
rank() over(partition by t1.category_name order by t1.views desc) rk
from(
select
views,
category_name
from gulivideo_orc
lateral view explode(category) tmp as category_name
) t1
) t2
where rk<=10;
⑦统计上传视频最多的用户Top10以及他们上传的视频观看次数在前20的视频
1)首先需要查找上传视频最多的用户Top10
select
uploader
from gulivideo_user_orc
order by videos desc
limit 10;
2)查找出其上传的视频
select
t1.uploader,
g.videoId,
g.`views`
from (
select
uploader
from gulivideo_user_orc
order by videos desc
limit 10
) t1
join gulivideo_orc g on t1.uploader = g.uploader
3)开窗按照uploader分组,观看量进行排序
select
t3.uploader,
t3.videoId,
t3.views,
t3.rk
from(
select
t2.uploader,
t2.videoId,
t2.views,
rank() over(partition by t2.uploader order by t2.views desc) rk
from(
select
t1.uploader,
g.videoId,
g.`views`
from (
select
uploader
from gulivideo_user_orc
order by videos desc
limit 10
) t1
join gulivideo_orc g on t1.uploader = g.uploader
) t2
) t3
where rk<=20;