一丶数据清洗
1.由于数据不符合数据分析要求需要改变格式
- 首先过滤残缺的数据,既不合格的数据
- 将分隔符统一
- 统一数据的格式(如多余的空格,会导致分组时出错)
二丶建表导入数据
1.建立原始数据表
create table gulivideo_ori(
videoID string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedID array<string>)
row format delimited
fields terminated by '\t'
collection items terminated by '&';
2.建立用户表
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by '\t'
3.建立orc压缩原始数据表
create table gulivideo_orc(
videoID string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedID array<string>)
row format delimited
fields terminated by '\t'
collection items terminated by '&'
stored as orc;
4.建立orc压缩用户数据表
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by '\t'
stored as orc;
5.向orc(压缩表)导入数据
insert into table gulivideo_orc select * from gulivideo_ori;
insert into table gulivideo_user_orc select * from gulivideo_user_ori;
三丶实现需求
数据结构
原始数据表
字段 | 备注 | 详细描述 |
---|---|---|
videoID | 视频唯一id | 11位字符串 |
uploader | 视频上传者 | 上传视频的用户名String |
age | 视频年龄 | 视频在平台上的整数天 |
category | 视频类别 | 上传视频指定的视频分类 |
length | 视频长度 | 整形数字标识的视频长度 |
views | 观看次数 | 视频被浏览的次数 |
rate | 视频评分 | 满分5分 |
Ratings | 流量 | 视频的流量,整型数字 |
conments | 评论数 | 一个视频的整数评论数 |
relatedid | 相关视频id | 相关视频的id,最多20个 |
用户表
字段 | 备注 | 字段类型 |
---|---|---|
uploader | 上传者用户名 | string |
videos | 上传视频数 | int |
friends | 朋友数量 | int |
需求
–统计视频观看数Top10
select videoID,views from gulivideo_orc order by views desc limit 10;
–统计视频类别热度Top10
①使用UDTF函数炸裂开
select videoID,category_name from
gulivideo_orc lateral view explode(category) tmp_category as category_name t1
②按category_name分组,统计每个种类的总数,然后排序,取前10;
select count(videoID) count_num,category_name from t1 group by category_name order by count_num desc limit 10
③组合sql
select count(videoID) count_num,category_name from (select videoID,category_name from gulivideo_orc lateral view explode(category) tmp_category as category_name) t1
group by category_name order by count_num desc limit 10
–统计视频观看数Top20所属类别以及类别包含的TOP20的视频个数
①获取观看数top20视频的所有类别(不去重,直接统计)
select videoID,views,category from gulivideo_orc order by views desc limit 20 t1
[^]: 注:在hive中使用order by 对某个字段排序而 select 后面没有那个字段可能会报错
②炸裂后分组求count
select category_name,count(*) count_num from t1 lateral view explode(category) tem_category as category_name group by category_name order by count_num desc
③组合sql
select category_name,count(*) count_num from (select videoID,views,category from gulivideo_orc order by views desc limit 20) t1
lateral view explode(category) tem_category as category_name group by category_name order by count_num desc
–统计视频观看数Top5所关联视频的所属类别Rank
①统计观看数top5的所有关联视频,此时得到的是数组形式的
select views,relatedid from gulivideo_orc order by views desc limit 50 t1
②将关联视频字段炸裂,然后去重,得到统计观看数top5的所有关联视频
select related_id from t1 lateral view explode(relatedid) tem_relatedid
as related_id group by related_id t2
③得到统计观看数top5的所有关联视频的类别
select category from t2 join gulivideo_orc orc on t2.related_id=orc.videoID t3
④因为③得到的是数组所以需要再炸裂一次
select explode(category) category_name from t3 t4
⑤然后再分组 排序
select category_name,count(*) category_count from t4 group by category_name order by category_count desc
⑥组合SQL
select category_name,count(*) category_count from (select explode(category) category_name from (select category from (select related_id from (select views,relatedid from gulivideo_orc order by views desc limit 50) t1 lateral view explode(relatedid) tem_relatedid
as related_id group by related_id) t2 join gulivideo_orc orc on t2.related_id=orc.videoID) t3) t4 group by category_name order by category_count desc
–统计每个类别中的视频热度Top10(和最后一个需求一致)
[^]: 注: 某类视频类别个数作为视频热度
select category_name,videoID,views,rk from (select category_name,videoID,views,rank() over(partition by category_name order by views desc) rk from(select videoID,views,category_name from gulivideo_orc lateral view explode(category) tem_category as category_name) t1) t2 where rk <= 10;
–统计每个类别中视频流量Top10(和最后一个需求相似)
select category_name,videoID,Ratings,rk from (select category_name,videoID,Ratings,rank() over(partition by category_name order by Ratings desc) rk from (select videoID,Ratings,category_name from gulivideo_orc lateral view explode(category) tem_category as category_name) t1) t2 where rk <= 10;
–统计上传视频最多的用户Top10以及他们上传的观看次数在前20的视频
①上传视频最多的10个用户
select uploader,videos from gulivideo_user_orc order by videos desc limit 10; t1
③观看次数在前20的视频
select * from gulivideo_orc order by views desc limit 20;t2
④join 两张表
select t2.* from t1 join t2 on t1.uploader=t2.uploader
⑤合并SQL
select t2.* from (select uploader,videos from gulivideo_user_orc order by videos desc limit 10) t1 join (select * from gulivideo_orc order by views desc limit 20) t2 on t1.uploader=t2.uploader
①上传视频最多的10个用户
select uploader,videos from gulivideo_user_orc order by videos desc limit 10; t1
②取出这10个人上传的所有视频,按照观看次数取前20
select video.videoID,video.views from t1 join gulivideo_orc video on t1.uploader=video.uploader order by views desc limit 10
③合并SQL
select video.videoID,video.views from (select uploader,videos from gulivideo_user_orc order by videos desc limit 10) t1 join gulivideo_orc video on t1.uploader=video.uploader order by views desc limit 20
–统计每个类别视频观看数Top10
①因为类别是用数组存储的所以先把类别炸裂出来
select videoID,views,category_name from gulivideo_orc lateral view explode(category) tem_category as category_name t1
②根据类别分组,开窗
select category_name,videoID,views,rank() over(partition by category_name order by views desc) rk from t1 t2
③取每个类别的前10
select category_name,videoID,views,rk from t2 where rk < 10;
④组合SQL
select category_name,videoID,views,rk from (select category_name,videoID,views,rank() over(partition by category_name order by views desc) rk from (select videoID,views,category_name from gulivideo_orc lateral view explode(category) tem_category as category_name) t1) t2 where rk <= 10;