数据源:https://download.csdn.net/download/qq_41106844/12657640
-- 默认表
create table gulivideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited
fields terminated by "\t"
collection items terminated by "&"
stored as textfile
;
create table gulivideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as textfile
;
-- ORC格式
create table gulivideo_orc(
videoId string, -- 视频ID
uploader string, -- 上传程序
age int, -- 年龄
category array<string>, -- 类别
length int, --长度
views int, --视频观看数
rate float, --偏移值
ratings int, --额定值
comments int, --注释
relatedId array<string>) --相关视频ID
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as orc
;
create table gulivideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
;
-- 导入数据
load data inpath "/gulivideo/output/video/2008/0222" into table gulivideo_ori;
load data inpath "/gulivideo/user/2008/0903" into table gulivideo_user_ori;
-- 数据转存
insert into table gulivideo_orc select * from gulivideo_ori;
insert into table gulivideo_user_orc select * from gulivideo_user_ori;
统计视频观看数 Top10
按照视频观看数降序排序,然后limit分页。
select
videoId,
uploader,
age,
category,
length,
views,
rate,
ratings,
comments
from
gulivideo_orc
order by
views desc
limit 10
;
统计视频类别热度 Top10
- 即统计每个类别有多少个视频,显示出包含视频最多的前 10 个类别。
- 我们需要按照类别 group by 聚合,然后 count 组内的 videoId 个数即可。
- 因为当前表结构为:一个视频对应一个或多个类别。所以如果要 group by 类别,需要先
将类别进行列转行(展开),然后再进行 count 即可。 - 最后按照热度排序,显示前 10 条。
select
category_name as category,
count(t1.videoId) as hot
from (
select
videoId,
category_name
from
gulivideo_orc
lateral view explode(category) t_catetory as category_name
) t1
group by
t1.category_name
order by
hot desc
limit
10
;
统计视频观看数 Top20 所属类别以及类别包含的 Top20 的视频个数
- 先找到观看数最高的 20 个视频所属条目的所有信息,降序排列
- 把这 20 条信息中的 category 分裂出来(列转行)
- 最后查询视频分类名称和该分类下有多少个 Top20 的视频
select
category_name as category,
count(t2.videoId) as hot_with_views
from (
select
videoId,
category_name
from (
select
*
from
gulivideo_orc
order by
views desc
limit
20
) t1
lateral view explode(category) t_catetory as category_name
) t2
group by
category_name
order by
hot_with_views desc
;
统计视频观看数 Top50 所关联视频的所属类别
- 查询视频观看数前50
- 将这50个视频的关联视频列转行
- 将相关视频的 id 和 gulivideo_orc 表进行 inner join 操作
- 按照视频类别进行分组,统计每组视频个数,然后排行
select
category_name as category,
count(t5.videoId) as hot
from (
select
videoId,
category_name
from (
select
distinct(t2.videoId),
t3.category
from (
select
explode(relatedId) as videoId
from (
select
*
from
gulivideo_orc
order by
views desc
limit 50
) t1
) t2
inner join
gulivideo_orc t3 on t2.videoId = t3.videoId
) t4
lateral view explode(category) t_catetory as category_name
) t5
group by
category_name
order by
hot desc
;
统计每个类别中的视频热度 Top10
- 按照视频播放量排序
- 取出前十条
select
videoId,
views
from
gulivideo_orc
lateral view explode(category) catetory as categoryId
where
categoryId = "Music"
order by
views desc
limit 10
;
统计每个类别中视频流量 Top10
- 同上
select
videoId,
views,
ratings
from
gulivideo_orc
lateral view explode(category) catetory as categoryId
where
categoryId = "Music"
order by
ratings desc
limit
10
;
统计上传视频最多的用户 Top10 以及他们上传的观看次数在前 20 视频
- 通过用户表的上传ID与视频ID连接
- 按照视频播放量排序
- 取出前20条
select
t2.videoId,
t2.views,
t2.ratings,
t1.videos,
t1.friends
from (
select
*
from
gulivideo_user_orc
order by
videos desc
limit
10
) t1
join
gulivideo_orc t2 on t1.uploader = t2.uploader
order by
views desc
limit
20
;
统计每个类别视频观看数 Top10
- 使用窗口函数将序号rank添加上去
- 使用rank取前10位
select
t1.*
from (
select
videoId,
categoryId,
views,
row_number() over(partition by categoryId order by views desc) rank
from
gulivideo_orc
lateral view explode(category) catetory as categoryId
) t1
where
rank <= 10
;