统计影音视频网站的常规指标,各种TopN指标:
– 统计视频观看数Top10
– 统计视频类别热度Top10(类别热度:类别下的总视频数)
– 统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
– 统计视频观看数Top50所关联视频的所属类别Rank
– 统计每个类别中的视频热度Top10,以Music为例(视频热度:视频观看数)
– 统计每个类别视频观看数Top10
– 统计上传视频最多的用户Top10以及他们上传的视频观看次数在前20的视频
- 需要准备的表
创建外部数据表:myvideo_ori,myvideo_user_ori,
创建最终表:myvideo_orc,myvideo_user_orc
- 创建外部数据表
(1)上传原始数据到HDFS
[yuwenzhi@hadoop102 datas] pwd
/opt/module/hive/datas
[yuwenzhi@hadoop102 datas] hadoop fs -mkdir -p /myvideo/video
[yuwenzhi@hadoop102 datas] hadoop fs -mkdir -p /myvideo/user
[yuwenzhi@hadoop102 datas] hadoop fs -put myvideo/user/user.txt /myvideo/user
[yuwenzhi@hadoop102 datas] hadoop fs -put myvideo/video/*.txt /myvideo/video
(2)创建外部数据表:myvideo_ori
create external table myvideo_ori(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
row format delimited fields terminated by "\t"
collection items terminated by "&"
stored as textfile
location '/myvideo/video';
(3)创建外部数据表: myvideo_user_ori
create external table myvideo_user_ori(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t" ,
stored as textfile
location '/myvideo/user';
1) 创建orc存储格式带snappy压缩的管理表:
(1)myvideo_orc
create table myvideo_orc(
videoId string,
uploader string,
age int,
category array<string>,
length int,
views int,
rate float,
ratings int,
comments int,
relatedId array<string>)
stored as orc
tblproperties("orc.compress"="SNAPPY");
(2)myvideo_user_orc
create table myvideo_user_orc(
uploader string,
videos int,
friends int)
row format delimited
fields terminated by "\t"
stored as orc
tblproperties("orc.compress"="SNAPPY");
2) 向表中插入数据
(1)向orc表插入数据
insert into table myvideo_orc select * from myvideo_ori;
insert into table myvideo_user_orc select * from myvideo_user_ori;
sql示例
---统计影音视频网站的常规指标,各种TopN指标:
-- 统计视频观看数Top10
SELECT
t1.videoid,t1.`views` ,rk
FROM
(SELECT
videoid ,`views`,
rank() over(order by `views` desc) rk
FROM
gulivideo_orc)t1
where rk <=10;
set hive.exec.mode.local.auto = true; --设置为本地模式
-- 统计视频类别热度Top10(类别热度:类别下的总视频数Top10)
SELECT
t1.category_name ,count(videoid) cnt
FROM
(SELECT
category_name,
videoid
FROM
gulivideo_orc
lateral view explode(category) tmp as category_name) t1
group by t1.category_name
order by cnt desc
limit 10;
-- 统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
SELECT
category_name ,
count(videoid ) cnt ,
rank() over(order by count(videoid ) DESC ) rk
FROM
(
SELECT
videoid ,
`views` ,
category
FROM
gulivideo_orc
order by `views` DESC
limit 20
)t1 lateral view explode(category) tmp as category_name
group by category_name ;
-- 统计视频观看数Top50所关联视频的所属类别Rank(所属类别中视频热度(视频总数)排行)
SELECT
tmp.category_name,
count(t3.new_relate_id ) cnt,
rank() over(order by count(t3.new_relate_id) desc) rk
FROM
(
SELECT
t2.new_relate_id,category
FROM
(SELECT
tmp.new_relate_id
FROM
(
SELECT
videoid ,
`views` ,
relatedid
FROM
gulivideo_orc
order by `views` DESC
limit 50
) t1 lateral view explode(relatedid) tmp as new_relate_id)t2
join gulivideo_orc on t2.new_relate_id = videoid )t3
lateral view explode(category) tmp as category_name
group by tmp.category_name;
-- 统计每个类别中的视频热度Top10,以Music为例(视频热度:视频观看数)
SELECT
category_name,
videoid ,
`views`
FROM
gulivideo_orc
lateral view explode(category) tmp as category_name
where category_name = 'Music'
order by `views` DESC limit 10;
-- 统计每个类别视频观看数Top10
SELECT
t1.category_name,
t1.videoid,
t1.`views`,
t1.rk
FROM
(SELECT
category_name ,
`views` ,
videoid ,
rank() over(partition by category_name order by `views` DESC ) rk
FROM
gulivideo_orc
lateral view explode(category) tmp as category_name) t1
where rk BETWEEN 1 and 10;
-- 统计上传视频最多的用户Top10以及他们上传的视频观看次数在前20的视频
SELECT * from gulivideo_user_orc ;
--------1. top10用户的总视频中观看热度在前20的视频----------
SELECT
t1.uploader, gc.videoid ,gc.`views`
FROM
(SELECT
uploader ,videos
FROM
gulivideo_user_orc
order by videos DESC
limit 10) t1 join gulivideo_orc gc on t1.uploader = gc.uploader
order by gc.`views` DESC
limit 20;
--------2. top10用户中每个人的观看热度前20的视频------------
SELECT
*
FROM
(SELECT
t1.uploader, gc.videoid ,gc.`views`,
rank() over(partition by t1.uploader order by gc.`views` DESC ) rk
FROM
(SELECT
uploader ,videos
FROM
gulivideo_user_orc
order by videos DESC
limit 10) t1 join gulivideo_orc gc on t1.uploader = gc.uploader)t2
where rk <= 20;
--------3. top10用户在中所有视频列表中观看热度在前20的视频-------
SELECT
t2.videoid,
t2.`views`
FROM
(
SELECT
uploader , videos
FROM
gulivideo_user_orc
order by
videos DESC
limit 10) t1
JOIN (
SELECT
videoid , uploader , `views`
FROM
gulivideo_orc
order by
`views` DESC
LIMIT 20)t2 on
t1.uploader = t2.uploader;
有关大数据学习资源,请关注微信公众号“码农书斋”。回复“大数据”,免费获取学习视频、源码及资料!