HQL:统计影音视频网站的常规指标,各种TopN指标

统计影音视频网站的常规指标,各种TopN指标:

– 统计视频观看数Top10

– 统计视频类别热度Top10(类别热度:类别下的总视频数)

– 统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数

– 统计视频观看数Top50所关联视频的所属类别Rank

– 统计每个类别中的视频热度Top10,以Music为例(视频热度:视频观看数)

– 统计每个类别视频观看数Top10

– 统计上传视频最多的用户Top10以及他们上传的视频观看次数在前20的视频

  1. 需要准备的表

创建外部数据表:myvideo_ori,myvideo_user_ori,

创建最终表:myvideo_orc,myvideo_user_orc

  1. 创建外部数据表

(1)上传原始数据到HDFS

[yuwenzhi@hadoop102 datas] pwd

/opt/module/hive/datas

[yuwenzhi@hadoop102 datas] hadoop fs -mkdir -p  /myvideo/video

[yuwenzhi@hadoop102 datas] hadoop fs -mkdir -p  /myvideo/user

[yuwenzhi@hadoop102 datas] hadoop fs -put myvideo/user/user.txt   /myvideo/user

[yuwenzhi@hadoop102 datas] hadoop fs -put myvideo/video/*.txt   /myvideo/video

(2)创建外部数据表:myvideo_ori

create external table myvideo_ori(

    videoId string, 
    uploader string, 
    age int, 
    category array<string>, 
    length int, 
    views int, 

    rate float, 

    ratings int, 

    comments int,

    relatedId array<string>)

row format delimited fields terminated by "\t"

collection items terminated by "&"

stored as textfile

location '/myvideo/video';

(3)创建外部数据表: myvideo_user_ori

create external table myvideo_user_ori(

    uploader string,

    videos int,

    friends int)

row format delimited 

fields terminated by "\t" ,

stored as textfile

location '/myvideo/user';

1) 创建orc存储格式带snappy压缩的管理表:

(1)myvideo_orc

create table myvideo_orc(

    videoId string, 

    uploader string, 

    age int, 

    category array<string>, 

    length int, 

    views int, 

    rate float, 

    ratings int, 

    comments int,

    relatedId array<string>)

stored as orc

tblproperties("orc.compress"="SNAPPY");

(2)myvideo_user_orc

create table myvideo_user_orc(

    uploader string,

    videos int,

    friends int)

row format delimited 

fields terminated by "\t" 

stored as orc

tblproperties("orc.compress"="SNAPPY");

2) 向表中插入数据

(1)向orc表插入数据

insert into table myvideo_orc select * from myvideo_ori;

insert into table myvideo_user_orc select * from myvideo_user_ori;
sql示例
---统计影音视频网站的常规指标,各种TopN指标:
-- 统计视频观看数Top10
SELECT 
t1.videoid,t1.`views` ,rk
FROM 
(SELECT 
videoid ,`views`,
rank() over(order by `views` desc) rk
FROM 
gulivideo_orc)t1
where rk <=10;

set hive.exec.mode.local.auto = true;  --设置为本地模式

-- 统计视频类别热度Top10(类别热度:类别下的总视频数Top10)

SELECT 
t1.category_name ,count(videoid) cnt
FROM 
(SELECT 
category_name,
videoid 
FROM 
gulivideo_orc 
lateral view explode(category) tmp as category_name) t1 
group by t1.category_name 
order by cnt desc
limit 10;


-- 统计出视频观看数最高的20个视频的所属类别以及类别包含Top20视频的个数
SELECT 
category_name ,
count(videoid )  cnt ,
rank() over(order by count(videoid ) DESC ) rk
FROM 
	(
		SELECT 
			videoid ,
			`views` ,
			category 
		FROM 
			gulivideo_orc 
			order by `views` DESC 
			limit 20
	)t1 lateral view explode(category) tmp as category_name
	group by category_name ;

-- 统计视频观看数Top50所关联视频的所属类别Rank(所属类别中视频热度(视频总数)排行)
	SELECT 
		tmp.category_name,
		count(t3.new_relate_id )  cnt,
		rank() over(order by count(t3.new_relate_id) desc) rk
	FROM 
	(
	SELECT 
		t2.new_relate_id,category 
	FROM 
		(SELECT 
			tmp.new_relate_id 
		FROM 
		(
			SELECT 
				videoid ,
				`views` ,
				relatedid 
			FROM 
				gulivideo_orc 
			order by `views` DESC 
			limit 50
		) t1 lateral view explode(relatedid) tmp as new_relate_id)t2
	join gulivideo_orc on t2.new_relate_id = videoid )t3
	lateral view explode(category) tmp as category_name
	group by tmp.category_name;
	

-- 统计每个类别中的视频热度Top10,以Music为例(视频热度:视频观看数)

SELECT 
category_name,
videoid ,
`views`
FROM 
gulivideo_orc 
lateral view explode(category) tmp as category_name
where category_name = 'Music'
order by `views` DESC limit 10;

-- 统计每个类别视频观看数Top10

SELECT 
t1.category_name,
t1.videoid,
t1.`views`,
t1.rk
FROM 
(SELECT 
category_name ,
`views` ,
videoid ,
rank() over(partition by category_name order by `views` DESC ) rk
FROM 
gulivideo_orc 
lateral view explode(category) tmp as category_name) t1 
where rk BETWEEN 1 and 10;

-- 统计上传视频最多的用户Top10以及他们上传的视频观看次数在前20的视频 
SELECT * from gulivideo_user_orc ;

--------1. top10用户的总视频中观看热度在前20的视频----------
SELECT 
t1.uploader, gc.videoid ,gc.`views`
FROM 
(SELECT 
uploader ,videos 
FROM 
gulivideo_user_orc 
order by videos DESC 
limit 10) t1 join gulivideo_orc gc on t1.uploader = gc.uploader
order by gc.`views`  DESC 
limit 20;

--------2. top10用户中每个人的观看热度前20的视频------------
SELECT 
*
FROM 
(SELECT 
t1.uploader, gc.videoid ,gc.`views`,
rank() over(partition by t1.uploader order by gc.`views` DESC ) rk
FROM 
(SELECT 
uploader ,videos 
FROM 
gulivideo_user_orc 
order by videos DESC 
limit 10) t1 join gulivideo_orc gc on t1.uploader = gc.uploader)t2
where rk <= 20;

--------3. top10用户在中所有视频列表中观看热度在前20的视频-------
SELECT
	t2.videoid,
	t2.`views`
FROM
	(
	SELECT
		uploader , videos
	FROM
		gulivideo_user_orc
	order by
		videos DESC
	limit 10) t1
JOIN (
	SELECT
		videoid , uploader , `views`
	FROM
		gulivideo_orc
	order by
		`views` DESC
	LIMIT 20)t2 on
	t1.uploader = t2.uploader;

有关大数据学习资源,请关注微信公众号“码农书斋”。回复“大数据”,免费获取学习视频、源码及资料!

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值