Hive SQL 影评案例练习

一、数据准备

1、users.dat
数据格式:2::M::56::16::70072,
数据条数:共有6040条数据
对应字段:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
字段解释:用户id,性别,年龄,职业,邮政编码

2、movies.dat
数据格式:2::Jumanji (1995)::Adventure|Children’s|Fantasy,
数据条数:共有3883条数据
对应字段:MovieID BigInt, Title String, Genres String
字段解释:电影ID,电影名字,电影类型

3、ratings.dat
数据格式:1::1193::5::978300760,
数据条数:共有1000209条数据
对应字段:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
字段解释:用户ID,电影ID,评分,评分时间戳

二、需求分析

1、正确建表,导入数据(三张表,三份数据),并验证是否正确

(1)创建表

create table t_user(
	userid bigint,	    --用户id
	gender string,	    --性别
	age int,	        --年龄
	occupation string,	--职业
	zipcode string) 	--邮政编码
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;

create table t_movie(
	movieid bigint,	    --电影ID
	title string,	    --电影名字
	genres string) 	    --电影类型
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;

create table t_rating(
	userid bigint,	    --用户ID
	movieid bigint,	    --电影ID
	rating double,	    --评分
	timestamped String) --评分时间戳
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe' 
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;

(2)导入数据

load data local inpath '/usr/mydir/data/users.dat' into table t_user;
load data local inpath '/usr/mydir/data/movies.dat' into table t_movie;
load data local inpath '/usr/mydir/data/ratings.dat' into table t_rating;

(3)验证数据

select * from t_user;
select count(1) from t_user ;	--6040条数据

select * from t_movie;
select count(1) from t_movie ;	--3883条数据

select * from t_rating;
select count(1) from t_rating ;	--1000209条数据

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
2、求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)

SELECT 
	title,
	rating_count
FROM 
	(SELECT 
		movieid,
		COUNT(rating) rating_count
	from t_rating 
	group by movieid
	) t1
left join t_movie t2
on t1.movieid = t2.movieid
order by rating_count DESC 
LIMIT 10;

在这里插入图片描述
3、分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分),评分人数大于等于50

SELECT * from
	(SELECT 
		'M' as gender,
		title,
		AVG(rating) avg_rating,
		count(rating) count_rating
	from 
		(SELECT 	--三张表关联,拿到性别,电影名,影评分
			t2.gender,
			t3.title,
			t1.rating
		from t_rating t1
		left join t_user t2 on t1.userid = t2.userid 
		left join t_movie t3 on t1.movieid = t3.movieid
		where t2.gender = 'M' 
		) t4
	group by title
	HAVING count_rating >= 50
	order by avg_rating DESC 
	LIMIT 10
	) a
UNION ALL 
SELECT * from 
	(SELECT 
		'F' as gender,
		title,
		AVG(rating) avg_rating,
		count(rating) count_rating
	from 
		(SELECT 	--三张表关联,拿到性别,电影名,影评分
			t2.gender,
			t3.title,
			t1.rating
		from t_rating t1
		left join t_user t2 on t1.userid = t2.userid 
		left join t_movie t3 on t1.movieid = t3.movieid
		where t2.gender = 'F' 
		) t4
	group by title
	HAVING count_rating >= 50
	order by avg_rating DESC 
	LIMIT 10
	) b

在这里插入图片描述
4、求好片(评分>=4.0)最多的那个年份的最好看的10部电影

SELECT 
	title,
	avg_rating
from 
	(SELECT 
		t1.title,
		SUBSTRING(t1.title,-5,4) year,
		avg(t2.rating) avg_rating
	from t_movie t1
	left join t_rating t2
	on t1.movieid = t2.movieid
	where t2.rating >= 4
	group by t1.title
	) t4
where year IN 
	(SELECT year 
	from
		(SELECT 
			year,
			COUNT(year) count_year
		from
			(SELECT 
				t1.title,
				SUBSTRING(t1.title,-5,4) year
			from t_movie t1
			left join t_rating t2
			on t1.movieid = t2.movieid
			where t2.rating >= 4
			group by t1.title
			) t3
		group by year 
		order by count_year DESC 
		LIMIT 1) t5
	)
order by avg_rating DESC 
limit 10;

在这里插入图片描述
5、求1997年上映的电影中,评分最高的10部Comedy类电影

SELECT 
	title,
	genres,
	AVG(rating) avg_rating
from
	(SELECT 
		t1.title,
		substr(t1.title,-5,4) year,
		t1.genres,
		t2.rating
	from t_movie t1
	left join t_rating t2
	on t1.movieid = t2.movieid
	) t3
where year = '1997' and genres LIKE '%Comedy%'
group by title,genres
order by avg_rating DESC 
LIMIT 10;

在这里插入图片描述
6、该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)

SELECT 
	type,
	title
FROM 
	(SELECT 
		title,
		type,
		ROW_NUMBER() over(partition by type order by avg_rating DESC) rn
	from
		(SELECT 
			title,
			type,
			AVG(rating) avg_rating
		from 
			(select
				t2.genres,
				t2.title,
				t1.rating
			from t_rating t1
			left join t_movie t2 
			on t1.movieid =t2.movieid
			) t3
		lateral view explode(split(t3.genres,"\\|")) tmp as type
		group by type,title
		) t4
	)t5
where rn <= 5;

在这里插入图片描述
7、各年评分最高的电影类型(年份,类型,影评分)

with tmp as
(SELECT 
	title,
	type,			
	rating,
	movie_year
from 
	(select
		t2.genres,
		t2.title,
		t1.rating,
		SUBSTRING(t2.title,-5,4) as movie_year 
	from t_rating t1
	left join t_movie t2 
	on t1.movieid =t2.movieid
	) t3
lateral view explode(split(t3.genres,"\\|")) tmp as type
)
	
select 
	movie_year,
	type,
	avg_rating 
from
	(select
		movie_year,
		type,
		avg_rating, 
		ROW_NUMBER() over(partition by movie_year order by avg_rating DESC) rn
	FROM 
		(select
			movie_year,
			type,
			AVG(rating) avg_rating 
		FROM tmp
		group by movie_year,type
		) t1
	) t2
where rn = 1;

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值