一、数据准备
1、users.dat
数据格式:2::M::56::16::70072,
数据条数:共有6040条数据
对应字段:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
字段解释:用户id,性别,年龄,职业,邮政编码
2、movies.dat
数据格式:2::Jumanji (1995)::Adventure|Children’s|Fantasy,
数据条数:共有3883条数据
对应字段:MovieID BigInt, Title String, Genres String
字段解释:电影ID,电影名字,电影类型
3、ratings.dat
数据格式:1::1193::5::978300760,
数据条数:共有1000209条数据
对应字段:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
字段解释:用户ID,电影ID,评分,评分时间戳
二、需求分析
1、正确建表,导入数据(三张表,三份数据),并验证是否正确
(1)创建表
create table t_user(
userid bigint, --用户id
gender string, --性别
age int, --年龄
occupation string, --职业
zipcode string) --邮政编码
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;
create table t_movie(
movieid bigint, --电影ID
title string, --电影名字
genres string) --电影类型
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;
create table t_rating(
userid bigint, --用户ID
movieid bigint, --电影ID
rating double, --评分
timestamped String) --评分时间戳
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;
(2)导入数据
load data local inpath '/usr/mydir/data/users.dat' into table t_user;
load data local inpath '/usr/mydir/data/movies.dat' into table t_movie;
load data local inpath '/usr/mydir/data/ratings.dat' into table t_rating;
(3)验证数据
select * from t_user;
select count(1) from t_user ; --6040条数据
select * from t_movie;
select count(1) from t_movie ; --3883条数据
select * from t_rating;
select count(1) from t_rating ; --1000209条数据
2、求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
SELECT
title,
rating_count
FROM
(SELECT
movieid,
COUNT(rating) rating_count
from t_rating
group by movieid
) t1
left join t_movie t2
on t1.movieid = t2.movieid
order by rating_count DESC
LIMIT 10;
3、分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分),评分人数大于等于50
SELECT * from
(SELECT
'M' as gender,
title,
AVG(rating) avg_rating,
count(rating) count_rating
from
(SELECT --三张表关联,拿到性别,电影名,影评分
t2.gender,
t3.title,
t1.rating
from t_rating t1
left join t_user t2 on t1.userid = t2.userid
left join t_movie t3 on t1.movieid = t3.movieid
where t2.gender = 'M'
) t4
group by title
HAVING count_rating >= 50
order by avg_rating DESC
LIMIT 10
) a
UNION ALL
SELECT * from
(SELECT
'F' as gender,
title,
AVG(rating) avg_rating,
count(rating) count_rating
from
(SELECT --三张表关联,拿到性别,电影名,影评分
t2.gender,
t3.title,
t1.rating
from t_rating t1
left join t_user t2 on t1.userid = t2.userid
left join t_movie t3 on t1.movieid = t3.movieid
where t2.gender = 'F'
) t4
group by title
HAVING count_rating >= 50
order by avg_rating DESC
LIMIT 10
) b
4、求好片(评分>=4.0)最多的那个年份的最好看的10部电影
SELECT
title,
avg_rating
from
(SELECT
t1.title,
SUBSTRING(t1.title,-5,4) year,
avg(t2.rating) avg_rating
from t_movie t1
left join t_rating t2
on t1.movieid = t2.movieid
where t2.rating >= 4
group by t1.title
) t4
where year IN
(SELECT year
from
(SELECT
year,
COUNT(year) count_year
from
(SELECT
t1.title,
SUBSTRING(t1.title,-5,4) year
from t_movie t1
left join t_rating t2
on t1.movieid = t2.movieid
where t2.rating >= 4
group by t1.title
) t3
group by year
order by count_year DESC
LIMIT 1) t5
)
order by avg_rating DESC
limit 10;
5、求1997年上映的电影中,评分最高的10部Comedy类电影
SELECT
title,
genres,
AVG(rating) avg_rating
from
(SELECT
t1.title,
substr(t1.title,-5,4) year,
t1.genres,
t2.rating
from t_movie t1
left join t_rating t2
on t1.movieid = t2.movieid
) t3
where year = '1997' and genres LIKE '%Comedy%'
group by title,genres
order by avg_rating DESC
LIMIT 10;
6、该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
SELECT
type,
title
FROM
(SELECT
title,
type,
ROW_NUMBER() over(partition by type order by avg_rating DESC) rn
from
(SELECT
title,
type,
AVG(rating) avg_rating
from
(select
t2.genres,
t2.title,
t1.rating
from t_rating t1
left join t_movie t2
on t1.movieid =t2.movieid
) t3
lateral view explode(split(t3.genres,"\\|")) tmp as type
group by type,title
) t4
)t5
where rn <= 5;
7、各年评分最高的电影类型(年份,类型,影评分)
with tmp as
(SELECT
title,
type,
rating,
movie_year
from
(select
t2.genres,
t2.title,
t1.rating,
SUBSTRING(t2.title,-5,4) as movie_year
from t_rating t1
left join t_movie t2
on t1.movieid =t2.movieid
) t3
lateral view explode(split(t3.genres,"\\|")) tmp as type
)
select
movie_year,
type,
avg_rating
from
(select
movie_year,
type,
avg_rating,
ROW_NUMBER() over(partition by movie_year order by avg_rating DESC) rn
FROM
(select
movie_year,
type,
AVG(rating) avg_rating
FROM tmp
group by movie_year,type
) t1
) t2
where rn = 1;