Hive面试题之影评分析

数据和需求的概览

现有如此三份数据:
1、users.dat 数据格式为: 2::M::56::16::70072
对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
对应字段中文解释:用户id,性别,年龄,职业,邮政编码

2、movies.dat 数据格式为: 2::Jumanji (1995)::Adventure|Children’s|Fantasy
对应字段为:MovieID BigInt, Title String, Genres String
对应字段中文解释:电影ID,电影名字,电影类型

3、ratings.dat 数据格式为: 1::1193::5::978300760
对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
对应字段中文解释:用户ID,电影ID,评分,评分时间戳

数据要求:
(1)写shell脚本清洗数据。(hive不支持解析多字节的分隔符,也就是说hive只能解析’:’, 不支持解析’::’,所以用普通方式建表来使用是行不通的,要求对数据做一次简单清洗)
(2)使用Hive能解析的方式进行

Hive要求:
(1)正确建表,导入数据(三张表,三份数据)
(2)求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
(3)分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
(4)求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)
(5)求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
(6)求好片(评分>=4.0)最多的那个年份的最好看的10部电影
(7)求1997年上映的电影中,评分最高的10部Comedy类电影
(8)该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
(9)各年评分最高的电影类型(年份,类型,影评分)
(10)每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)

1. 数据清洗

1.1 users.dat

sed -i "s/::/:/g" users.dat

image-20200827191838024

1.2 movies.dat

sed -i "s/::/:/g" movies.dat

image-20200827191958427

1.3 ratings.dat

sed -i "s/::/:/g" ratings.dat

image-20200827192047369

2. 正确建表,导入数据(三张表,三份数据)

2.1 users

create table users(
userid bigint,
gender String,
age int,
occupation String,
zipcode String
)
row format delimited fields terminated by ':';
load data local inpath '/opt/data/users.dat'  overwrite into  table  moves_test.users

2.2 movies

create table moves_test.movies
(
movieid bigint,
title string,
genres string
)
row format delimited fields terminated by ':';

load data local inpath '/opt/data/movies.dat' overwrite into table moves_test.movies;

2.3 ratings

create table moves_test.ratings
(
userid bigint,
movieid bigint,
rating double,
timestamped  string
)
row format delimited fields terminated by ':';
load data local inpath '/opt/data/ratings.dat' overwrite into table moves_test.ratings;

3. 求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)

3.1 先找出来评分次数最多的10部电影和他评分次数

select movieid ,count(userid) rat_count from moves_test.ratings
group by movieid 
limit 10;

3.2 再和movies表关联拿到电影名称

select title,t1.movieid,rat_count from  (select movieid ,count(userid) rat_count from moves_test.ratings
group by movieid 
limit 10 ) t1
join (select movieid,title from moves_test.movies ) t2
on t1.movieid = t2.movieid;

4. 分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)

4.1 先找出来每个评分的性别

select t1.gender,t2.movieid,t2.rating from moves_test.users  t1
join (select userid ,movieid,rating from moves_test.ratings ) t2
on t1.userid = t2.userid

4.2 使用row_number开窗函数求出来每个电影的排名

select gender,movieid,ct from (select *,row_number() over(partition by gender order by  ct desc ) rk from (
select gender,movieid,sum(rating) ct  from (
select t1.gender,t2.movieid,t2.rating from moves_test.users  t1
join (select userid ,movieid,rating from moves_test.ratings ) t2
on t1.userid = t2.userid) t
group by gender, movieid ) t ) t
where rk <=10

4.3 根据movies拿到电影名称

select t1.*,t2.title from (select gender,movieid,ct from (
select *,row_number() over(partition by gender order by  ct desc ) rk from (
select gender,movieid,sum(rating) ct  from (
select t1.gender,t2.movieid,t2.rating from moves_test.users  t1
join (select userid ,movieid,rating from moves_test.ratings ) t2
on t1.userid = t2.userid) t
group by gender, movieid ) t ) t
where rk <=10 )  t1 join moves_test.movies t2
on t1.movieid = t2.movieid 

5. 求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)

5.1 找出来对2116 进行评分的人

select userid,rating from moves_test.ratings where movieid  = 2116

5.2 和user表进行关联得到最终结果

select age,avg(rating ) avg_rat from (select userid,rating 
from moves_test.ratings where movieid  = 2116 ) r
join (select userid ,age from users ) u
on r.userid = u.userid
group by age

6. 求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)

6.1 找出来最喜欢看电影的那位女性

select userid  from (select r.userid,count(*) ct  from ratings  r
join users u
on r.userid  = u.userid 
where u.gender = 'F'
group by r.userid 
order by ct desc limit 1 ) t1

6.2 找出来这位女性评分最高的十部电影

select userid,movieid from (select userid,movieid ,rating from ratings 
where userid  in (select userid  from (select r.userid,count(*) ct  from ratings  r
join users u
on r.userid  = u.userid 
where u.gender = 'F'
group by r.userid 
order by ct desc limit 1 ) t1 )
order by rating desc 
limit 10
) t2

6.3 找出来这几部电影的平均得分

select t2.userid,t2.movieid,avg(r.rating ) avg_rating from (select userid,movieid ,rating from ratings 
where userid  in (select userid  from (select r.userid,count(*) ct  from ratings  r
join users u
on r.userid  = u.userid 
where u.gender = 'F'
group by r.userid 
order by ct desc limit 1 ) t1 )
order by rating desc 
limit 10
) t2
join ratings r
on r.movieid  = t2.movieid
group by t2.userid,t2.movieid

7. 求好片(评分>=4.0)最多的那个年份的最好看的10部电影

7.1 找出来那个年份

select y from(select from_unixtime(cast(timestamped  as bigint) ,'yyyy') y,count(*) ct
from ratings 
where rating  >=4.0
group by from_unixtime(cast(timestamped  as bigint) ,'yyyy')
order by ct desc 
limit 1) t2

7.2 找出来哪一年的电影,并且根据电影求平均

select movieid ,avg(rating ) avg_rat from ratings where from_unixtime(cast(timestamped  as bigint) ,'yyyy') in  (select y from(select from_unixtime(cast(timestamped  as bigint) ,'yyyy') y,count(*) ct
from ratings 
where rating  >=4.0
group by from_unixtime(cast(timestamped  as bigint) ,'yyyy')
order by ct desc 
limit 1) t2 )
group by movieid  
order by avg_rat desc 
limit 10

8. 求1997年上映的电影中,评分最高的10部Comedy类电影

8.1 找到1997年上映的电影和Comedy类的

select  movieid from movies  where title  like '%1997%' and genres  like '%Comedy%'
group by movieid

8.2 最终结果

select movieid ,avg(rating ) avg_rating from (select r.movieid,r.rating from ratings r
left semi join (select  movieid from movies  where title  like '%1997%' and genres  like '%Comedy%'
group by movieid )t1
on t1.movieid = r.movieid ) t2
group by movieid  
order by avg_rating desc 
limit 10

9. 该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)

9.1 先把类型爆炸开

select movieid ,genre from movies  
lateral view explode(split(genres ,'\\|')) ad as genre

9.2 找出来各个电影的评分

select movieid,avg(rating ) a_rating from ratings 
group by movieid 

9.3 使用开窗求得结果

select movieid,genre,a_rating from (select r.movieid ,genre ,r.a_rating,row_number() over(partition by genre order by a_rating desc   ) rk  from (select movieid ,genre from movies  
lateral view explode(split(genres ,'\\|')) ad as genre ) t1
join (select movieid,avg(rating ) a_rating from ratings 
group by movieid 
) r
on r.movieid = t1.movieid
) t1 
where t1.rk <=5

10. 各年评分最高的电影类型(年份,类型,影评分)

10.1 先把电影类型炸开

select movieid ,genre from movies  
lateral view explode(split(genres ,'\\|')) ad as genre

10.2 把年份获取到然后进行开窗汇总获取第一个

select y,genre,avgs  from (select *,row_number() over(partition by y order by avgs desc ) rk from (select from_unixtime(cast(timestamped  as bigint) ,'yyyy') y,genre ,avg(rating) avgs from ratings  r
join ( select movieid ,genre from movies  
lateral view explode(split(genres ,'\\|')) ad as genre
) m
on m.movieid = r.movieid 
group by from_unixtime(cast(timestamped  as bigint) ,'yyyy'),genre ) t1) t2
where rk = 1

11. 每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)

create table moves_test.move11 as 
select zipcode,movieid,a_rating from (select *,row_number() over(partition by zipcode,movieid order by a_rating desc  ) rk from (select zipcode,r.movieid ,avg(r.rating ) a_rating  from users u
join ratings r
on r.userid  = u.userid 
group by zipcode,r.movieid ) d) d1
where rk =1
  • 0
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值