Hive 案例(1)--- 影评案例

影评案例---现有如此三份数据:
1、users.dat    数据格式为:  2::M::56::16::70072
对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
对应字段中文解释:用户id,性别,年龄,职业,邮政编码

2、movies.dat        数据格式为: 2::Jumanji (1995)::Adventure|Children's|Fantasy
对应字段为:MovieID BigInt, Title String, Genres String
对应字段中文解释:电影ID,电影名字,电影类型

3、ratings.dat        数据格式为:  1::1193::5::978300760
对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
对应字段中文解释:用户ID,电影ID,评分,评分时间戳

案例需求:

1.数据处理要求:
(1)使用Hive能解析的方式进行

create database demodb04;

create table users(UserID BigInt,Gender String,Age Int,Occupation String,Zipcode String)row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;

create table movies(MovieID BigInt,Title String,Genres String)row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;

create table ratings(UserID BigInt,MovieID BigInt,Rating Double,Timestamped String)row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;

load data local inpath '/home/hdp/demo/film/users.dat' into table users;
load data local inpath '/home/hdp/demo/film/movies.dat' into table movies;
load data local inpath '/home/hdp/demo/film/ratings.dat' into table ratings;

select * from users limit 10;
select * from movies limit 10;
select * from ratings limit 10;

(2)写shell脚本清洗数据。(hive不支持解析多字节的分隔符,也就是说hive只能解析':', 不支持解析'::',所以用普通方式建表来使用是行不通的,要求对数据做一次简单清洗)

sed 's/::/%/g' users.dat >> newusers.dat
sed 's/::/%/g' movies.dat >> newmovies.dat
sed 's/::/%/g' ratings.dat >> newratings.dat

create table users(UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String)
row format delimited fields terminated by '%' stored as textfile;

create table movies(MovieID BigInt, Title String, Genres String) 
row format delimited fields terminated by '%' stored as textfile;

create table ratings(UserID BigInt, MovieID BigInt, Rating Double, Timestamped String) 
row format delimited fields terminated by '%' stored as textfile;

load data local inpath '/home/hdp/demo/film/newusers.dat' into table users;
load data local inpath '/home/hdp/demo/film/newmovies.dat' into table movies;
load data local inpath '/home/hdp/demo/film/newratings.dat' into table ratings;

select * from users limit 10;
select * from movies limit 10;
select * from ratings limit 10;

2.Hive要求:

1、users :对应字段为:UserID 用户id, Gender 性别, Age 年龄, Occupation 职业, Zipcode 邮政编码
2、movies:对应字段为:MovieID 电影ID, Title 电影名字, Genres 电影类型
3、ratings : 对应字段为:UserID 用户ID, MovieID 电影ID, Rating 评分, Timestamped 评分时间戳

(1)求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
(2)分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
(3)求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)
(4)求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
(5)求好片(评分>=4.0)最多的那个年份的最好看的10部电影
(6)求1997年上映的电影中,评分最高的10部Comedy类电影
(7)该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
(8)各年评分最高的电影类型(年份,类型,影评分)
(9)每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)

--(1)求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
create table answer01 as
 select r.movieid,m.title, count(r.rating) total 
 from ratings r join movies m on r.movieid=m.movieid
 group by r.movieid,m.title 
 order by total desc
 limit 10;

-- (2)分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
create table answer02_F as 
select u.gender,m.title,round(avg(r.rating),3) rating 
from ratings r  
join movies m on r.movieid=m.movieid 
join users u on r.userid=u.userid 
where u.gender='F' 
group by u.gender,m.title 
order by rating desc
limit 10;

create table answer02_M as 
select u.gender,m.title,round(avg(r.rating),3) rating 
from ratings r  
join movies m on r.movieid=m.movieid 
join users u on r.userid=u.userid 
where u.gender='M' 
group by u.gender,m.title 
order by rating desc
limit 10;

-- (3)求movieid = 2116这部电影各年龄段((0,20),[20,40),[40,60),[60,+∞))的平均影评(年龄段,影评分)
    -- 先查查有哪些年龄
    select distinct age from users;
    -- 结果:1、18、25、35、45、50、56
    
    -- 年龄段就按照这7个值划分的查询
select users.age age,avg(ratings.rating) rating from ratings
 join users on ratings.userid=users.userid
 group by users.age 
 order by age;

    --分段划分
create table answer03 as 
select 
 if(age>=60,'[60,+∞)',if(age>=40,'[40,60)',if(age>=20,'[20,40)','(0,20)'))) age,
 avg(rating) rating
 from ratings join users on ratings.userid=users.userid
 group by if(age>=60,'[60,+∞)',if(age>=40,'[40,60)',if(age>=20,'[20,40)','(0,20)')))
 order by age;

-- (4)求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的总的平均影评分(观影者,电影名,影评分)
    -- ①求最喜欢看电影(影评次数最多)的那位女性的id
create view answer_view04 as
select u.userid, count(r.rating) rating 
from ratings r 
join users u on r.userid=u.userid 
join movies m on r.movieid=m.movieid 
where u.gender='M' 
group by u.userid 
order by rating desc limit 1;

    -- ②求她评分最高的10部片子
create view answer_view04_top10 as
select 
userid,movieid, avg(rating) rating 
from ratings 
where ratings.userid in (select userid from answer_view04) 
group by userid,movieid 
order by rating desc 
limit 10;

    -- ③她评分最高的10部电影的平均影评分
create table answer04 as 
select ratings.movieid movieid,title,avg(rating) rating 
from movies join ratings on movies.movieid=ratings.movieid 
where ratings.movieid in (select movieid from answer_view04_top10) 
group by ratings.movieid,title;


-- (5)求好片(平均评分>=4.0)最多的那个年份的最好看的10部电影
    -- ①在title中截取年份
select substr("jfidjidji(1991)",length('jfidjidji(1991)')-4,4); -- 测试
    -- ② 算出每部电影的平均评分,并截取出年份
create table answer05_1 as
select 
substr(m.title,length(m.title)-4,4) year,
m.movieid movieid,
m.title title,
m.genres genres,
avg(r.rating) rating
from movies m
join ratings r on r.movieid=m.movieid
group by substr(m.title,length(m.title)-4,4),m.movieid,m.title,m.genres;
    -- ③ 求好片最多的年份
select a.year from 
(select year,count(*) total
from answer05_1 where rating>=4.0 
group by year order by total desc limit 1) a;

    -- ④计算指定年份中最好看的10部电影
create table answer05_2 as 
select * from answer05_1
where year in (
    select a.year from 
    (select year,count(*) total
    from answer05_1 where rating>=4.0 
    group by year order by total desc limit 1) a
) order by rating desc limit 10;

-- (6)求1997年上映的电影中,评分最高的10部Comedy类电影
    -- ①查询1997年上映的电影及评分
create table answer06_1 as
select 
substr(m.title,length(m.title)-4,4) year,
m.movieid movieid,
m.title title,
m.genres genras,
avg(r.rating) rating
from movies m join ratings r on m.movieid=r.movieid
group by substr(m.title,length(m.title)-4,4),m.movieid ,m.title,m.genres
having substr(m.title,length(m.title)-4,4)=1997
order by rating desc;
    -- ②求1997年上映的电影中,评分最高的10部Comedy类电影
create table answer06_2 as
select 
substr(m.title,length(m.title)-4,4) year,
m.movieid movieid,
m.title title,
m.genres genras,
if(m.genres like '%Comedy%',1,0) Comedy,
avg(r.rating) rating
from movies m join ratings r on m.movieid=r.movieid
group by substr(m.title,length(m.title)-4,4),m.movieid ,m.title,m.genres
having substr(m.title,length(m.title)-4,4)=1997 and if(m.genres like '%Comedy%',1,0)=1 
order by rating desc limit 10;

-- (7)该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
    -- ①将电影类型拆分出来在一列中分别显示
create table answer07_1 as 
select  
m.movieid movieid,m.title title,
t.movietype movietype
from movies m lateral view explode(split(genres,'\\|')) t as movietype;
    --②连接评分,并进行组内排序
create table answer07_2 as 
select 
a.movieid movieid,a.title title,a.movietype movietype,
avg(r.rating) rating,
row_number() over(partition by a.movietype order by avg(r.rating) desc) rank
from answer07_1 a left join ratings r on a.movieid=r.movieid
group by a.movietype,a.movieid,a.title;
    -- ③取出各种类型前5的电影
create table answer07_3 as 
select movietype,movieid,title,rating 
from answer07_2 where rank<=5;

-- (8)各年评分前3的电影类型(年份,类型,影评分)
    -- ①先提取电影年份year 和电影的各个类型movietype
create table answer08_1 as 
select 
m.movieid movieid,
substr(m.title,length(m.title)-4,4) year,
t.movietype movietype,
m.title title  
from movies m 
lateral view explode(split(genres,"\\|")) t as movietype;
    -- ②连接评分表,并计算每年的每种类型的电影评分排名
create table answer08_2 as 
select  
a.year year, a.movietype movietype, avg(r.rating) rating,
rank() over(partition by a.year order by avg(r.rating) desc) rank 
from answer08_1 a left join ratings r on a.movieid=r.movieid 
group by a.year,a.movietype;
    -- ③ 取出组内前3
create table answer08_3 as 
select * from answer08_2 where rank<=3;

-- (9)每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)
    --①计算每个地区的每部电影的评分,每个地区的评分排名
create table answer09_1 as 
select 
u.zipcode zipcode,
m.title title,
avg(r.rating) rating,
rank() over(partition by u.zipcode order by avg(r.rating) desc) rank 
from ratings r 
join users u on r.userid=u.userid 
join movies m on m.movieid=r.movieid
group by u.zipcode,m.title ;
    --②取出各个地区的第一名
create table answer09_2 as 
select * from  answer09_1 where rank=1;

 

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值