数据准备
数据下载链接:
https://github.com/qianhonglinIT/HiveMovieCaseStudy
在数据库中创建3张表,t_user,t_movie,t_rating
create table t_user(
userid bigint,
sex string,
age int,
occupation string,
zipcode string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;
create table t_movie(
movieid bigint,
moviename string,
movietype string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;
create table t_rating(
userid bigint,
movieid bigint,
rate double,
times string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;
练习
(1)求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
按照电影名进行分组,计算每组的评论次数,并且降序排列,取前10条。
create table question1 as select m.moviename,count(m.moviename) as total from t_movie m join t_rating r on m.movieid=r.movieid group by m.moviename order by total desc limit 10;
0: jdbc:hive2://Hbase:10000> select * from question1;
+--------------------------------------------------------+------------------+--+
| question1.moviename | question1.total |
+--------------------------------------------------------+------------------+--+
| American Beauty (1999) | 3428 |
| Star Wars: Episode IV - A New Hope (1977) | 2991 |
| Star Wars: Episode V - The Empire Strikes Back (1980) | 2990 |
| Star Wars: Episode VI - Return of the Jedi (1983) | 2883 |
| Jurassic Park (1993) | 2672 |
| Saving Private Ryan (1998) | 2653 |
| Terminator 2: Judgment Day (1991) | 2649 |
| Matrix, The (1999) | 2590 |
| Back to the Future (1985) | 2583 |
| Silence of the Lambs, The (1991) | 2578 |
+--------------------------------------------------------+------------------+--+
10 rows selected (0.147 seconds)
2)分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)
三表联合查询,按照性别过滤条件,电影名作为分组条件,影评分作为排序条件进行查询
create table question2 as select "F" as sex,m.moviename,count(m.moviename) as total from t_rating r join t_user u on r.userid=u.userid join t_movie m on r.movieid=m.movieid where sex="F" group by m.moviename order by total desc limit 10;
0: jdbc:hive2://Hbase:10000> select * from question2;
+----------------+--------------------------------------------------------+------------------+--+
| question2.sex | question2.moviename | question2.total |
+----------------+--------------------------------------------------------+------------------+--+
| F | American Beauty (1999) | 946 |
| F | Shakespeare in Love (1998) | 798 |
| F | Silence of the Lambs, The (1991) | 706 |
| F | Sixth Sense, The (1999) | 664 |
| F | Groundhog Day (1993) | 658 |
| F | Fargo (1996) | 657 |
| F | Star Wars: Episode VI - Return of the Jedi (1983) | 653 |
| F | Star Wars: Episode V - The Empire Strikes Back (1980) | 648 |
| F | Star Wars: Episode IV - A New Hope (1977) | 647 |
| F | Forrest Gump (1994) | 644 |
+----------------+--------------------------------------------------------+------------------+--+
10 rows selected (0.177 seconds)
(3)求movieid = 2116这部电影各年龄段的平均影评(年龄段,影评分)
先以movieid = 2116为过滤条件,再按年龄分组,进行查询
create table question3 as select u.age,avg(r.rate) rate from t_rating r join t_user u on r.userid=u.userid where r.movieid=2116 group by u.age;
0: jdbc:hive2://Hbase:10000> select * from question3;
+----------------+---------------------+--+
| question3.age | question3.rate |
+----------------+---------------------+--+
| 1 | 3.2941176470588234 |
| 18 | 3.3580246913580245 |
| 25 | 3.436548223350254 |
| 35 | 3.2278481012658227 |
| 45 | 2.8275862068965516 |
| 50 | 3.32 |
| 56 | 3.5 |
+----------------+---------------------+--+
7 rows selected (0.134 seconds)
(4)求最喜欢看电影(影评次数最多)的那位女性评分最高的10部电影的影评分(观影者,电影名,影评分)
分析:
1)首先求出最喜欢看电影的女性
select u.userid,count(u.userid) from t_rating r join t_user u on r.userid=u.userid where u.sex="F" group by u.userid limit 1;
+-----------+--------+--+
| u.userid | total |
+-----------+--------+--+
| 1150 | 1302 |
+-----------+--------+--+
1 row selected (113.353 seconds)
查询用户id为1150评分最高的10部电影
select r.userid,m.moviename,r.rate from t_rating r join t_movie m on r.movieid=m.movieid where r.userid=1150 order by r.rate desc limit 10;
+-----------+------------------------------+---------+--+
| r.userid | m.moviename | r.rate |
+-----------+------------------------------+---------+--+
| 1150 | His Girl Friday (1940) | 5.0 |
| 1150 | Blazing Saddles (1974) | 5.0 |
| 1150 | City Lights (1931) | 5.0 |
| 1150 | Annie Hall (1977) | 5.0 |
| 1150 | Rear Window (1954) | 5.0 |
| 1150 | Crumb (1994) | 5.0 |
| 1150 | White Christmas (1954) | 5.0 |
| 1150 | Metropolitan (1990) | 5.0 |
| 1150 | Topsy-Turvy (1999) | 5.0 |
| 1150 | Hands on a Hard Body (1996) | 5.0 |
+-----------+------------------------------+---------+--+
10 rows selected (66.002 seconds)
求出2)中10部电影的平均影评分
create table question3_2 as select m.moviename,avg(r.rate) as avgrate from question3_1 q join t_movie m on q.moviename=m.moviename join t_rating r on m.movieid=r.movieid group by m.moviename;
0: jdbc:hive2://Hbase:10000> select * from question3_2;
+------------------------------+----------------------+--+
| question3_2.moviename | question3_2.avgrate |
+------------------------------+----------------------+--+
| Annie Hall (1977) | 4.14167916041979 |
| Blazing Saddles (1974) | 4.047363717605005 |
| City Lights (1931) | 4.387453874538745 |
| Crumb (1994) | 4.063136456211812 |
| Hands on a Hard Body (1996) | 4.163043478260869 |
| His Girl Friday (1940) | 4.249370277078086 |
| Metropolitan (1990) | 3.6464646464646466 |
| Rear Window (1954) | 4.476190476190476 |
| Topsy-Turvy (1999) | 3.7039473684210527 |
| White Christmas (1954) | 3.8265682656826567 |
+------------------------------+----------------------+--+
10 rows selected (0.126 seconds)
(5)求好片(评分>=4.0)最多的那个年份的最好看的10部电影
分析:
1)需要将t_rating和t_movie表进行联合查询,将电影名当中的上映年份截取出来,保存到临时表question5_1中
create table question5_1 as select m.movieid as movieid, m.moviename as moviename,substr(m.moviename,-5,4) as years,avg(r.rate) as avgrate from t_rating r join t_movie m on r.movieid=m.movieid group by m.movieid,m.moviename;
+----------------------+-------------------------------------------------------------------------------------+--------------------+----------------------+--+
| 3947 | Get Carter (1971) | 1971 | 3.4727272727272727 |
| 3948 | Meet the Parents (2000) | 2000 | 3.6357308584686776 |
| 3949 | Requiem for a Dream (2000) | 2000 | 4.115131578947368 |
| 3950 | Tigerland (2000) | 2000 | 3.6666666666666665 |
| 3951 | Two Family House (2000) | 2000 | 3.9 |
| 3952 | Contender, The (2000) | 2000 | 3.7809278350515463 |
+----------------------+-------------------------------------------------------------------------------------+--------------------+----------------------+--+
3,706 rows selected (0.894 seconds)
2)求拥有好片最多的那一年。
create table question5_2 as select years,count(years) as cot from question5_1 where avgrate > 4 group by years order by cot desc limit 1;
+--------------------+------------------+--+
| question5_2.years | question5_2.cot |
+--------------------+------------------+--+
| 1998 | 27 |
+--------------------+------------------+--+
3)求出这一年最好的10部电影。
create table question5_3 as select moviename,years,avgrate from question5_1 where years=1998 order by avgrate desc limit 10;
+---------------------------------------------+--------------------+----------------------+--+
| question5_3.moviename | question5_3.years | question5_3.avgrate |
+---------------------------------------------+--------------------+----------------------+--+
| Follow the Bitch (1998) | 1998 | 5.0 |
| Apple, The (Sib) (1998) | 1998 | 4.666666666666667 |
| Inheritors, The (Die Siebtelbauern) (1998) | 1998 | 4.5 |
| Return with Honor (1998) | 1998 | 4.4 |
| Saving Private Ryan (1998) | 1998 | 4.337353938937053 |
| Celebration, The (Festen) (1998) | 1998 | 4.3076923076923075 |
| West Beirut (West Beyrouth) (1998) | 1998 | 4.3 |
| Central Station (Central do Brasil) (1998) | 1998 | 4.283720930232558 |
| 42 Up (1998) | 1998 | 4.2272727272727275 |
| American History X (1998) | 1998 | 4.2265625 |
+---------------------------------------------+--------------------+----------------------+--+