一、案例说明
现有如此三份数据:
1、users.dat 数据格式为: 2::M::56::16::70072,
共有6040条数据
对应字段为:UserID BigInt, Gender String, Age Int, Occupation String, Zipcode String
对应字段中文解释:用户id,性别,年龄,职业,邮政编码
2、movies.dat 数据格式为: 2::Jumanji (1995)::Adventure|Children's|Fantasy,
共有3883条数据
对应字段为:MovieID BigInt, Title String, Genres String
对应字段中文解释:电影ID,电影名字,电影类型
3、ratings.dat 数据格式为: 1::1193::5::978300760,
共有1000209条数据
对应字段为:UserID BigInt, MovieID BigInt, Rating Double, Timestamped String
对应字段中文解释:用户ID,电影ID,评分,评分时间戳
要求:
将数据分别导入MySQL和Hive数据库,并分别在里面编写SQL语句完成问题解答。
二、获取数据
将数据从网页上下载https://files.cnblogs.com/files/qingyunzong/hive%E5%BD%B1%E8%AF%84%E6%A1%88%E4%BE%8B.zipfiles.cnblogs.com
解压后使用rz命令上传至服务器
rz
放在lanyin目录下面
三、导入数据
1、导入MySQL
(1)python写数据到mysql
import pymysql
conn = pymysql.connect(host = 'localhost',user = 'root',password = '********',port = 3306,db = 'lanyin')
cursor = conn.cursor()
i=0
with open(r'Y:\datafrog\movie_comment\hive_movie\users.dat',"r",encoding='utf8') as f:
for row in f.readlines():
userid = row.split("::")[0]
sex = row.split("::")[1]
age = row.split("::")[2]
occupation = row.split("::")[3]
zipcode = row.split("::")[4].rstrip('\n')
sql = "insert into user values('{}','{}','{}','{}','{}');".format(userid,sex,age,occupation,zipcode)
i+=1
cursor.execute(sql)
conn.commit()
print("成功插入{:.2f}%的数据".format(i*100/6040))
print(userid,sex,age,occupation,zipcode)
# try:
# cursor.execute(sql)
# conn.commit()
# print("成功插入第{}条数据".format(i))
# print(userid,movieid,rating,timestamped)
# except:
# conn.rollback()
cursor.close() # conn.rollback()
conn.close()
(2)pandas写入mysql,注意安装sqlalchemy
import pandas as pd
from sqlalchemy import create_engine
# conn = pymysql.connect(host = 'localhost',user = 'root',password = '********',port = 3306,db = 'lanyin')
# cursor = conn.cursor()
conn = create_engine('mysql+pymysql://root:7950288@localhost:3306/lanyin?charset=utf8')
df = pd.read_csv('movies.dat',delimiter='::',encoding='utf8',header = None,index_col=None)
print(df.columns.values)
df.rename(columns={0:'movieid',1:'moviename',2:'movietype'},inplace=True)
print(df)
df.to_sql('movies',conn,if_exists='append',index = False)
2、Hive建表导入数据
(1)建表
在hive里面建立自己的数据库
create database lanyin;
在其中建立三个表,由于源文件是用”::“分隔的,所以需要使用能够进行多字节解析的SerDe
使用RegexSerde
需要两个参数:
input.regex = "(.*)::(.*)::(.*)"
output.format.string = "%1$s %2$s %3$s"
分别建立users,movie,和rating三个表,注意不能是user会有关键字重复报错
use lanyin;
create table users(
userid bigint,
sex string,
age int,
occupation string,
zipcode string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s %5$s')
stored as textfile;
create table movies(
movieid bigint,
moviename string,
movietype string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s')
stored as textfile;
create table ratings(
userid bigint,
movieid bigint,
rating double,
timestamped string)
row format serde 'org.apache.hadoop.hive.serde2.RegexSerDe'
with serdeproperties('input.regex'='(.*)::(.*)::(.*)::(.*)','output.format.string'='%1$s %2$s %3$s %4$s')
stored as textfile;
(2)导入数据
load data local inpath "/root/lanyin/users.dat" into table users;
load data local inpath "/root/lanyin/movies.dat" into table movies;
load data local inpath "/root/lanyin/ratings.dat" into table ratings;
(3)查看数据
三、题目解答
(1)求被评分次数最多的10部电影,并给出评分次数(电影名,评分次数)
MySQL:
SELECT moviename,rating_count FROM(
SELECT a.movieid,COUNT(a.rating)as rating_count FROM ratings AS a
GROUP BY a.movieid ORDER BY rating_count DESC LIMIT 10) as b
LEFT JOIN movies
ON b.movieid = movies.movieid;
结果
Hive:
select moviename,count(rating) as rating_count from ratings
left join movies on ratings.movieid = movies.movieid
group by moviename
order by rating_count desc
limit 10;
结果:
(2)分别求男性,女性当中评分最高的10部电影(性别,电影名,影评分)(至少评分50次)
MySQL
SELECT sex,moviename,avg_rating FROM
(SELECT sex,movieid,AVG(rating) AS avg_rating,COUNT(movieid) AS total
FROM users LEFT JOIN ratings
ON users.userid = ratings.userid
WHERE sex = 'F'
GROUP BY movieid
HAVING total>50
ORDER BY avg_rating DESC
LIMIT 10) AS a LEFT JOIN movies
ON a.movieid = movies.movieid;
或者
SELECT sex,ratings.movieid,AVG(rating) AS avg_rating,COUNT(ratings.movieid) AS total
FROM users
LEFT JOIN ratings
ON users.userid = ratings.userid
LEFT JOIN movies
ON ratings.movieid = movies.movieid
WHERE sex = 'F'
GROUP BY ratings.movieid
HAVING total>50
ORDER BY avg_rating DESC
LIMIT 10;
结果
男性一样操作,将“F”改成“M”即可。
Hive:
select sex,moviename,avg(rating) as avg_rating,count(moviename) as total from ratings
left join movies on ratings.movieid = movies.movieid
left join users on ratings.userid = users.userid
where sex = 'M'
group by sex,moviename
having total> 50
order by avg_rating desc
limit 10;
结果:
(3)求movieid = 2116这部电影各年龄段(因为年龄就只有7个,就按这个7个分就好了)的平均影评(年龄段,影评分)
MySQL:
SELECT age,AVG(rating) FROM users LEFT JOIN ratings
ON users.userid = ratings.userid
WHERE movieid = 2116
GROUP BY age;
结果
Hive:
select age,round(avg(rate),2) from ratings
left join users on users.userid = ratings.userid
where movieid = 2116
group by age;
结果:
(4)求最喜欢看电影(影评次数最多)的那位女性评最高分的10部电影的平均影评分(观影者,电影名,影评分)
MySQL:
SELECT moviename,avg_rating FROM
(SELECT movieid,AVG(rating) AS avg_rating FROM ratings
WHERE movieid IN
(SELECT movieid FROM
(SELECT movieid,rating FROM ratings
WHERE userid = (SELECT ratings.userid FROM ratings
LEFT JOIN users
ON ratings.userid = users.userid
WHERE sex = 'F'
GROUP BY userid
ORDER BY COUNT(rating) DESC
LIMIT 1)
ORDER BY rating DESC
LIMIT 10) AS a)
GROUP BY movieid) AS b
LEFT JOIN movies
ON b.movieid = movies.movieid;
结果
Hive:
1、先求出最喜欢看电影的女性,得出userid = 1150
select users.userid,count(users.userid) as counts from users
left join ratings on users.userid = ratings.userid
where sex = 'F'
group by users.userid
order by counts desc
limit 1;
2、接着找出userid为1150的女性评分最高的10部电影的movieid
create table F_top10 as
select ratings.movieid,rating from ratings
left join movies on ratings.movieid = movies.movieid
where userid = 1150
order by rating desc
limit 10;
3、最后,从ratings里面找到这10个movieid的平均影评分
select F_top10.movieid,moviename,avg(ratings.rating) from F_top10
left join ratings on F_top10.movieid = ratings.movieid
left join movies on ratings.movieid = movies.movieid
group by F_top10.movieid,moviename;
结果:
(5)求好片(评分>=4.0)最多的那个年份的最好看的10部电影
MySQL
1、创建视图,找出每部电影平均分
CREATE VIEW year_view AS
SELECT SUBSTRING(moviename,-5,4) AS movieyear,moviename,AVG(rating) AS avg_rating FROM ratings
LEFT JOIN movies
ON ratings.movieid = movies.movieid
GROUP BY moviename;
2、找出好片最多的年份
SELECT movieyear,COUNT(moviename) AS movie_count FROM year_view
WHERE avg_rating>=4.0
GROUP BY movieyear
ORDER BY movie_count DESC
LIMIT 1;
3、找出该年份最好看的10部电影
SELECT moviename,avg_rating FROM year_view
WHERE movieyear = 1998
ORDER BY avg_rating DESC
LIMIT 10;
结果
Hive:
1、找到好片最多的年份1999
select substr(moviename,length(moviename)-4,4) as movie_year,count(moviename) as amount from movies
left join ratings on movies.movieid = ratings.movieid
where rating>=4.0
group by moviename,substr(moviename,length(moviename)-4,4)
---group by后面不能使用别名
order by amount desc
limit 1;
2、找到1999年最好看的10部电影
select movies.moviename,avg(rating) as avg_rate from movies
left join ratings on movies.movieid = ratings.movieid
where substr(moviename,length(moviename)-4,4) = 1999
group by movies.movieid,movies.moviename
order by avg_rate desc
limit 10;
结果
(6)求1997年上映的电影中,评分最高的10部Comedy类电影
MySQL
1、创建comedy_view
CREATE VIEW comedy_view AS
SELECT SUBSTRING(moviename,-5,4) AS movieyear,moviename,rating FROM ratings
LEFT JOIN movies
ON ratings.movieid = movies.movieid
WHERE LOCATE('comedy',movietype);
2、从comedy_view中找出1997年评分最高的10部Comedy类电影
SELECT moviename, AVG(rating) AS avg_rating FROM comedy_view
WHERE movieyear = 1997
GROUP BY moviename
ORDER BY avg_rating DESC
LIMIT 10;
结果
Hive
1、求出电影年份,用substr()
create table movie_years as
select movies.movieid as movieid, movies.moviename as moviename, substr(movies.moviename,-5,4) as years, ratings.rating as rating,movies.movietype as movietype
from movies left join ratings on movies.movieid=ratings.movieid ;
2、求出10部comedy电影,判断是不是Comedy用instr()>0
select moviename,avg(rating) as avg_rating from movie_years
where years = 1997 and instr(lower(movietype), 'comedy')>0
group by moviename
order by avg_rating desc
limit 10;
结果
(7)该影评库中各种类型电影中评价最高的5部电影(类型,电影名,平均影评分)
MySQL
1、视图中不能包含子查询,先创建平均分视图
CREATE VIEW avg_rating_view AS
SELECT movieid,AVG(rating) AS avg_rating
FROM ratings
GROUP BY movieid
2、将类型分开,一行变多行
CREATE VIEW type_view AS
SELECT movies.movieid,moviename,
SUBSTRING_INDEX(SUBSTRING_INDEX(movietype,'|',help_topic_id+1),'|',-1) AS type,
avg_rating
FROM movies LEFT JOIN mysql.help_topic
ON help_topic_id
LEFT JOIN avg_rating_view
ON movies.movieid = avg_rating_view.movieid;
3、分组排序求前五
SELECT c.* FROM (
SELECT a.type,a.moviename,a.avg_rating FROM type_view AS a
WHERE (
SELECT COUNT(b.type) FROM type_view AS b
WHERE a.type = b.type
AND a.avg_rating
ORDER BY a.type,a.avg_rating DESC) AS c
WHERE c.avg_rating IS NOT NULL;
结果
Hive
1、先拆分类型 用explode
create table movie_types as
select movieid,moviename,years,avg(rating) as avg_rating,type_exploded.movietype as movietype from movie_years
lateral view explode(split(movietype,'\\|')) type_exploded as movietype
group by movieid,moviename,years,type_exploded.movietype;
2、求top_N,用row_number() over按照类型求每种类型的top_N
create table movietype_topN as
select movieid,moviename,years,avg_rating,movietype,row_number() over(distribute by movietype sort by avg_rating desc) as rank
from movie_types;
3、求前5
select * from movietype_topN where rank<=5;
结果:
(8)各年评分最高的电影类型(年份,类型,影评分)
MySQL
1、视图中不能包含子查询,先创建平均分视图
CREATE VIEW avg_rating_view AS
SELECT movieid,AVG(rating) AS avg_rating
FROM ratings
GROUP BY movieid
2、将类型分开,一行变多行,并加入年份
CREATE VIEW year_type_view AS
SELECT movies.movieid,moviename,SUBSTRING(moviename,-5,4) AS movieyear,
SUBSTRING_INDEX(SUBSTRING_INDEX(movietype,'|',help_topic_id+1),'|',-1) AS type,
avg_rating
FROM movies LEFT JOIN mysql.help_topic
ON help_topic_id
LEFT JOIN avg_rating_view
ON movies.movieid = avg_rating_view.movieid;
3、通过两次groupby,找出每年评分最高的类型
SELECT movieyear,type,MAX(avg_year_rating) FROM
(SELECT movieyear,type,AVG(avg_rating) AS avg_year_rating FROM year_type_view
GROUP BY movieyear,type) AS a
GROUP BY movieyear;
但是这样找出来的类型是错误的,为了解决这个问题,这里暂时再联一次表
SELECT a.movieyear,a.type,b.max_rating FROM(
SELECT movieyear,type,AVG(avg_rating) AS avg_year_rating FROM year_type_view
GROUP BY movieyear,type
) AS a INNER JOIN
(SELECT movieyear,MAX(avg_year_rating) AS max_rating FROM
(SELECT movieyear,type,AVG(avg_rating) AS avg_year_rating FROM year_type_view
GROUP BY movieyear,type) AS c
GROUP BY movieyear) AS b
ON a.movieyear = b.movieyear AND a.avg_year_rating = b.max_rating;
结果:
错误结果如下
正确结果如下:
Hive
利用上一题分好列的movie_types,求出每年最好看的电影类型
select a.years,a.movietype,max(a.rating ) from(
select years,movietype,avg(avg_rating) as rating from movie_types
group by years,movietype) as a
group by a.years;
但是,这里跟这样写直接报错,movietype没有使用聚合函数无法出现。
因此转换思路,求出每年各类电影的平均分之后按照年份分组排序,求最大值
1、求出每年各类电影平均分
create table movie_year_type as
select years,movietype,avg(avg_rating) as rating from movie_types
group by years,movietype
order by rating;
2、分组排序
select a.* from (select years,movietype, rating,row_number() over(partition by years order by rating desc) as rank
from movie_year_type) as a
where a.rank = 1;
结果如下:
(9)每个地区最高评分的电影名,把结果存入HDFS(地区,电影名,影评分)
跟前面差不多,这里就不再使用MySQL了,需要存入HDFS
1、先求出每个地区电影的平均分
create table zipcode_rating as
select zipcode,moviename,avg(rating) as avg_rating from ratings
left join users
on users.userid = ratings.userid
left join movies
on movies.movieid = ratings.movieid
group by zipcode,moviename;
2、分组排名
select a.* from(
select zipcode,moviename,avg_rating,row_number() over(partition by zipcode order by avg_rating desc) as rank
from zipcode_rating) as a
where a.rank = 1;
3、写入HDFS
insert overwrite directory "/lanyin/" select a.* from(
select zipcode,moviename,avg_rating,row_number() over(partition by zipcode order by avg_rating desc) as rank
from zipcode_rating) as a
where a.rank = 1;