电影点评系统的数据来源
//下载1M的数据就够了
固定链接: http://grouplens.org/datasets/movielens/
//创建hivestudy数据库
hive> create database hivestudy;
//查看数据库
hive> show databases;
OK
default
hivestudy
//使用该数据库
hive> use hivestudy;
//创建用户表
hive> CREATE TABLE users(UserID BigInt,Gender String,Age Int,Occupation String,Zipcode String) PARTITIONED BY (dt String) ROW FORMAT DELIMITED FIELDS TERMINATED BY '::';
删除表操作:DROP TABLE users;
//导入数据
hive> LOAD DATA LOCAL INPATH '/bigdata/learn_data/ml-1m/users.dat' INTO TABLE users PARTITION(dt=20161201);
//查看数据条数
hive> select count(1) from users;
//创建点评表
hive> CREATE TABLE ratings(UserID BigInt, MovieID BigInt, Rating Double, Timestamped String) ROW FORMAT DELIMITED FIELDS TERMINATED BY '::';
//导入数据
hive> LOAD DATA LOCAL INPATH '/bigdata/learn_data/ml-1m/ratings.dat' INTO TABLE ratings;
//创建电影表
hive> CREATE TABLE movies(MovieID BigInt, Title String, Genres String) ROW FORMAT DELIMITED FIELDS TERMINATED BY '::';
//导入数据
hive> LOAD DATA LOCAL INPATH '/bigdata/learn_data/ml-1m/movies.dat' INTO TABLE movies;
//进行join操作
hive> SELECT users.UserID, users.Age, users.Gender FROM ratings JOIN users ON (ratings.UserID = users.UserID) WHERE ratings.MovieID = 2116 ;
上述操作发现没有结果,查看数据发现有的数值是UNLL,原因此处的:: 分割,如果不对导入数据是空的。: 可以改成二进制编码,再次加载数据。或者把数据分割符:: 全部替换成为\t.
DROP TABLE users;
DROP TABLE ratings;
DROP TABLE movies;
CREATE TABLE users(UserID BigInt,Gender String,Age Int,Occupation String,Zipcode String) PARTITIONED BY (dt String) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
LOAD DATA LOCAL INPATH '/bigdata/learn_data/ml-1m/users.dat' INTO TABLE users PARTITION(dt=20161201);
CREATE TABLE ratings(UserID BigInt, MovieID BigInt, Rating Double, Timestamped String) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
LOAD DATA LOCAL INPATH '/bigdata/learn_data/ml-1m/ratings.dat' INTO TABLE ratings;
CREATE TABLE movies(MovieID BigInt, Title String, Genres String) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
LOAD DATA LOCAL INPATH '/bigdata/learn_data/ml-1m/movies.dat' INTO TABLE movies;
//进行join操作
hive> SELECT users.UserID, users.Age, users.Gender FROM ratings JOIN users ON (ratings.UserID = users.UserID) WHERE ratings.MovieID = 2116 ;
hive> SELECT users.UserID, users.Age, users.Gender,movies.Title,movies.Genres FROM ratings JOIN users ON (ratings.UserID = users.UserID) JOIN movies on(ratings.MovieID = movies.MovieID) WHERE ratings.MovieID = 2116 ;
//map端进行join 提升速度
hive> SELECT /*+MAPJOIN(movies)*/ users.UserID, users.Age, users.Gender,movies.Title,movies.Genres FROM ratings JOIN users ON (ratings.UserID = users.UserID) JOIN movies on(ratings.MovieID = movies.MovieID) WHERE ratings.MovieID = 2116 ;