配置hive
1 下载hive版本
2 解压hive
$ tar -xzvf hive-x.y.z.tar.gz
$ cd hive-x.y.z
3 设置环境变量
export HIVE_HOME=/Users/$User/software/hive/apache-hive-2.1.1
export HADOOP_HOME=/Users/$User/software/hadoop/hadoop-2.9.1
export PATH=$HIVE_HOME/bin:$PATH
export HIVE_CONF_DIR=/Users/$User/software/hive/apache-hive-2.1.1/conf
4 建立目录
# 注意:以下两条命令会递归删除 HDFS 上已有的 /tmp 和 /user 目录及其全部数据,
# 仅应在全新环境中执行;已有数据的集群请跳过这两步。
$HADOOP_HOME/bin/hadoop fs -rm -r /tmp
$HADOOP_HOME/bin/hadoop fs -rm -r /user/
$HADOOP_HOME/bin/hadoop fs -mkdir /tmp
$HADOOP_HOME/bin/hadoop fs -mkdir -p /user/hive/warehouse
$HADOOP_HOME/bin/hadoop fs -chmod g+w /tmp
$HADOOP_HOME/bin/hadoop fs -chmod g+w /user/hive/warehouse
5 创建元数据表
mysql -u root -p
输入密码root
-- Recreate the Hive metastore database and its MySQL account from scratch.
-- IF EXISTS makes the script safe to run on a fresh MySQL instance where
-- neither the database nor the user exists yet.
-- (DROP USER IF EXISTS requires MySQL 5.7+.)
DROP DATABASE IF EXISTS hive;
CREATE DATABASE hive;
USE hive;
DROP USER IF EXISTS 'hive'@'%';
CREATE USER 'hive'@'%' IDENTIFIED BY '123456';
-- Grant only what the metastore needs: full control of its own database,
-- not global *.* access, and no GRANT OPTION.
GRANT ALL PRIVILEGES ON hive.* TO 'hive'@'%';
FLUSH PRIVILEGES;
6 初始化元数据:
$HIVE_HOME/bin/schematool -dbType mysql -initSchema
7 hive启动服务:
cd $HIVE_HOME
$HIVE_HOME/bin/hive --service metastore &      # 后台启动 metastore 服务端
$HIVE_HOME/bin/hive --service hiveserver2 &    # 后台启动 hiveserver2,供远程客户端(如 beeline)连接
$HIVE_HOME/bin/beeline
!connect jdbc:hive2://localhost:10000 $user_name $password
8 查看执行情况:
9 案例:
创建关于电影的一张表:
0: jdbc:hive2://localhost:10000>
-- MovieLens ratings loaded verbatim from the tab-separated u.data file.
-- unixtime stays STRING because the raw file stores it as text; it is
-- parsed downstream by weekday_mapper.py.
-- IF NOT EXISTS keeps the statement idempotent when the tutorial is re-run.
CREATE TABLE IF NOT EXISTS u_data (
    userid   INT,      -- rater's user id
    movieid  INT,      -- rated movie id
    rating   INT,      -- rating value
    unixtime STRING)   -- rating time, epoch seconds as text
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
下载数据源
$ curl --remote-name http://files.grouplens.org/datasets/movielens/ml-100k.zip
文件下载在当前目录下,解压数据包
$ unzip ml-100k.zip
导入数据包到表里面
0: jdbc:hive2://localhost:10000>
-- LOCAL INPATH reads from the client machine's filesystem; replace the
-- path with wherever ml-100k.zip was actually unzipped.
-- OVERWRITE discards any rows already in u_data before loading.
LOAD DATA LOCAL INPATH '/Users/xxxxxxxx/Downloads/ml-100k/u.data'
OVERWRITE INTO TABLE u_data;
查询总的记录数,应该是100000条
0: jdbc:hive2://localhost:10000>
-- Sanity check: the ml-100k dataset contains exactly 100000 ratings.
SELECT COUNT(*) FROM u_data;
创建一张新的表,用于存放把时间戳转换为星期几之后的记录数据
0: jdbc:hive2://localhost:10000>
-- Target table for the TRANSFORM step: same ratings, but with the raw
-- unixtime replaced by the derived ISO weekday.
-- IF NOT EXISTS keeps the statement idempotent when the tutorial is re-run.
CREATE TABLE IF NOT EXISTS u_data_new (
    userid  INT,      -- rater's user id
    movieid INT,      -- rated movie id
    rating  INT,      -- rating value
    weekday INT)      -- ISO weekday 1..7 emitted by weekday_mapper.py
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;   -- explicit, for consistency with u_data
输入
$ vi weekday_mapper.py
import sys
import datetime
for line in sys.stdin:
line = line.strip()
userid, movieid, rating, unixtime = line.split('\t')
weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
print '\t'.join([userid, movieid, rating, str(weekday)])
0: jdbc:hive2://localhost:10000>
add FILE /Users/$User/Downloads/weekday_mapper.py;
0: jdbc:hive2://localhost:10000>
-- Re-populate u_data_new by streaming every u_data row through the
-- weekday_mapper.py script (shipped to the cluster via `add FILE`).
-- TRANSFORM feeds tab-separated input to the script's stdin and parses
-- its tab-separated stdout into the AS (...) columns.
-- OVERWRITE discards any previous contents of u_data_new.
INSERT OVERWRITE TABLE u_data_new
SELECT
TRANSFORM (userid, movieid, rating, unixtime)
USING 'python weekday_mapper.py'
AS (userid, movieid, rating, weekday)
FROM u_data;
0: jdbc:hive2://localhost:10000>
-- Ratings per ISO weekday (1 = Monday .. 7 = Sunday).
-- ORDER BY makes the output deterministic so runs can be compared;
-- the aggregate is aliased instead of appearing as an unnamed column.
SELECT weekday, COUNT(*) AS rating_count
FROM u_data_new
GROUP BY weekday
ORDER BY weekday;