1. Modify the configuration files:
1.1 Modify the Alluxio configuration template
Add the following to roles/install-alluxio/templates/alluxio-site.properties.j2:
alluxio.zookeeper.enabled=true
alluxio.zookeeper.address={{ groups['zookeeper'][0] }}:2181,{{ groups['zookeeper'][1] }}:2181,{{ groups['zookeeper'][2] }}:2181
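Once the playbook has rendered this template, the ZooKeeper-based HA setup can be sanity-checked from any Alluxio node. A minimal sketch, assuming the install prefix is /opt/alluxio-1.8.1 (the same value used for alluxio_path below):
cd /opt/alluxio-1.8.1
grep zookeeper conf/alluxio-site.properties   # should show the two properties above
bin/alluxio runTests                          # writes and reads test files through the cluster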
1.2 Modify the Spark configuration file
playbooks/spark/vars.yml:
Add the Alluxio configuration:
#alluxio
alluxio_path: "/opt/alluxio-1.8.1"
spark_driver_path: "{{ alluxio_path }}/client/alluxio-1.8.1-client.jar"
spark_executor_path: "{{ alluxio_path }}/client/alluxio-1.8.1-client.jar"
alluxio_zookeeper_address: "192.168.6.12:2181,192.168.6.13:2181,192.168.6.14:2181"
# Add the following entries to the spark_properties section:
spark_properties:
  - { "name": "spark.driver.extraClassPath", "value": "{{ spark_driver_path }}" }
  - { "name": "spark.executor.extraClassPath", "value": "{{ spark_executor_path }}" }
  - { "name": "spark.driver.extraJavaOptions", "value": "-Dalluxio.zookeeper.address={{ alluxio_zookeeper_address }} -Dalluxio.zookeeper.enabled=true" }
  - { "name": "spark.executor.extraJavaOptions", "value": "-Dalluxio.zookeeper.address={{ alluxio_zookeeper_address }} -Dalluxio.zookeeper.enabled=true" }
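Before wiring Hive in, the Spark-side classpath and JVM options can be checked directly with spark-shell. This is only a sketch: it assumes spark-shell is on PATH, the jar path and ZooKeeper address mirror the variables above, and /ml-100k/u.user already exists in Alluxio (see step 2):
# Count the lines of an Alluxio file from a throwaway spark-shell session.
echo 'sc.textFile("alluxio:///ml-100k/u.user").count' | \
  spark-shell \
    --conf spark.driver.extraClassPath=/opt/alluxio-1.8.1/client/alluxio-1.8.1-client.jar \
    --conf spark.executor.extraClassPath=/opt/alluxio-1.8.1/client/alluxio-1.8.1-client.jar \
    --conf spark.driver.extraJavaOptions="-Dalluxio.zookeeper.address=192.168.6.12:2181,192.168.6.13:2181,192.168.6.14:2181 -Dalluxio.zookeeper.enabled=true" \
    --conf spark.executor.extraJavaOptions="-Dalluxio.zookeeper.address=192.168.6.12:2181,192.168.6.13:2181,192.168.6.14:2181 -Dalluxio.zookeeper.enabled=true"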
1.3 Modify the Hive variables file
playbooks/hive/vars.yml:
#alluxio
alluxio_path: "/opt/alluxio-1.8.1"
alluxio_zookeeper_address: "192.168.6.12:2181,192.168.6.13:2181,192.168.6.14:2181"
# Add the following entries to the Hive configuration:
hive_site_properties:
  - { "name": "fs.alluxio.impl", "value": "alluxio.hadoop.FileSystem" }
  - { "name": "hive.metastore.schema.verification", "value": "false" }
  - { "name": "alluxio.zookeeper.enabled", "value": "true" }
  - { "name": "alluxio.zookeeper.address", "value": "{{ alluxio_zookeeper_address }}" }
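After the Hive role has run, the rendered hive-site.xml should contain the four properties above. A quick check (the conf path below is an assumption and depends on where the playbook installs Hive):
grep -A1 -E 'fs.alluxio.impl|alluxio.zookeeper' /opt/hive/conf/hive-site.xml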
1.4 Modify the Hive environment script
roles/hive/templates/hive_env.sh:
export HIVE_AUX_JARS_PATH={{ alluxio_path }}/client/alluxio-1.8.1-client.jar:$HIVE_AUX_JARS_PATH
export CLASSPATH=$CLASSPATH:.:$HIVE_HOME/lib:$HIVE_AUX_JARS_PATH
2. Prepare the test data
Download the MovieLens 100k dataset from https://grouplens.org/datasets/movielens/ and unpack it to /opt/ml-100k, then load it into Alluxio:
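A minimal download sketch, assuming the node has outbound internet access and unzip installed (the archive is usually published at files.grouplens.org):
cd /opt
wget http://files.grouplens.org/datasets/movielens/ml-100k.zip
unzip ml-100k.zip    # extracts /opt/ml-100k/, which contains u.user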
bin/alluxio fs mkdir /ml-100k
bin/alluxio fs copyFromLocal /opt/ml-100k/u.user alluxio://192.168.6.12:19998/ml-100k
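To confirm the upload landed, list the directory and peek at the file:
bin/alluxio fs ls /ml-100k
bin/alluxio fs cat /ml-100k/u.user | head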
3. Start the Spark Thrift Server
sbin/start-thriftserver.sh
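A quick way to confirm the server came up, assuming the default Thrift port 10000 and that the daemon logs land under $SPARK_HOME/logs:
ss -lnt | grep 10000                                   # the Thrift JDBC port should be listening
tail -n 50 $SPARK_HOME/logs/*HiveThriftServer2*.out    # check for startup errors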
4. Test with beeline
beeline
!connect jdbc:hive2://192.168.6.12:10000/default
hadoop #username
hadoop #password
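The same connection can also be made non-interactively; this one-liner reuses the host, port, and hadoop/hadoop credentials from the steps above:
beeline -u jdbc:hive2://192.168.6.12:10000/default -n hadoop -p hadoop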
After connecting successfully as the hadoop user, create the tables first and then query them:
Internal (managed) table:
CREATE TABLE u_user (
userid INT,
age INT,
gender CHAR(1),
occupation STRING,
zipcode STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
LOCATION 'alluxio://192.168.6.12:19998/ml-100k';
#query:
select * from u_user limit 10;
#count query:
select count(*) from u_user;
#describe the table and run group-by queries:
desc u_user;
select avg(age),gender from u_user group by gender;
select count(*),gender from u_user group by gender;
Create an external table:
CREATE EXTERNAL TABLE u_user_e (
userid INT,
age INT,
gender CHAR(1),
occupation STRING,
zipcode STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '|'
LOCATION 'alluxio://192.168.6.12:19998/ml-100k';
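For a scripted check, the same queries can be run through beeline -e, reusing the connection parameters above (a sketch, not part of the playbooks):
beeline -u jdbc:hive2://192.168.6.12:10000/default -n hadoop -p hadoop \
  -e 'select count(*) from u_user_e;'
Note that both tables point at the same Alluxio directory: dropping the managed table u_user removes the data under alluxio://192.168.6.12:19998/ml-100k, while dropping the external table u_user_e leaves it in place.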