1. Reading the configuration files
According to the relevant section of the official documentation, for Spark to pick up the Hive and Hadoop configuration, the three files hive-site.xml, core-site.xml, and hdfs-site.xml can be placed under ${SPARK_HOME}/conf on the cluster. When developing a Maven project, put them under the resources directory instead, so the instantiated SparkConf object can pick up the values. Since Spark connects to the Hive metastore service over the Thrift protocol, hive-site.xml should contain the following:
<property>
  <name>hive.metastore.uris</name>
  <value>thrift://vm01:9083,thrift://vm02:9083,thrift://vm03:9083</value>
</property>
<property>
  <name>hive.metastore.schema.verification</name>
  <value>false</value>
</property>
With the configuration files in place, start the Hadoop cluster and the Hive metastore service:
#!/bin/bash
# Start the Hadoop cluster (HDFS and YARN)
${HADOOP_HOME}/sbin/start-all.sh
# Start the Hive metastore (runs in the foreground; background it with nohup/& if needed)
${HIVE_HOME}/bin/hive --service metastore
Add the relevant dependency to the Maven project's pom.xml:
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>${spark.version}</version>
</dependency>
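Note that ${spark.version} is a Maven property you have to define yourself. For the Scala 2.11 artifact above that means a Spark 2.x release; the exact number below is just an assumption, so match it to whatever your cluster actually runs:
<properties>
  <!-- assumed version; use the Spark release your cluster runs -->
  <spark.version>2.4.8</spark.version>
</properties>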
Now the code:
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

public class JavaSparkSQLHive {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("JavaSparkSQLHive").setMaster("local");
        // enableHiveSupport() makes the session talk to the metastore configured in hive-site.xml
        SparkSession spark = SparkSession.builder()
                .config(conf)
                .enableHiveSupport()
                .getOrCreate();
        spark.sql("SHOW DATABASES").show();
        spark.close();
    }
}
That's all it takes. If the configuration files cannot be picked up automatically, you can read and load them by hand.
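A minimal sketch of that manual route, assuming the metastore URIs from the hive-site.xml shown earlier and hypothetical /etc/hadoop/conf paths for the Hadoop files (point them at wherever your copies actually live). The Hive setting goes in through the builder, and the Hadoop XML files are added as extra resources on the session's Hadoop Configuration:
import org.apache.hadoop.fs.Path;
import org.apache.spark.sql.SparkSession;

public class JavaSparkManualConf {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .appName("JavaSparkManualConf")
                .master("local")
                // value taken from the hive-site.xml shown earlier
                .config("hive.metastore.uris",
                        "thrift://vm01:9083,thrift://vm02:9083,thrift://vm03:9083")
                .enableHiveSupport()
                .getOrCreate();
        // Hypothetical locations; adjust to your own environment.
        spark.sparkContext().hadoopConfiguration()
                .addResource(new Path("/etc/hadoop/conf/core-site.xml"));
        spark.sparkContext().hadoopConfiguration()
                .addResource(new Path("/etc/hadoop/conf/hdfs-site.xml"));
        spark.sql("SHOW DATABASES").show();
        spark.close();
    }
}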
2. Custom configuration via the SparkConf class
First, a short digression on Hadoop. As we know, when developing against Hadoop in a Maven project, the quickest route is likewise to drop core-site.xml, hdfs-site.xml, and friends into the resources directory. But sometimes the configuration files carry so many entries that they are hard to read, or we want to set things up ourselves, in which case we write something like this:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import java.io.IOException;
import java.util.StringJoiner;

public class HaNamNodeDemo {
    private static final String defaultFSStr = "vm01:9000,vm02:9000,vm03:9000";
    private static final String splitter = ",";

    public static void main(String[] args) throws IOException {
        // Hadoop client configuration for an HA nameservice
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hdfsCluster");
        conf.set("dfs.nameservices", "hdfsCluster");
        StringJoiner hostJoiner = new StringJoiner(splitter);
        String[] defaultFSArr = defaultFSStr.split(splitter);
        for (int i = 1; i <= defaultFSArr.length; i++) {
            // Register logical names nn1, nn2, nn3 and map each to its RPC address
            hostJoiner.add("nn" + i);
            conf.set("dfs.namenode.rpc-address.hdfsCluster.nn" + i, defaultFSArr[i - 1]);
        }
        conf.set("dfs.ha.namenodes.hdfsCluster", hostJoiner.toString());
        conf.set("dfs.client.failover.proxy.provider.hdfsCluster",
                "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        // List the files under /user on HDFS and print their paths
        FileSystem fs = FileSystem.get(conf);
        RemoteIterator<LocatedFileStatus> files = fs.listFiles(new Path("/user"), false);
        while (files.hasNext()) {
            System.out.println(files.next().getPath());
        }
        fs.close();
    }
}
Here the Configuration instance sets up NameNode high availability: with three NameNodes registered, the client follows along automatically when the cluster fails over between active and standby. Swap Configuration for SparkConf and the code reads exactly the same:
package com.baich.bigdata.learning_spark_java.spark_sql;

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import java.util.StringJoiner;

public class JavaSparkReadHive {
    private static final String defaultFSStr = "vm01:9000,vm02:9000,vm03:9000";
    private static final String splitter = ",";

    public static void main(String[] args) {
        // Spark configuration, carrying the same HDFS HA settings as before
        SparkConf conf = new SparkConf().setAppName("JavaSparkReadHive").setMaster("local");
        conf.set("fs.defaultFS", "hdfs://hdfsCluster");
        conf.set("dfs.nameservices", "hdfsCluster");
        StringJoiner hostJoiner = new StringJoiner(splitter);
        String[] defaultFSArr = defaultFSStr.split(splitter);
        for (int i = 1; i <= defaultFSArr.length; i++) {
            // Register logical names nn1, nn2, nn3 and map each to its RPC address
            hostJoiner.add("nn" + i);
            conf.set("dfs.namenode.rpc-address.hdfsCluster.nn" + i, defaultFSArr[i - 1]);
        }
        conf.set("dfs.ha.namenodes.hdfsCluster", hostJoiner.toString());
        conf.set("dfs.client.failover.proxy.provider.hdfsCluster",
                "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider");
        // Hive connection settings
        conf.set("hive.exec.dynamic.partition.mode", "nonstrict");
        conf.set("hive.metastore.uris", "thrift://vm01:9083,thrift://vm02:9083,thrift://vm03:9083");
        SparkSession spark = SparkSession.builder()
                .config(conf)
                .enableHiveSupport()
                .getOrCreate();
        spark.sql("SHOW DATABASES").show();
        spark.close();
    }
}
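One more thing worth knowing: Spark also documents a dedicated spark.hadoop. prefix for exactly this purpose. Any SparkConf entry written as spark.hadoop.<key> is copied by Spark into the Hadoop Configuration it builds internally, so if the bare keys above are not picked up in your environment, the prefixed spelling is the safer choice. A minimal sketch of the first two settings in that form:
import org.apache.spark.SparkConf;

public class PrefixedHadoopConfDemo {
    public static void main(String[] args) {
        // Same HDFS settings as above, written with the documented "spark.hadoop." prefix
        // so that Spark copies them into the Hadoop Configuration it builds internally.
        SparkConf conf = new SparkConf()
                .setAppName("PrefixedHadoopConfDemo")
                .setMaster("local")
                .set("spark.hadoop.fs.defaultFS", "hdfs://hdfsCluster")
                .set("spark.hadoop.dfs.nameservices", "hdfsCluster");
        System.out.println(conf.toDebugString());
    }
}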
END.