Problem description:
When a SparkSession is used inside Spark Streaming's foreachRDD to read data from Hive, only the default database shows up.
Solution:
1. Put the cluster's core-site.xml, hdfs-site.xml, and hive-site.xml into the resources directory.
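If these files are not on the classpath, Spark silently falls back to a local embedded metastore, which is exactly why only the default database shows up. A minimal sanity check (my own sketch, not part of the original fix) that hive-site.xml is really visible to the driver:

// Sketch: verify hive-site.xml is on the classpath before building the SparkSession.
// If getResource returns null, Spark will use an embedded catalog and
// "show databases" will only list "default".
val hiveSiteUrl = getClass.getClassLoader.getResource("hive-site.xml")
require(hiveSiteUrl != null, "hive-site.xml not found on the classpath")
println(s"hive-site.xml loaded from: $hiveSiteUrl")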
2. Modify the code.
Original code:
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

@transient
val sparkConf = new SparkConf()
  .setAppName("REPORT_SYSTEM")
  .setMaster("local[*]")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // enable backpressure
  .set("spark.streaming.backpressure.enabled", "true")
  // cap the number of records read per Kafka partition per batch
  .set("spark.streaming.kafka.maxRatePerPartition", "100")
val ssc = new StreamingContext(sparkConf, Seconds(60))
...
DStream.foreachRDD(rdd => {
  if (!rdd.isEmpty()) {
    SQLContextSingleton.getInstance(rdd.sparkContext.getConf).sql("show databases").show()
  }
})
...
ssc.start()
ssc.awaitTermination()
object SQLContextSingleton {
  // Lazily build a single Hive-enabled SparkSession and reuse it across batches.
  @transient
  private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession.builder()
        .config(sparkConf)
        .config("hive.metastore.uris", "thrift://localhost:9083")
        .config("spark.sql.warehouse.dir", GlobalConfigUtil.hdfsHosts + "/user/hive2/warehouse")
        .config("hive.metastore.warehouse.dir", GlobalConfigUtil.hdfsHosts + "/user/hive2/warehouse")
        .enableHiveSupport()
        .getOrCreate()
    }
    instance
  }
}
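With the original code, a quick way to confirm the session never actually connected to the Hive metastore is to check which catalog implementation it ended up with. This diagnostic sketch is my own addition, assuming the standard Spark property names:

// Diagnostic sketch: "hive" means Hive support is active,
// "in-memory" means the session fell back to Spark's built-in catalog
// and will only show the default database.
val session = SQLContextSingleton.getInstance(sparkConf)
println(session.conf.get("spark.sql.catalogImplementation"))
println(session.conf.get("spark.sql.warehouse.dir"))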
Modified code (build the Hive-enabled SparkSession first, then create the StreamingContext from its SparkContext, so every query inside foreachRDD goes through the same Hive-aware session):
@transient
val sparkConf = new SparkConf()
  .setAppName("REPORT_SYSTEM")
  .setMaster("local[*]")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // enable backpressure
  .set("spark.streaming.backpressure.enabled", "true")
  // cap the number of records read per Kafka partition per batch
  .set("spark.streaming.kafka.maxRatePerPartition", "100")
val spark: SparkSession = SparkSession.builder()
  .enableHiveSupport()
  .config(sparkConf)
  .config("hive.metastore.uris", "thrift://localhost:9083")
  .getOrCreate()
@transient
val sc = spark.sparkContext
sc.setLogLevel("WARN")
val ssc = new StreamingContext(sc, Seconds(60))
...
DStream.foreachRDD(rdd => {
  if (!rdd.isEmpty()) {
    spark.sql("show databases").show()
  }
})
...
ssc.start()
ssc.awaitTermination()
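After the change, the same SparkSession can also query databases other than default from inside foreachRDD. A hypothetical usage sketch (the database and table names below are placeholders, not from the original post):

DStream.foreachRDD(rdd => {
  if (!rdd.isEmpty()) {
    // Placeholder names: replace report_db / daily_report with a real Hive database and table.
    spark.sql("use report_db")
    spark.sql("select count(*) from daily_report").show()
  }
})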