Background: consume Kafka with Spark Streaming, then write the data into Hive with Spark SQL.
The problem: in a standalone Spark SQL program, sql("show databases") correctly lists all databases. But when the same code is combined with Spark Streaming in a single program, sql("show databases") shows only the default database, even though both programs print the same spark.sql.warehouse.dir path.
Initial code:
package com.liuxw.main

import com.alibaba.fastjson.{JSON, JSONObject}
import com.liuxw.bean.Car
import com.liuxw.kafka.MyKafkaUtil
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Author XiaoWen
 * Date 2021/2/25 11:20
 * Version 1.0
 */
object SparkToHive {
  def main(args: Array[String]): Unit = {
    // Constants
    val hiveDriver = "org.apache.hadoop.hive.jdbc.HiveDriver"
    val hiveUrl = "jdbc:hive2://ambari1:2181,ambari2:2181,ambari3:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2"
    val warehouseLocation = "/warehouse/tablespace/managed/hive"
    // Class.forName(hiveDriver)
    // val connection: Connection = DriverManager.getConnection(hiveUrl)

    // Spark Streaming init
    val sparkConf: SparkConf = new SparkConf().setAppName("WriteToHiveTmp").setMaster("local[*]")
    val streamingContext: StreamingContext = new StreamingContext(sparkConf, Seconds(5))

    // Spark SQL init
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("tmpToPart")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // Consume the Kafka data
    val recordStream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream("vehicle", streamingContext)
    val jsonStream: DStream[JSONObject] = recordStream.map(_.value()).map(jsonString => {
      val data: JSONObject = JSON.parseObject(jsonString)
      println(data)
      // Rewrite the license-plate province prefix ("鲁" -> "京")
      val lp: String = data.get("license_plate").toString.replace("鲁", "京")
      data.replace("license_plate", lp)
      data
    })

    // Convert each RDD of the DStream to a DataFrame and write it into Hive
    jsonStream.foreachRDD(rdd => {
      if (rdd.count() > 0) {
        println("=======================================================================================")
        println("RDD size : " + rdd.count())
        println("=======================================================================================")
        import sparkSession.implicits._
        // After importing sparkSession.sql, sparkSession.sql("xxx") can be shortened to sql("xxx")
        import sparkSession.sql
        println("show databases")
        sql("show databases").show()
        sql("use yisa_oe")
        val carDF: DataFrame = rdd.map(data => Car(
            data.getIntValue("capture_time"),
            data.getLongValue("location_id"),
            data.getIntValue("color_id"),
            data.getString("license_plate")))
          .toDF()
        carDF.createOrReplaceTempView("tmpVehicle")
        println("show tables...")
        sql("show tables").show()
        println("select capture_time,location_id,color_id,license_plate from tmpVehicle limit 10")
        sql("select capture_time,location_id,color_id,license_plate from tmpVehicle limit 10").show()
        println("insert...")
        sql("insert into test_vehicle partition(date = '2020-03-02') " +
          "select capture_time,location_id,color_id,license_plate from tmpVehicle")
        println("over.")
      }
    })

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
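The post does not include com.liuxw.bean.Car or com.liuxw.kafka.MyKafkaUtil. For completeness, here is a hypothetical sketch of what they might look like, assuming the spark-streaming-kafka-0-10 integration; the Car field names are inferred from the getIntValue/getLongValue/getString calls above, and the broker addresses and group id are placeholders, not taken from the original:

// Hypothetical sketch of com.liuxw.bean.Car; fields inferred from the
// getIntValue/getLongValue/getString calls in the job above.
case class Car(capture_time: Int, location_id: Long, color_id: Int, license_plate: String)

// Hypothetical sketch of com.liuxw.kafka.MyKafkaUtil, assuming the
// spark-streaming-kafka-0-10 integration.
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object MyKafkaUtil {
  private val kafkaParams = Map[String, Object](
    "bootstrap.servers" -> "ambari1:9092,ambari2:9092,ambari3:9092", // placeholder brokers
    "key.deserializer" -> classOf[StringDeserializer],
    "value.deserializer" -> classOf[StringDeserializer],
    "group.id" -> "vehicle-consumer", // placeholder group id
    "auto.offset.reset" -> "latest",
    "enable.auto.commit" -> (true: java.lang.Boolean)
  )

  // Subscribe to a single topic and return a direct stream of ConsumerRecords
  def getKafkaStream(topic: String, ssc: StreamingContext): InputDStream[ConsumerRecord[String, String]] = {
    KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Seq(topic), kafkaParams)
    )
  }
}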
Solution:
Since the Spark SQL code worked fine on its own, the code itself could be ruled out. I then moved the Spark SQL initialization ahead of the Spark Streaming initialization, and ran sparkSession.sql("show databases") immediately after creating the SparkSession.
This time Spark SQL listed the databases correctly, but the program then threw an error:
Exception in thread "main" org.apache.spark.SparkException: Only one SparkContext may be running in this JVM (see SPARK-2243). To ignore this error, set spark.driver.allowMultipleContexts = true. The currently running SparkContext was created at:
org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)
As the message says, a SparkContext was created once for Spark SQL and then a second time for Spark Streaming, and only one may run per JVM. This likely also explains the original symptom: with the streaming init first, the StreamingContext created the SparkContext, the SparkSession then reused it, and enableHiveSupport() apparently never took effect, leaving the session on Spark's built-in in-memory catalog, which contains only the default database.
The fix is to create a single SparkContext and share it. After that change, the program started normally.
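The key change, in short (the full listing follows): build the StreamingContext from the SparkSession's existing SparkContext instead of from a fresh SparkConf.

// Before: creates a second SparkContext, conflicting with the SparkSession's
// val streamingContext = new StreamingContext(sparkConf, Seconds(5))
// After: reuse the one SparkContext owned by the SparkSession
val streamingContext = new StreamingContext(sparkSession.sparkContext, Seconds(5))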
The fixed code:
package com.liuxw.main

import com.alibaba.fastjson.{JSON, JSONObject}
import com.liuxw.bean.Car
import com.liuxw.kafka.MyKafkaUtil
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Author XiaoWen
 * Date 2021/2/25 11:20
 * Version 1.0
 */
object SparkToHive {
  def main(args: Array[String]): Unit = {
    // Constants
    val hiveDriver = "org.apache.hadoop.hive.jdbc.HiveDriver"
    val hiveUrl = "jdbc:hive2://ambari1:2181,ambari2:2181,ambari3:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2"
    val warehouseLocation = "/warehouse/tablespace/managed/hive"
    // Class.forName(hiveDriver)
    // val connection: Connection = DriverManager.getConnection(hiveUrl)

    // Spark SQL init: create the SparkSession (and its SparkContext) first
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("tmpToPart")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // There must be exactly one SparkContext; creating another one below would conflict
    val sc: SparkContext = sparkSession.sparkContext
    /*println("=======================================================================================")
    sparkSession.sql("show databases").show()
    println("=======================================================================================")*/

    // Spark Streaming init: reuse the existing SparkContext
    // val sparkConf: SparkConf = new SparkConf().setAppName("WriteToHiveTmp").setMaster("local[*]")
    val streamingContext: StreamingContext = new StreamingContext(sc, Seconds(5))

    // Consume the Kafka data
    val recordStream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream("vehicle", streamingContext)
    val jsonStream: DStream[JSONObject] = recordStream.map(_.value()).map(jsonString => {
      val data: JSONObject = JSON.parseObject(jsonString)
      println(data)
      // Rewrite the license-plate province prefix ("鲁" -> "京")
      val lp: String = data.get("license_plate").toString.replace("鲁", "京")
      data.replace("license_plate", lp)
      data
    })

    // Convert each RDD of the DStream to a DataFrame and write it into Hive
    jsonStream.foreachRDD(rdd => {
      if (rdd.count() > 0) {
        println("=======================================================================================")
        println("RDD size : " + rdd.count())
        println("=======================================================================================")
        import sparkSession.implicits._
        // After importing sparkSession.sql, sparkSession.sql("xxx") can be shortened to sql("xxx")
        import sparkSession.sql
        println("show databases")
        sql("show databases").show()
        sql("use yisa_oe")
        val carDF: DataFrame = rdd.map(data => Car(
            data.getIntValue("capture_time"),
            data.getLongValue("location_id"),
            data.getIntValue("color_id"),
            data.getString("license_plate")))
          .toDF()
        carDF.createOrReplaceTempView("tmpVehicle")
        println("show tables...")
        sql("show tables").show()
println("select capture_time,location_id,color_id,license_plate from tmpVehicle order by capture_time desc limit 10 ")
sql("select capture_time,location_id,color_id,license_plate from tmpVehicle limit 10").show()
println("insert...")
sql("insert into test_vehicle partition(capture_date = '2020-03-02') " +
"select capture_time,location_id,color_id,license_plate from tmpVehicle")
println("over.")
println("select capture_time,location_id,color_id,license_plate from tmpVehicle order by capture_time desc limit 10")
println()
      }
    })

    streamingContext.start()
    streamingContext.awaitTermination()
  }
}
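One quick sanity check worth running right after the SparkSession is created (my suggestion, not from the original post): read spark.sql.catalogImplementation from the session. It should be "hive" when enableHiveSupport() took effect; "in-memory" means the session fell back to Spark's built-in catalog, which contains only the default database, exactly the symptom described at the top.

// "hive" = Hive metastore catalog in use; "in-memory" = built-in catalog (default db only)
println(sparkSession.conf.get("spark.sql.catalogImplementation"))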