A problem when integrating Spark Streaming with Spark SQL

Background: consume Kafka with Spark Streaming, then write the data into Hive with Spark SQL.

The problem: in a standalone Spark SQL program, sql("show databases") correctly lists all databases. But when the same code is put into one program together with Spark Streaming, sql("show databases") shows only the default database, even though both programs print the same spark warehouse path.
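For reference, the standalone Spark SQL program is not shown in the post, so the sketch below is an assumed reconstruction (the object name ShowDatabasesOnly is made up) that reuses the same warehouse path. Run on its own, it lists every Hive database as expected:

import org.apache.spark.sql.SparkSession

// Hypothetical standalone program, reconstructed from the settings used in the post
object ShowDatabasesOnly {
  def main(args: Array[String]): Unit = {
    // Same warehouse path the streaming job uses
    val warehouseLocation = "/warehouse/tablespace/managed/hive"
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("showDatabasesOnly")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    // Run on its own, this prints all Hive databases, not just default
    sparkSession.sql("show databases").show()
    sparkSession.stop()
  }
}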

Initial code:

package com.liuxw.main

import com.alibaba.fastjson.{JSON, JSONObject}
import com.liuxw.bean.Car
import com.liuxw.kafka.MyKafkaUtil
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Author  XiaoWen
 * Date  2021/2/25 11:20
 * Version 1.0
 */
object SparkToHive {

  def main(args: Array[String]): Unit = {

    // Constants
    val hiveDriver = "org.apache.hadoop.hive.jdbc.HiveDriver"
    val hiveUrl = "jdbc:hive2://ambari1:2181,ambari2:2181,ambari3:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2"
    val warehouseLocation = "/warehouse/tablespace/managed/hive"

//    Class.forName(hiveDriver)
//    val connection: Connection = DriverManager.getConnection(hiveUrl)

    // spark streaming init
    val sparkConf: SparkConf = new SparkConf().setAppName("WriteToHiveTmp").setMaster("local[*]")
    val streamingContext: StreamingContext = new StreamingContext(sparkConf,Seconds(5))
    
    // spark sql init
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("tmpToPart")
      .config("spark.sql.warehouse.dir",warehouseLocation)
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()

    // Receive data from Kafka
    val recordStream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream("vehicle",streamingContext)
    val jsonStream: DStream[JSONObject] = recordStream.map(_.value()).map(jsonString => {
      val data: JSONObject = JSON.parseObject(jsonString)
      println(data)
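      // Replace 鲁 in the license plate with 京 and write it back into the JSON object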
      val lp: String = data.get("license_plate").toString.replace("鲁","京")
      data.replace("license_plate",lp)
      data
    })

    // Convert each RDD of the DStream to a DataFrame and write it into Hive
    jsonStream.foreachRDD(rdd => {
      if (rdd.count()>0) {
        println("=======================================================================================")
        println("RDD size : " + rdd.count())
        println("=======================================================================================")
        import sparkSession.implicits._
        // After importing sparkSession.sql, sparkSession.sql("xxx") can be shortened to sql("xxx")
        import sparkSession.sql
        println("show databases")
        sql("show databases").show()
        sql("use yisa_oe")
        val carDF: DataFrame = rdd.map(data => Car(data.getIntValue("capture_time"),
          data.getLongValue("location_id"),
          data.getIntValue("color_id"),
          data.getString("license_plate")))
          .toDF()
        carDF.createOrReplaceTempView("tmpVehicle")
        println("show tables...")
        sql("show tables").show()
        println("select capture_time,location_id,color_id,license_plate from tmpVehicle limit 10")
        sql("select capture_time,location_id,color_id,license_plate from tmpVehicle limit 10").show()
        println("insert...")
        sql("insert into test_vehicle partition(date = '2020-03-02')  " +
          "select capture_time,location_id,color_id,license_plate from tmpVehicle")
        println("over.")
      }
    })
    streamingContext.start()
    streamingContext.awaitTermination()
  }
}

Solution:

Since the Spark SQL code worked fine when pulled out into a separate program, I ruled out the code itself. I then moved the Spark SQL initialization in front of the Spark Streaming initialization and executed sparkSession.sql("show databases") immediately after creating the SparkSession.

This time Spark SQL correctly listed the databases, but the program then threw an error:

Exception in thread "main" org.apache.spark.SparkException: Only one SparkContext may be running in this JVM (see SPARK-2243). To ignore this error, set spark.driver.allowMultipleContexts = true. The currently running SparkContext was created at:
org.apache.spark.sql.SparkSession$Builder.getOrCreate(SparkSession.scala:925)

As the message says, a SparkContext was created twice: once by Spark SQL (the SparkSession) and once more by Spark Streaming (the StreamingContext). This also explains the original symptom: in the old ordering the StreamingContext created a plain SparkContext first, so the later SparkSession.getOrCreate() reused that existing context and enableHiveSupport() never took effect, which is presumably why only the default database was visible.

After changing the StreamingContext to reuse the SparkSession's SparkContext, the program started normally.
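The key change, slightly condensed from the full code below: create the Hive-enabled SparkSession first, then build the StreamingContext from its existing SparkContext rather than from a fresh SparkConf.

    // Create the Hive-enabled SparkSession first...
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("tmpToPart")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    // ...then reuse its SparkContext when building the StreamingContext
    val streamingContext: StreamingContext = new StreamingContext(sparkSession.sparkContext, Seconds(5))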

Full updated code:

package com.liuxw.main

import com.alibaba.fastjson.{JSON, JSONObject}
import com.liuxw.bean.Car
import com.liuxw.kafka.MyKafkaUtil
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
 * Author  XiaoWen
 * Date  2021/2/25 11:20
 * Version 1.0
 */
object SparkToHive {

  def main(args: Array[String]): Unit = {

    // Constants
    val hiveDriver = "org.apache.hadoop.hive.jdbc.HiveDriver"
    val hiveUrl = "jdbc:hive2://ambari1:2181,ambari2:2181,ambari3:2181/;serviceDiscoveryMode=zooKeeper;zooKeeperNamespace=hiveserver2"
    val warehouseLocation = "/warehouse/tablespace/managed/hive"

//    Class.forName(hiveDriver)
//    val connection: Connection = DriverManager.getConnection(hiveUrl)

    // spark sql init
    val sparkSession: SparkSession = SparkSession
      .builder()
      .appName("tmpToPart")
      .config("spark.sql.warehouse.dir",warehouseLocation)
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    // Only one SparkContext may exist: reuse the SparkSession's context below instead of creating another one, otherwise they conflict
    val sc: SparkContext = sparkSession.sparkContext

    /*println("=======================================================================================")
    sparkSession.sql("show databases").show()
    println("=======================================================================================")*/

    // spark streaming init
//    val sparkConf: SparkConf = new SparkConf().setAppName("WriteToHiveTmp").setMaster("local[*]")
    val streamingContext: StreamingContext = new StreamingContext(sc,Seconds(5))

    // Receive data from Kafka
    val recordStream: InputDStream[ConsumerRecord[String, String]] = MyKafkaUtil.getKafkaStream("vehicle",streamingContext)
    val jsonStream: DStream[JSONObject] = recordStream.map(_.value()).map(jsonString => {
      val data: JSONObject = JSON.parseObject(jsonString)
      println(data)
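      // Replace 鲁 in the license plate with 京 and write it back into the JSON object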
      val lp: String = data.get("license_plate").toString.replace("鲁","京")
      data.replace("license_plate",lp)
      data
    })

    // Convert each RDD of the DStream to a DataFrame and write it into Hive
    jsonStream.foreachRDD(rdd => {
      if (rdd.count()>0) {
        println("=======================================================================================")
        println("RDD size : " + rdd.count())
        println("=======================================================================================")
        import sparkSession.implicits._
        // After importing sparkSession.sql, sparkSession.sql("xxx") can be shortened to sql("xxx")
        import sparkSession.sql
        println("show databases")
        sql("show databases").show()
        sql("use yisa_oe")
        val carDF: DataFrame = rdd.map(data => Car(data.getIntValue("capture_time"),
          data.getLongValue("location_id"),
          data.getIntValue("color_id"),
          data.getString("license_plate")))
          .toDF()
        carDF.createOrReplaceTempView("tmpVehicle")
        println("show tables...")
        sql("show tables").show()
        println("select capture_time,location_id,color_id,license_plate from tmpVehicle order by capture_time desc limit 10 ")
        sql("select capture_time,location_id,color_id,license_plate from tmpVehicle limit 10").show()
        println("insert...")
        sql("insert into test_vehicle partition(capture_date = '2020-03-02')  " +
          "select capture_time,location_id,color_id,license_plate from tmpVehicle")
        println("over.")
        println("select capture_time,location_id,color_id,license_plate from tmpVehicle order by capture_time desc  limit 10")
        println()
      }
    })

    streamingContext.start()
    streamingContext.awaitTermination()

  }

}

 
