Spark Streaming: inserting data into MySQL through a pooled data source

import java.sql.Connection

import com.jolbox.bonecp.{BoneCP, BoneCPConfig}
import org.slf4j.LoggerFactory

object ConnectionPool {

  val logger = LoggerFactory.getLogger(this.getClass)

  // The BoneCP pool is created lazily, once per JVM, the first time this object is referenced.
  private val connectionPool: Option[BoneCP] = {
    try {
      Class.forName("com.mysql.jdbc.Driver")
      val config = new BoneCPConfig()
      config.setJdbcUrl("jdbc:mysql://192.168.0.46:3306/test")
      config.setUsername("test")
      config.setPassword("test")
      config.setMinConnectionsPerPartition(2)
      config.setMaxConnectionsPerPartition(5)
      config.setPartitionCount(3)
      config.setCloseConnectionWatch(true)
      config.setLogStatementsEnabled(true)
      Some(new BoneCP(config))
    } catch {
      case exception: Exception =>
        logger.warn("Error in creation of connection pool", exception)
        None
    }
  }

  // Borrow a connection from the pool; None if the pool failed to initialise.
  def getConnection: Option[Connection] = {
    connectionPool match {
      case Some(connPool) => Some(connPool.getConnection)
      case None => None
    }
  }

  // "Closing" a pooled connection returns it to the pool.
  def closeConnection(connection: Connection): Unit = {
    if (!connection.isClosed) connection.close()
  }
}
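Before wiring the pool into the streaming job, it can be worth checking it in isolation. The snippet below is not from the original post: `ConnectionPoolSmokeTest` is a throwaway name introduced here, and the `SELECT 1` query is only a connectivity check against the MySQL instance configured above.

```scala
object ConnectionPoolSmokeTest {
  def main(args: Array[String]): Unit = {
    ConnectionPool.getConnection match {
      case Some(connection) =>
        val stmt = connection.createStatement()
        try {
          // Trivial query that succeeds on any MySQL server
          val rs = stmt.executeQuery("SELECT 1")
          while (rs.next()) println(s"pool is alive, got ${rs.getInt(1)}")
        } finally {
          stmt.close()
          ConnectionPool.closeConnection(connection)
        }
      case None =>
        println("connection pool was not initialised; check the JDBC URL and credentials")
    }
  }
}
```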

 

import java.sql.{Connection, PreparedStatement}

import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.slf4j.LoggerFactory

/**
 * Records the data received in the most recent five-second batch.
 */
object RealtimeCount1 {

  case class Loging(vtime:Long,muid:String,uid:String,ucp:String,category:String,autoSid:Int,dealerId:String,tuanId:String,newsId:String)

  case class Record(vtime:Long,muid:String,uid:String,item:String,types:String)


  val logger = LoggerFactory.getLogger(this.getClass)

  def main(args: Array[String]) {
    // Hard-coded arguments for local testing: ZooKeeper quorum, consumer group, topic list, thread count
    val argc = new Array[String](4)
    argc(0) = "10.0.0.37"
    argc(1) = "test-1"
    argc(2) = "test22"
    argc(3) = "1"
    val Array(zkQuorum, group, topics, numThreads) = argc
    val sparkConf = new SparkConf().setAppName("RealtimeCount").setMaster("local[2]")
    val sc = new SparkContext(sparkConf)
    val ssc = new StreamingContext(sc, Seconds(5))


    val topicMap = topics.split(",").map((_, numThreads.toInt)).toMap

    // Receiver-based Kafka stream; each message value is one tab-separated log line
    val lines = KafkaUtils.createStream(ssc, zkQuorum, group, topicMap).map(_._2)

    val sql = "insert into loging_realtime1(vtime,muid,uid,item,category) values (?,?,?,?,?)"

    val tmpdf = lines.map(_.split("\t"))
      .map(x => Loging(x(9).toLong, x(1), x(0), x(3), x(25), x(18).toInt, x(29), x(30), x(28)))
      .filter(x => x.muid != null && !x.muid.equals("null") && !"".equals(x.muid))
      .map(x => Record(x.vtime, x.muid, x.uid,
        getItem(x.category, x.ucp, x.newsId, x.autoSid.toInt, x.dealerId, x.tuanId),
        getType(x.category, x.ucp, x.newsId, x.autoSid.toInt, x.dealerId, x.tuanId)))

    tmpdf.filter(x => x.types != null).foreachRDD { rdd =>
      // Borrow one pooled connection per partition instead of one per record
      rdd.foreachPartition { partitionRecords =>
        ConnectionPool.getConnection.foreach { connection =>
          partitionRecords.foreach(record => process(connection, sql, record))
          ConnectionPool.closeConnection(connection)
        }
      }
    }
    ssc.start()
    ssc.awaitTermination()
  }

  // Derive the item identifier, checking the news URL first, then autoSid, dealerId and tuanId in that order
  def getItem(category: String, ucp: String, newsId: String, autoSid: Int, dealerId: String, tuanId: String): String = {
    if(category!=null && !category.equals("null")){
      val pattern = "http://www.ihaha.com/\\d{4}-\\d{2}-\\d{2}/\\d{9}.html"
      val matcher = ucp.matches(pattern)
      if(matcher) {
        ucp.substring(33,42)
      }else{
        null
      }
    }else if(autoSid!=0){
      autoSid.toString
    }else if(dealerId!=null && !dealerId.equals("null")){
      dealerId
    }else if(tuanId!=null && !tuanId.equals("null")){
      tuanId
    }else{
      null
    }
  }

  // Classify the record: news types ("nv", "ns", "ne") when the URL matches, otherwise "as", "di" or "ti"
  def getType(category: String, ucp: String, newsId: String, autoSid: Int, dealerId: String, tuanId: String): String = {
    if(category!=null && !category.equals("null")){
      val pattern = "100000726;100000730;\\d{9};\\d{9}"
      val matcher = category.matches(pattern)

      val pattern1 = "http://www.chexun.com/\\d{4}-\\d{2}-\\d{2}/\\d{9}.html"
      val matcher1 = ucp.matches(pattern1)

      if(matcher1 && matcher) {
        "nv"
      }else if(newsId!=null && !newsId.equals("null") && matcher1){
        "ns"
      }else if(matcher1){
        "ne"
      }else{
        null
      }
    }else if(autoSid!=0){
      "as"
    }else if(dealerId!=null && !dealerId.equals("null")){
      "di"
    }else if(tuanId!=null && !tuanId.equals("null")){
      "ti"
    }else{
      null
    }
  }

  // Insert a single record; the PreparedStatement is closed in all cases to avoid leaking resources
  def process(conn: Connection, sql: String, data: Record): Unit = {
    var ps: PreparedStatement = null
    try {
      ps = conn.prepareStatement(sql)
      ps.setLong(1, data.vtime)
      ps.setString(2, data.muid)
      ps.setString(3, data.uid)
      ps.setString(4, data.item)
      ps.setString(5, data.types)
      ps.executeUpdate()
    } catch {
      case exception: Exception =>
        logger.warn("Error in execution of query", exception)
    } finally {
      if (ps != null) ps.close()
    }
  }
}
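`process` issues one `executeUpdate` per record. If single-row inserts become a bottleneck, the same pooled connection can also drive JDBC batching inside each partition. The method below is only a sketch, not part of the original code: `processPartition` is a name introduced here, and it assumes it sits next to `process` inside `RealtimeCount1` so that `Record` and `logger` are in scope.

```scala
  // Sketch: one PreparedStatement per partition, rows accumulated with addBatch()
  // and flushed once with executeBatch().
  def processPartition(conn: Connection, sql: String, records: Iterator[Record]): Unit = {
    val ps = conn.prepareStatement(sql)
    try {
      records.foreach { data =>
        ps.setLong(1, data.vtime)
        ps.setString(2, data.muid)
        ps.setString(3, data.uid)
        ps.setString(4, data.item)
        ps.setString(5, data.types)
        ps.addBatch()
      }
      ps.executeBatch()
    } catch {
      case exception: Exception =>
        logger.warn("Error in execution of batched insert", exception)
    } finally {
      ps.close()
    }
  }
```

Inside `foreachPartition` this would replace the per-record loop: borrow a connection, call `processPartition(connection, sql, partitionRecords)`, then return it with `ConnectionPool.closeConnection`.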

 

The key point above is obtaining connections through a connection pool rather than opening one per record: the `ConnectionPool` object is initialised once per executor JVM, and each partition borrows a single connection in `foreachPartition`, writes its records with `process`, and then returns the connection to the pool.
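The job also assumes that the target table already exists. The original post does not give its schema, so the snippet below is only a guess at a compatible table: the column names come from the insert statement, while the column types and lengths are assumptions inferred from how `process` binds the parameters.

```scala
object CreateTargetTable {
  def main(args: Array[String]): Unit = {
    // Column types/lengths are assumptions; only the column names come from the insert statement.
    val ddl =
      """CREATE TABLE IF NOT EXISTS loging_realtime1 (
        |  vtime    BIGINT,
        |  muid     VARCHAR(64),
        |  uid      VARCHAR(64),
        |  item     VARCHAR(128),
        |  category VARCHAR(16)
        |)""".stripMargin
    ConnectionPool.getConnection.foreach { connection =>
      val stmt = connection.createStatement()
      try {
        stmt.executeUpdate(ddl)
      } finally {
        stmt.close()
        ConnectionPool.closeConnection(connection)
      }
    }
  }
}
```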

JDBC can also be used to bring MySQL data into a Spark Streaming job. The general steps are:

1. Create a StreamingContext.
2. Configure the JDBC connection settings for the MySQL database.
3. Spark Streaming has no built-in MySQL input source, so read the table through Spark's JDBC DataFrame reader (once up front, or again inside each batch if the table changes).
4. Apply the required transformations to each batch of streaming data.
5. Write the results back to MySQL with the JDBC DataFrame writer.

A rough example:

```python
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

spark = SparkSession.builder.appName("MySQLStream").getOrCreate()
ssc = StreamingContext(spark.sparkContext, batchDuration=1)

# JDBC connection settings
jdbc_url = "jdbc:mysql://localhost:3306/mydatabase"
jdbc_props = {"user": "username", "password": "password"}

# The streaming input still comes from a regular source (socket, Kafka, ...);
# the MySQL table is read via JDBC inside each batch.
lines = ssc.socketTextStream("localhost", 9999)

def process_stream(batch_time, rdd):
    if not rdd.isEmpty():
        # Turn the batch into a DataFrame
        batch_df = spark.createDataFrame(rdd.map(lambda v: (v,)), ["value"])
        # Optionally read reference data from MySQL and join / enrich here
        ref_df = spark.read.jdbc(url=jdbc_url, table="mytable", properties=jdbc_props)
        processed = batch_df  # place the real transformations here
        # Write the batch result back to MySQL
        processed.write.jdbc(url=jdbc_url, table="outputtable",
                             mode="append", properties=jdbc_props)

lines.foreachRDD(process_stream)

ssc.start()
ssc.awaitTermination()
```

Note: this is only a simple example; a production job needs more error handling and tuning.