SparkStreamingOffsetMysql: saving Kafka offsets to MySQL

Maven dependencies:
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
            <!--            <scope>provided</scope>-->
        </dependency>
        <!-- needed for the KafkaUtils/KafkaCluster API used below; the artifact name
             assumes Spark 2.x with the Kafka 0.8 direct-stream integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.26</version>
        </dependency>
        <dependency>
            <groupId>org.scalikejdbc</groupId>
            <artifactId>scalikejdbc-core_2.11</artifactId>
            <version>2.5.0</version>
        </dependency>
        <dependency>
            <groupId>org.scalikejdbc</groupId>
            <artifactId>scalikejdbc-config_2.11</artifactId>
            <version>2.5.0</version>
        </dependency>
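
Two more pieces of setup are assumed by the code below and are not shown in the original post: DBs.setup() reads the JDBC connection settings from an application.conf on the classpath, and "replace into" only behaves as an upsert if the offsets table has a matching primary key. A minimal sketch follows; the database name, credentials and column sizes are placeholders, not values from the post.

src/main/resources/application.conf:

    db.default.driver = "com.mysql.jdbc.Driver"
    db.default.url = "jdbc:mysql://localhost:3306/test?characterEncoding=utf8"
    db.default.user = "root"
    db.default.password = "root"

MySQL table:

    -- the composite primary key is what lets "replace into" overwrite the previous offset
    CREATE TABLE offsets (
      groupId     VARCHAR(100) NOT NULL,
      topic       VARCHAR(200) NOT NULL,
      partitions  INT          NOT NULL,
      untilOffset BIGINT       NOT NULL,
      PRIMARY KEY (groupId, topic, partitions)
    );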
package com.sparkStreaming.Demo12_OffsetMysql

/**
 * Created by Shi shuai RollerQing on 2019/12/21 10:38
 */

import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaCluster.Err
import org.apache.spark.streaming.kafka.{HasOffsetRanges, KafkaCluster, KafkaUtils}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scalikejdbc.{DB, SQL}
import scalikejdbc.config.DBs

/**
 * Save the Kafka offsets to MySQL
 */
object SparkStreamingOffsetMysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ssom").setMaster("local[2]")
    val ssc = new StreamingContext(conf,Seconds(3))
    // Basic configuration
    val groupid = "gp0123"
    val brokerList = "192.168.14.128:9092,192.168.14.129:9092,192.168.14.130:9092"
    val topic = "tt"
    // there may be more than one topic
    val topics = Set(topic)
    // Kafka parameters
    val kafkas = Map(
      "metadata.broker.list" -> brokerList,
      "group.id" -> groupid,
      "auto.offset.reset" -> kafka.api.OffsetRequest.SmallestTimeString
    )
    // load the JDBC configuration (application.conf)
    DBs.setup()
    // No need to query ZooKeeper for offsets any more; read them straight from MySQL
    val fromdbOffset: Map[TopicAndPartition, Long] =
      DB.readOnly {
        implicit session =>
          // query all offsets stored for this consumer group
          SQL(s"select * from offsets where groupId = '${groupid}'")
            // turn each row into (TopicAndPartition, untilOffset)
            .map(m => (TopicAndPartition(
              m.string("topic"), m.int("partitions")), m.long("untilOffset")))
            .toList().apply()
      }.toMap // the query returns a List, so convert it to the declared Map type
    // Create the InputDStream; where it starts reading depends on the stored offsets
    var kafkaStream: InputDStream[(String, String)] = null
    // Decide based on what was found in MySQL
    if (fromdbOffset.isEmpty) {
      // first start: no offsets stored yet
      kafkaStream = KafkaUtils.
        createDirectStream[String,String,StringDecoder,StringDecoder](
          ssc,kafkas,topics)
    } else {
      // Not the first start: resume from the offsets stored in MySQL
      // map of (topic, partition) -> offset the stream should start from
      var checkedOffsets = Map[TopicAndPartition, Long]()
      // Kafka metadata client built from the same Kafka parameters
      val kafkaCluster = new KafkaCluster(kafkas)
      // ask Kafka for the earliest offset still available for every stored partition
      val earliestOffsets: Either[Err,
        Map[TopicAndPartition, KafkaCluster.LeaderOffset]] =
        kafkaCluster.getEarliestLeaderOffsets(fromdbOffset.keySet)
      // compare the offsets stored in MySQL with Kafka's earliest offsets
      if (earliestOffsets.isRight) {
        // unwrap the map we need
        val topicAndPartitionOffset:
          Map[TopicAndPartition, KafkaCluster.LeaderOffset] =
          earliestOffsets.right.get
        // take the larger of the two, so we never request data Kafka has already deleted
        checkedOffsets = fromdbOffset.map(owner => {
          // earliest offset Kafka still holds for this partition
          val topicOffset = topicAndPartitionOffset.get(owner._1).get.offset
          if (owner._2 > topicOffset) {
            owner
          } else {
            (owner._1, topicOffset)
          }
        })
      }
      // resume reading from the checked offsets
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => {
        (mmd.key(), mmd.message())
      }
      kafkaStream = KafkaUtils.
        createDirectStream[String, String,
          StringDecoder, StringDecoder,
          (String, String)](ssc, kafkas, checkedOffsets, messageHandler)
    }
    // Process the stream; this part is the same as the earlier ZooKeeper version
    kafkaStream.foreachRDD(kafkaRDD => {
      // grab this batch's offset ranges first; they are used for the update below
      val offsetRanges = kafkaRDD.asInstanceOf[HasOffsetRanges].offsetRanges
      val lines = kafkaRDD.map(_._2)
      lines.foreach(println)

      // update the offsets in MySQL inside one local transaction
      DB.localTx {
        implicit session =>
          // one row per (topic, partition) of this batch
          for (os <- offsetRanges) {
            // (alternative kept from the original: committing via an UPDATE statement)
            // SQL("UPDATE offsets SET groupId=?,topic=?,partitions=?,untilOffset=?")
            //   .bind(groupid, os.topic, os.partition, os.untilOffset).update().apply()
            SQL("replace into " +
              "offsets(groupId,topic,partitions,untilOffset) values(?,?,?,?)")
              .bind(groupid,os.topic,os.partition,os.untilOffset)
              .update().apply()
          }
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
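
A quick way to verify that offsets are really being committed is to query the table with the group id configured above; these are the same rows the job reads back and resumes from on its next start:

    select * from offsets where groupId = 'gp0123';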

Flink can also persist offsets to MySQL by building on its checkpoint/savepoint mechanism and writing the relevant state out whenever a snapshot is taken. The rough steps:

1. Enable checkpointing in the Flink job:

```
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(5000);
env.setStateBackend(new FsStateBackend("hdfs://localhost:9000/flink/checkpoints"));
```

2. Implement the checkpoint callbacks and write the offset to MySQL inside them. A simple example; the callbacks live on a map function here, so Flink actually invokes them on every checkpoint:

```
public class OffsetCheckpoint extends RichMapFunction<String, String>
        implements ListCheckpointed<Long> {

    private Long offset = 0L;
    private transient Connection connection;

    @Override
    public void open(Configuration parameters) throws Exception {
        // open the JDBC connection on the task, not on the client that builds the job graph
        Class.forName("com.mysql.jdbc.Driver");
        connection = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/test", "root", "password");
    }

    @Override
    public String map(String value) throws Exception {
        // process data; here we simply count records as a stand-in for the offset
        offset++;
        return value;
    }

    @Override
    public List<Long> snapshotState(long checkpointId, long timestamp) throws Exception {
        // called on every checkpoint: write the current offset to MySQL
        PreparedStatement statement = connection.prepareStatement(
                "INSERT INTO offsets (checkpoint_id, offset) VALUES (?, ?)");
        statement.setLong(1, checkpointId);
        statement.setLong(2, offset);
        statement.executeUpdate();
        return Collections.singletonList(offset);
    }

    @Override
    public void restoreState(List<Long> state) throws Exception {
        // called on recovery: resume from the last saved offset
        if (!state.isEmpty()) {
            offset = state.get(0);
        }
    }
}
```

3. Finally, wire the function into the job; Flink then calls snapshotState on every checkpoint and restoreState on recovery:

```
DataStreamSource<String> stream = env.addSource(
        new FlinkKafkaConsumer<>("topic", new SimpleStringSchema(), properties));

stream
        .map(new OffsetCheckpoint())   // checkpointed operator that writes offsets to MySQL
        .print();

env.execute("job");
```

In the code above, OffsetCheckpoint implements Flink's ListCheckpointed interface, so with checkpointing enabled (every 5000 ms here) the current offset is written to MySQL on each checkpoint and restored from state after a failure. Those are the basic steps for saving Flink offsets to MySQL; a real implementation also has to deal with exception handling, connection pooling, and similar concerns.