## Idempotent Operations

An operation is idempotent when executing it any number of times has the same effect as executing it once: from the user's point of view, replays are invisible. That property is what lets us safely re-process a Kafka batch after a failure without duplicating rows in the database.
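A toy sketch of the idea (plain Scala, nothing Kafka-specific): replaying the same insertion into a `Set` leaves the state unchanged after the first application.

```scala
// Idempotence in miniature: applying the same operation twice
// leaves the state exactly as after the first application.
val once  = Set.empty[String] + "order-1"
val twice = once + "order-1" // replay of the same operation
assert(once == twice)        // the replay changed nothing
```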
The job below applies the same idea end to end: it reads from Kafka with the direct API, upserts each record into MySQL (the idempotent write), and only then stores the batch's offsets in ZooKeeper.

```scala
import java.sql.DriverManager

import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.HasOffsetRanges

/**
 * Direct-stream consumer whose MySQL writes are idempotent (upsert keyed on
 * orderid), so replaying a batch after a failure cannot create duplicates.
 */
object _04KafkaOffsetIdempotent {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
    val processingInterval = 2
    val brokers = "bigdata01:9092,bigdata02:9092,bigdata03:9092"
    val topic = "mytopic1"
    val groupName = "myspark"

    // Create a direct Kafka stream for the given brokers and topics
    val topicsSet = topic.split(",").toSet
    val kafkaParams = Map[String, String](
      "metadata.broker.list" -> brokers,
      "auto.offset.reset" -> "smallest",
      "group.id" -> groupName
    )

    /*
       Test setup:
       1. Create the test MySQL database:
          create database mytest;
       2. Create the table; orderid is the PRIMARY KEY, which is what makes the upsert idempotent:
          CREATE TABLE myorders(NAME VARCHAR(20), orderid VARCHAR(100) PRIMARY KEY);
       3. Create the topic mytopic1:
          kafka-topics.sh --zookeeper bigdata01:2181/kafka --create --topic mytopic1 --partitions 3 --replication-factor 1
       4. Send records to mytopic1 in the form "name,orderid", e.g. abc,3
     */
    val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))

    // Read from Kafka, starting from the offsets stored in ZooKeeper (if any)
    val messages = KafkaManager.createMessage(ssc, kafkaParams, topicsSet, curator)

    val jdbcUrl = "jdbc:mysql://localhost:3306/mytest"
    val jdbcUser = "root"
    val jdbcPassword = "sorry"

    // Process each batch only if it actually contains records
    messages.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        // Capture this batch's offset ranges before transforming the RDD
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        rdd.map(x => x._2).foreachPartition(partition => {
          // One JDBC connection per partition
          val dbConn = DriverManager.getConnection(jdbcUrl, jdbcUser, jdbcPassword)
          partition.foreach(msg => {
            val fields = msg.split(",")
            val name = fields(0)
            val orderid = fields(1)
            // Idempotent upsert: orderid is the primary key, so a replayed
            // record updates the existing row instead of inserting a duplicate
            val sql = "insert into myorders(name, orderid) values (?, ?) ON DUPLICATE KEY UPDATE name = ?"
            val pstmt = dbConn.prepareStatement(sql)
            pstmt.setString(1, name)
            pstmt.setString(2, orderid)
            pstmt.setString(3, name)
            pstmt.execute()
            pstmt.close()
          })
          dbConn.close()
        })
        // Only after the writes succeed, store the offsets back to ZooKeeper
        KafkaManager.storeOffsets(offsetRanges, groupName, curator)
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
  // Curator client used to read and write offsets in ZooKeeper
  val curator = {
    // Build the ZooKeeper client via the builder; unlike newClient(),
    // builder() exposes more parameters, e.g. namespace
    val client = CuratorFrameworkFactory.builder()
      // connectString: address and port of the ZooKeeper service;
      // for a cluster, separate the instances with commas
      .connectString("bigdata01:2181,bigdata02:2181,bigdata03:2181")
      // retryPolicy: reconnection strategy. In ExponentialBackoffRetry(baseSleepTimeMs, maxRetries),
      // baseSleepTimeMs is the base wait between attempts and maxRetries the maximum number of attempts
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      // namespace: agreed-upon ZK root path under which the offsets are stored
      .namespace("kafka/consumers")
      .build()
    client.start()
    client
  }
}
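```

Because `orderid` is the primary key and the insert is an upsert, re-running a batch after a failure rewrites the same rows instead of duplicating them; since offsets are stored only after the writes succeed, the worst case is a replay, which the upsert absorbs.

To exercise step 4 of the setup, here is a minimal producer sketch (a hypothetical helper, assuming the `kafka-clients` library is on the classpath; brokers and topic as above). Sending the same orderid twice deliberately simulates a replay:

```scala
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object SendTestOrders {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "bigdata01:9092,bigdata02:9092,bigdata03:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    // Records are "name,orderid"; the duplicate exercises the upsert path
    producer.send(new ProducerRecord[String, String]("mytopic1", "abc,3"))
    producer.send(new ProducerRecord[String, String]("mytopic1", "abc,3"))
    producer.close()
  }
}
```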
## Reading and writing the direct stream's offsets

The `KafkaManager` object below keeps the direct stream's offsets in ZooKeeper: it reads them before the stream is created and stores them back after each batch. A redis-based variant of the same idea is described in [managing direct-stream offsets with redis](https://mp.csdn.net/mdeditor/97967077#).

```scala
import kafka.common.TopicAndPartition
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import org.apache.curator.framework.CuratorFramework
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}
import org.apache.zookeeper.data.Stat
import scala.collection.mutable
object KafkaManager {
  /**
   * Store each partition's untilOffset under
   * /offset/${topic}/${group}/${partition}
   * (with the client's namespace "kafka/consumers", the full ZK path is
   * /kafka/consumers/offset/${topic}/${group}/${partition})
   */
  def storeOffsets(offsetRanges: Array[OffsetRange], group: String, curator: CuratorFramework) = {
    for (offsetRange <- offsetRanges) {
      val topic = offsetRange.topic
      val partition = offsetRange.partition
      val offset = offsetRange.untilOffset
      val path = s"/offset/${topic}/${group}/${partition}"
      checkExist(path, curator)
      curator.setData().forPath(path, (offset + "").getBytes())
    }
  }
  def createMessage(ssc: StreamingContext, kafkaParams: Map[String, String],
                    topics: Set[String], curator: CuratorFramework): InputDStream[(String, String)] = {
    // Step 1: read the stored offsets (may be empty on the first run)
    val fromOffsets: Map[TopicAndPartition, Long] = getFromOffsets(topics, kafkaParams("group.id"), curator)
    /*
       Step 2: create the Kafka direct stream
       - offsets found: start consuming from the stored offsets
       - no offsets:    start from the position given by auto.offset.reset
     */
    var messages: InputDStream[(String, String)] = null
    if (!fromOffsets.isEmpty) {
      // Offsets found in ZooKeeper
      val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
      messages = KafkaUtils.createDirectStream[String, String,
        StringDecoder, StringDecoder,
        (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
    } else {
      // No offsets stored yet
      messages = KafkaUtils.createDirectStream[String, String,
        StringDecoder, StringDecoder](ssc, kafkaParams, topics)
    }
    messages
  }
  /*
     Reading each topic/partition's stored offset:
     1. Since offsets are managed in ZK, they live under an agreed-upon node
        that is updated after every batch
     2. The layout mimics the one used by the receiver-based approach, e.g.
        /kafka/consumers/bd-1901-group-1/offsets/spark/0   <- receiver-based layout
     3. Operating on ZK requires a ZK client (Curator here)
     Agreed storage layout (the node's data is the offset value):
        /offset/${topic}/${group}/${partition} -> offset
   */
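  // Example of the resulting layout (hypothetical values), given the client
  // namespace "kafka/consumers", topic mytopic1 and group myspark:
  //   /kafka/consumers/offset/mytopic1/myspark/0 -> "42"
  //   /kafka/consumers/offset/mytopic1/myspark/1 -> "17"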
  def getFromOffsets(topics: Set[String], group: String, curator: CuratorFramework): Map[TopicAndPartition, Long] = {
    val fromOffset = mutable.Map[TopicAndPartition, Long]()
    import scala.collection.JavaConversions._
    for (topic <- topics) {
      val basePath = s"/offset/${topic}/${group}"
      // basePath may or may not exist yet; create it if needed
      checkExist(basePath, curator)
      val partitions = curator.getChildren.forPath(basePath) // a java.util.List, hence JavaConversions
      for (partition <- partitions) {
        val path = s"${basePath}/${partition}"
        val offset = new String(curator.getData.forPath(path)).toLong
        fromOffset.put(TopicAndPartition(topic, partition.toInt), offset)
      }
    }
    fromOffset.toMap
  }
  def checkExist(path: String, curator: CuratorFramework): Unit = {
    val stat: Stat = curator.checkExists().forPath(path)
    if (stat == null) {
      // Node does not exist yet: create it, including any missing parent nodes
      curator.create()
        .creatingParentsIfNeeded()
        .forPath(path)
    }
  }
}
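```

For illustration, a minimal standalone round-trip check (hypothetical offset values; a fresh Curator client configured the same way as in the job above) that writes an offset and reads it back the way `createMessage` does on startup:

```scala
import org.apache.curator.framework.CuratorFrameworkFactory
import org.apache.curator.retry.ExponentialBackoffRetry
import org.apache.spark.streaming.kafka.OffsetRange

object OffsetRoundTrip {
  def main(args: Array[String]): Unit = {
    val client = CuratorFrameworkFactory.builder()
      .connectString("bigdata01:2181,bigdata02:2181,bigdata03:2181")
      .retryPolicy(new ExponentialBackoffRetry(1000, 3))
      .namespace("kafka/consumers")
      .build()
    client.start()

    // Pretend a batch for partition 0 of mytopic1 ended at offset 42
    KafkaManager.storeOffsets(Array(OffsetRange("mytopic1", 0, 0L, 42L)), "myspark", client)

    // Read it back; this is what the streaming job does on restart
    val restored = KafkaManager.getFromOffsets(Set("mytopic1"), "myspark", client)
    println(restored) // Map([mytopic1,0] -> 42)

    client.close()
  }
}
```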