Manually maintaining Kafka offsets in Spark => storage backend: MySQL/Redis

package common

import java.util
import redis.clients.jedis.{HostAndPort, JedisCluster, JedisPoolConfig}

/**
  * Utility for obtaining a JedisCluster connection
  * @author IT803300
  * @date 2020-10-15
  */
object RedisDBUtil  extends Serializable  {

  private val nodes: util.Set[HostAndPort] = new util.HashSet[HostAndPort]()
  // connection timeout (ms)
  private var connectionTimeOut: Int = 6000
  // socket read timeout (ms)
  private var soTimeOut: Int = 2000
  // max number of retry attempts
  private var maxAttempts: Int = 3
  // max number of active connections in the pool
  private var maxTotal: Int = 1000
  // max / min number of connections kept idle in the pool
  private var maxIdle: Int = 100
  private var minIdle: Int = 0
  private var poolConfig: JedisPoolConfig = _
  // password
  private var password: String = ""
  var cluster: JedisCluster = _

  def redisCluster(): JedisCluster = {
    cluster = new JedisCluster(nodes, this.connectionTimeOut, this.soTimeOut, this.maxAttempts, this.password, this.poolConfig)
    cluster
  }

  def settingRedis(nodes: Array[String],
                   connectionTimeOut: Int,
                   soTimeOut: Int,
                   maxAttempts: Int,
                   password: String,
                   maxTotal: Int,
                   maxIdle: Int,
                   minIdle: Int): Unit = {
    // register the cluster nodes ("host:port")
    nodes.foreach(node => {
      val ipPort = node.split(":")
      this.nodes.add(new HostAndPort(ipPort(0), ipPort(1).toInt))
    })
    // connection and pool settings only need to be applied once, not once per node
    this.connectionTimeOut = connectionTimeOut
    this.soTimeOut = soTimeOut
    this.maxAttempts = maxAttempts
    this.password = password
    poolConfig = new JedisPoolConfig()
    poolConfig.setMaxTotal(maxTotal)
    poolConfig.setMaxIdle(maxIdle)
    poolConfig.setMinIdle(minIdle)
  }
}
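
A minimal standalone sketch of how this utility is wired up. The node addresses, password and pool sizes below are placeholders; the streaming driver further down reads the real values from its properties file.

object RedisDBUtilExample {
  def main(args: Array[String]): Unit = {
    // placeholder cluster nodes in "host:port" form
    val nodes = Array("127.0.0.1:7000", "127.0.0.1:7001", "127.0.0.1:7002")
    // connectionTimeOut, soTimeOut, maxAttempts, password, maxTotal, maxIdle, minIdle
    RedisDBUtil.settingRedis(nodes, 6000, 2000, 3, "secret", 1000, 100, 0)
    val cluster = RedisDBUtil.redisCluster()
    cluster.set("demo:key", "demo-value")
    println(cluster.get("demo:key"))
  }
}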

package common

// Imports assumed for the driver below. ConfigProperties, KafkaSink, KafkaFactory and
// LogProcessUtil are this project's own helper classes (assumed to live in the same
// package, so no import is needed); JSON.parseObject is assumed to be Alibaba fastjson.
import com.alibaba.fastjson.JSON
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.security.plain.PlainLoginModule
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}

// The enclosing object is not shown in the original post; this name is only a placeholder.
object KafkaOffsetStreaming {

  def main(args: Array[String]): Unit = {
    // argument validation
    //    if (args.length < 1) {
    //      System.err.println( s"""| <configurationFile> is the configuration file """.stripMargin)
    //      System.exit(1)
    //    }


    val conf = new ConfigProperties("D://test.properties") // local testing
    val sparkConf = new SparkConf().setAppName(conf.get("app.name")).setMaster("local") // local testing

    //    val conf = new ConfigProperties(args(0)) // production
    //    val sparkConf = new SparkConf().setAppName(conf.get("app.name")) // production

    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    val scc = new StreamingContext(sparkConf, Seconds(conf.get("streaming.batch.duration").toLong))

    // kafka 初始化
    val topics = conf.get("input.kafka.topics").split(",")
    val username = conf.get("input.kafka.username")
    val password = conf.get("input.kafka.password")
    val saslJaasConfig = classOf[PlainLoginModule].getName + " required username=\"" + username + "\" password=\"" + password + "\";"
    val kafkaServers = conf.get("input.kafka.bootstrap.servers")
    val groupId = conf.get("input.kafka.group.id")

    // base consumer config; optional settings below are appended only when present in the config file
    var kafkaParam = Map[String, Object]("bootstrap.servers" -> kafkaServers,
      "sasl.jaas.config" -> saslJaasConfig,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest",
      "security.protocol" -> "SASL_PLAINTEXT",
      "sasl.mechanism" -> "PLAIN")

    // the Map is immutable, so the result of adding an entry has to be reassigned
    if (conf.get("input.kafka.session.timeout.ms", null) != null) {
      kafkaParam += ("session.timeout.ms" -> conf.get("input.kafka.session.timeout.ms"))
    }
    if (conf.get("input.kafka.max.poll.interval.ms", null) != null) {
      kafkaParam += ("max.poll.interval.ms" -> conf.get("input.kafka.max.poll.interval.ms"))
    }
    if (conf.get("input.kafka.max.poll.records", null) != null) {
      kafkaParam += ("max.poll.records" -> conf.get("input.kafka.max.poll.records"))
    }

    try {
      /**
        * Step 1: initialise the offset store (MySQL or Redis).
        * Step 2: read the offsets from the store; two cases:
        *        1. offsets exist -> resume consumption from the stored offsets
        *        2. no offsets    -> start from the latest offsets
        */
      /*      val mysqlUsername = conf.get("offset.mysql.username")
            val mysqlPassword = conf.get("offset.mysql.password")
            val mysqlJdbcUrl = conf.get("offset.mysql.jdbc.url")
            val offsetMap = OffsetUtil.getOffsetMapFromMysql(groupId, topics, mysqlJdbcUrl, mysqlUsername, mysqlPassword)
            val kafkaStream = if (offsetMap.size > 0) {
              KafkaUtils.createDirectStream[String, String](
                scc,
                // location strategy recommended by the Spark docs: spreads partitions evenly across executors
                LocationStrategies.PreferConsistent,
                // consumer strategy recommended by the Spark docs
                ConsumerStrategies.Subscribe[String, String](topics, kafkaParam, offsetMap))
            } else {
              KafkaUtils.createDirectStream[String, String](
                scc,
                LocationStrategies.PreferConsistent,
                // consumer strategy recommended by the Spark docs
                ConsumerStrategies.Subscribe[String, String](topics, kafkaParam))
            }*/

      // Redis initialisation
      val nodes = conf.get("output.redis.uri").split(",")
      val connectionTimeOut = conf.get("output.redis.connectionTimeout").toInt
      val soTimeOut = conf.get("output.redis.soTimeout").toInt
      val maxAttempts = conf.get("output.redis.maxAttempts").toInt
      val redisPassword = conf.get("output.redis.password")
      val maxTotal = conf.get("output.redis.maxTotal").toInt
      val maxIdle = conf.get("output.redis.maxIdle").toInt
      val minIdle = conf.get("output.redis.minIdle").toInt
      RedisDBUtil.settingRedis(nodes, connectionTimeOut, soTimeOut, maxAttempts, redisPassword, maxTotal, maxIdle, minIdle)
      val jedis = RedisDBUtil.redisCluster()

      val offsetMap = OffsetUtil.getOffsetMapFromRedis(groupId, topics, jedis)
      val kafkaStream = if (offsetMap.nonEmpty) {
        KafkaUtils.createDirectStream[String, String](
          scc,
          // location strategy recommended by the Spark docs: spreads partitions evenly across executors
          LocationStrategies.PreferConsistent,
          // consumer strategy recommended by the Spark docs; resume from the stored offsets
          ConsumerStrategies.Subscribe[String, String](topics, kafkaParam, offsetMap))
      } else {
        KafkaUtils.createDirectStream[String, String](
          scc,
          LocationStrategies.PreferConsistent,
          // no stored offsets yet: fall back to auto.offset.reset (latest)
          ConsumerStrategies.Subscribe[String, String](topics, kafkaParam))
      }

      // broadcast the KafkaSink to every executor so records can be written back to Kafka
      val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
        val username = conf.get("output.kafka.username")
        val password = conf.get("output.kafka.password")
        val brokers = conf.get("output.kafka.bootstrap.servers")
        val kafkaProducerConfig = KafkaFactory.getProducerConfigProperties(brokers, username, password)

        println("kafka producer init done!")
        scc.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
      }

      // output topic configuration for the shared Kafka cluster
      val mpprTopics = conf.get("output.kafka.topics.mppr").split(",")
      // map channel id (e.g. "1000") -> output topic name
      val outputTopicMap = selectTopic(mpprTopics)

      // Consume the raw container-cloud logs from Kafka:
      // first filter out malformed log lines with a regex check, then extract the
      // relevant fields and convert each line to JSON.
      kafkaStream.foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          rdd.map(_.value()).map(rawLogStr => {
            LogProcessUtil.getLogContentFromKafka(rawLogStr)
          }).filter(originLogStr => {
            println("raw log: " + originLogStr)
            LogProcessUtil.platLogCheck(originLogStr)
          }).map(logStr => {
            LogProcessUtil.getPlatformLogJson(logStr)
            // operate on the mapped RDD and push every record to Kafka
          }).foreach(record => {
            try {
              println("json log: " + record)
              // route the record to the proper output topic based on its channel id
              val recordJson = JSON.parseObject(record)
              if (recordJson.get("chlid").equals("1000")) {
                kafkaProducer.value.send(outputTopicMap("1000"), record)
              } else if (recordJson.get("chlid").equals("1001")) {
                kafkaProducer.value.send(outputTopicMap("1001"), record)
              }
            } catch {
              case e: Throwable => println(e)
            }
          })
          // commit the offsets once per batch, on the driver, after the batch has been processed;
          // committing inside the per-record foreach would run on the executors and far too often
          saveOffsetForRedis(rdd, groupId)
        }
      })
    } catch {
      case e: Exception => print(e)
    }
    scc.start()
    scc.awaitTermination()
  }

  def saveOffsetForRedis(rdd: RDD[ConsumerRecord[String, String]], groupId: String) = {
    // Spark exposes the consumed offsets through the HasOffsetRanges trait of the Kafka RDD
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    for (o <- offsetRanges) {
      println(s"topic=${o.topic},partition=${o.partition},fromOffset=${o.fromOffset},untilOffset=${o.untilOffset}")
    }
    // offsets could also be committed back to Kafka asynchronously:
    //recordDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    // here they are persisted to an external store (MySQL/Redis) instead
    OffsetUtil.saveOffsetToRedis(groupId, offsetRanges)
  }

  // persist the Kafka offsets to MySQL
  def saveOffsetForMysql(rdd: RDD[ConsumerRecord[String, String]], groupId: String, mysqlJdbcUrl: String, mysqlUsername: String, mysqlPassword: String) = {
    // Spark exposes the consumed offsets through the HasOffsetRanges trait of the Kafka RDD
    val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
    for (o <- offsetRanges) {
      println(s"topic=${o.topic},partition=${o.partition},fromOffset=${o.fromOffset},untilOffset=${o.untilOffset}")
    }
    // offsets could also be committed back to Kafka asynchronously:
    //recordDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    // here they are persisted to an external store (MySQL/Redis) instead
    OffsetUtil.saveOffsetToMysql(groupId, offsetRanges, mysqlJdbcUrl, mysqlUsername, mysqlPassword)
  }

  // map channel id -> output topic name; the channel id is taken from the
  // sixth underscore-separated segment of the topic name
  def selectTopic(topicArray: Array[String]): Map[String, String] = {
    var resultMap = Map[String, String]()
    topicArray.foreach(topic => {
      val parts = topic.split("_")
      resultMap += (parts(5) -> topic)
    })
    resultMap
  }
}
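
selectTopic keys the output topics by the sixth underscore-separated segment of their names, so every output topic name must carry the channel id in that position. A minimal, self-contained sketch of that mapping, using hypothetical topic names (the real naming convention is not shown in this post):

object SelectTopicExample {
  def main(args: Array[String]): Unit = {
    // hypothetical output topic names carrying the channel id as the 6th segment
    val mpprTopics = Array("dc_out_mppr_prd_log_1000_v1", "dc_out_mppr_prd_log_1001_v1")
    // same logic as selectTopic above
    val outputTopicMap = mpprTopics.map(t => t.split("_")(5) -> t).toMap
    println(outputTopicMap("1000")) // dc_out_mppr_prd_log_1000_v1
    println(outputTopicMap("1001")) // dc_out_mppr_prd_log_1001_v1
  }
}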
package common

import java.sql.{DriverManager, ResultSet}

import org.apache.kafka.common.TopicPartition
import org.apache.spark.streaming.kafka010.OffsetRange
import redis.clients.jedis.JedisCluster

import scala.collection.JavaConversions._
import scala.collection.mutable

/**
  * Offset utility class
  * @author IT803300
  * @date 2020-10-15
  */
object OffsetUtil {

  /**
    * Read offsets from MySQL
    *
    * @param groupId consumer group id
    * @param topics  topics to look up
    * @return map of TopicPartition -> offset
    */
  def getOffsetMapFromMysql(groupId: String, topics: Array[String], mysqlJdbcUrl: String, mysqlUsername: String, mysqlPassword: String) = {
    // connect to the database
    val connection = DriverManager.getConnection(mysqlJdbcUrl, mysqlUsername, mysqlPassword)
    val pstmt = connection.prepareStatement("select `topic`,`partition`,`offset` from dataServer_offset where `groupId`=? and `topic`=?")
    // result holder
    val offsetMap = new mutable.HashMap[TopicPartition, Long]()
    var result: ResultSet = null
    topics.foreach(topic => {
      // bind the query parameters
      pstmt.setString(1, groupId)
      pstmt.setString(2, topic)
      result = pstmt.executeQuery()
      while (result.next()) {
        offsetMap += new TopicPartition(result.getString("topic"), result.getInt("partition")) -> result.getLong("offset")
      }
    })
    // release resources
    if (result != null) result.close()
    pstmt.close()
    connection.close()
    offsetMap
  }

  /**
    * Persist offsets to MySQL
    *
    * @param groupid     consumer group id
    * @param offsetRange offset ranges of the processed batch
    */
  def saveOffsetToMysql(groupid: String, offsetRange: Array[OffsetRange], mysqlJdbcUrl: String, mysqlUsername: String, mysqlPassword: String) = {
    // connect to the database
    val connection = DriverManager.getConnection(mysqlJdbcUrl, mysqlUsername, mysqlPassword)
    // "replace into" performs an upsert keyed on the table's primary/unique key
    val pstmt = connection.prepareStatement("replace into dataServer_offset(`topic`, `partition`, `groupId`, `offset`) values(?,?,?,?)")
    // bind and execute one upsert per offset range
    offsetRange.foreach(o => {
      pstmt.setString(1, o.topic)
      pstmt.setInt(2, o.partition)
      pstmt.setString(3, groupid)
      pstmt.setLong(4, o.untilOffset)
      pstmt.executeUpdate()
    })
    // release resources
    pstmt.close()
    connection.close()
  }


  // read offsets from Redis: one hash per topic, field "group|partition", value = offset
  def getOffsetMapFromRedis(groupId: String, topics: Array[String], jedis: JedisCluster) = {
    // result holder
    val offsetMap = new mutable.HashMap[TopicPartition, Long]()
    topics.foreach(topic => {
      // the partitions are not known in advance, so read the whole hash for the topic
      val resultMap = jedis.hgetAll(topic)
      for ((groupAndPartition: String, offsetStr: String) <- resultMap) {
        val gps = groupAndPartition.split("\\|")
        val redisGroup = gps(0)
        if (redisGroup == groupId) {
          val partition = gps(1).toInt
          val offset = offsetStr.toLong
          offsetMap += new TopicPartition(topic, partition) -> offset
        }
      }
    })
    // do not close the JedisCluster here: the same shared connection is reused by
    // saveOffsetToRedis for the lifetime of the streaming application
    offsetMap
  }

  // persist offsets to Redis: hset(topic, "group|partition", untilOffset)
  def saveOffsetToRedis(groupid: String, offsetRanges: Array[OffsetRange]) = {
    val jedis = RedisDBUtil.cluster
    for (offsetRange <- offsetRanges) {
      val topic = offsetRange.topic
      val partition = offsetRange.partition
      val offset = offsetRange.untilOffset
      val groupAndPartition = s"${groupid}|${partition}"
      jedis.hset(topic, groupAndPartition, offset.toString)
    }
    // the shared JedisCluster is left open so it can be reused across batches
  }
}
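
The replace-into upsert in saveOffsetToMysql only overwrites an existing row if the dataServer_offset table carries a primary or unique key covering (topic, partition, groupId). The post does not show the table definition, so the sketch below is only an assumption about a workable schema, created once via plain JDBC:

object OffsetTableInit {
  import java.sql.DriverManager

  // One-time creation of the offset table. Column types and the key layout are
  // assumptions inferred from the queries in OffsetUtil, not taken from the original post.
  def createOffsetTable(jdbcUrl: String, user: String, password: String): Unit = {
    val connection = DriverManager.getConnection(jdbcUrl, user, password)
    try {
      val stmt = connection.createStatement()
      stmt.execute(
        """create table if not exists dataServer_offset (
          |  `topic`     varchar(200) not null,
          |  `partition` int          not null,
          |  `groupId`   varchar(200) not null,
          |  `offset`    bigint       not null,
          |  primary key (`topic`, `partition`, `groupId`)
          |)""".stripMargin)
      stmt.close()
    } finally {
      connection.close()
    }
  }
}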
