Spark Streaming's receiver-less (No Receivers) createDirectStream method does not use receivers; instead it creates an input stream that pulls messages directly from the Kafka cluster nodes. The input stream guarantees that each message pulled from Kafka is transformed exactly once, which keeps the semantics consistent. However, when a job fails or is restarted, relying on Spark Streaming's built-in mechanisms alone to resume from the correct consumption position (i.e. exactly-once semantics) is not ideal; in production the Kafka consumption position is usually maintained by managing offsets manually. This article explains how to manage Kafka offsets by hand, and I hope you find it helpful. It covers the following topics:
- How to manage Kafka offsets with MySQL
- How to manage Kafka offsets with Redis
How to manage Kafka offsets with MySQL
In a Spark Streaming application we can write code to manage Kafka offsets ourselves. The offsets can be obtained from the RDDs produced in each micro-batch, as follows:
KafkaUtils.createDirectStream(...).foreachRDD { rdd =>
  // obtain this batch's offset ranges from the RDD
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
  ...
}
Once the offsets have been obtained, they can be saved to external storage (MySQL, Redis, ZooKeeper, HBase, and so on).
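Each OffsetRange in the array above exposes the topic, partition, starting offset (fromOffset) and ending offset (untilOffset) of the batch, which are exactly the values worth persisting. A minimal sketch, assuming a hypothetical saveOffset(groupid, topic, partition, offset) helper backed by one of the stores below:
rdd.asInstanceOf[HasOffsetRanges].offsetRanges.foreach { range =>
  // range.untilOffset is the next offset to read for this partition.
  // saveOffset is a hypothetical helper; a MySQL-backed sketch of it follows
  // the table definition below.
  saveOffset("group_test", range.topic, range.partition, range.untilOffset)
}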
Example code
- MySQL table used to store offsets
CREATE TABLE `topic_par_group_offset` (
  `topic` varchar(255) NOT NULL,
  `partition` int(11) NOT NULL,
  `groupid` varchar(255) NOT NULL,
  `offset` bigint(20) DEFAULT NULL,
  PRIMARY KEY (`topic`,`partition`,`groupid`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
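With this schema, writing an offset is easiest as an idempotent upsert keyed on (topic, partition, groupid). Below is a minimal Scala sketch that uses the JDBCConnPool utility defined later in this article; the saveOffset name and signature are illustrative, not taken verbatim from the original code:
def saveOffset(groupid: String, topic: String, partition: Int, offset: Long): Unit = {
  val conn = JDBCConnPool.getConnection()
  // insert the row, or update the offset if (topic, partition, groupid) already exists
  val sql = "insert into topic_par_group_offset(`topic`,`partition`,`groupid`,`offset`) " +
    "values(?,?,?,?) on duplicate key update `offset` = ?"
  val ppst = conn.prepareStatement(sql)
  ppst.setString(1, topic)
  ppst.setInt(2, partition)
  ppst.setString(3, groupid)
  ppst.setLong(4, offset)
  ppst.setLong(5, offset)
  ppst.executeUpdate()
  JDBCConnPool.closeConnection(ppst, conn)
}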
- Configuration constants: ConfigConstants
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds

object ConfigConstants {
  // Kafka settings
  val kafkaBrokers = "kms-2:9092,kms-3:9092,kms-4:9092"
  val groupId = "group_test"
  val kafkaTopics = "test"
  val batchInterval = Seconds(5)
  val streamingStorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2
  val kafkaKeySer = "org.apache.kafka.common.serialization.StringSerializer"
  val kafkaValueSer = "org.apache.kafka.common.serialization.StringSerializer"
  val sparkSerializer = "org.apache.spark.serializer.KryoSerializer"
  val batchSize = 16384
  val lingerMs = 1
  val bufferMemory = 33554432
  // MySQL settings
  val user = "root"
  val password = "123qwe"
  val url = "jdbc:mysql://localhost:3306/kafka_offset"
  val driver = "com.mysql.jdbc.Driver"
  // Checkpoint settings
  val checkpointDir = "file:///e:/checkpoint"
  val checkpointInterval = Seconds(10)
  // Redis settings
  val redisAddress = "192.168.10.203"
  val redisPort = 6379
  val redisAuth = "123qwe"
  val redisTimeout = 3000
}
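For reference, the Kafka constants above are what the direct-stream consumer would typically assemble into its kafkaParams map. Note that the consumer side needs deserializer classes, while kafkaKeySer/kafkaValueSer above are producer-side serializers. The map below is a sketch of that wiring, an assumption rather than code from the article:
// Sketch: consumer parameters for KafkaUtils.createDirectStream
val kafkaParams = Map[String, Object](
  "bootstrap.servers" -> ConfigConstants.kafkaBrokers,
  "key.deserializer" -> classOf[org.apache.kafka.common.serialization.StringDeserializer],
  "value.deserializer" -> classOf[org.apache.kafka.common.serialization.StringDeserializer],
  "group.id" -> ConfigConstants.groupId,
  "auto.offset.reset" -> "earliest",
  // auto-commit is disabled because offsets are stored externally
  "enable.auto.commit" -> (false: java.lang.Boolean)
)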
- JDBC connection pool utility: JDBCConnPool
import java.sql.{Connection, PreparedStatement}
import org.apache.commons.dbcp2.BasicDataSource
import org.apache.log4j.Logger

object JDBCConnPool {
  val log: Logger = Logger.getLogger(JDBCConnPool.getClass)
  var dataSource: BasicDataSource = null
  /**
   * Create the data source
   *
   * @return
   */
  def getDataSource(): BasicDataSource = {
    if (dataSource == null) {
      dataSource = new BasicDataSource()
      dataSource.setDriverClassName(ConfigConstants.driver)
      dataSource.setUrl(ConfigConstants.url)
      dataSource.setUsername(ConfigConstants.user)
      dataSource.setPassword(ConfigConstants.password)
      dataSource.setMaxTotal(50)
      dataSource.setInitialSize(3)
      dataSource.setMinIdle(3)
      dataSource.setMaxIdle(10)
      dataSource.setMaxWaitMillis(2 * 10000)
      dataSource.setRemoveAbandonedTimeout(180)
      dataSource.setRemoveAbandonedOnBorrow(true)
      dataSource.setRemoveAbandonedOnMaintenance(true)
      dataSource.setTestOnReturn(true)
      dataSource.setTestOnBorrow(true)
    }
    dataSource
  }
  /**
   * Release the data source
   */
  def closeDataSource() = {
    if (dataSource != null) {
      dataSource.close()
    }
  }
  /**
   * Get a database connection
   *
   * @return
   */
  def getConnection(): Connection = {
    var conn: Connection = null
    try {
      if (dataSource != null) {
        conn = dataSource.getConnection()
      } else {
        conn = getDataSource().getConnection()
      }
    } catch {
      case e: Exception =>
        log.error(e.getMessage(), e)
    }
    conn
  }
  /**
   * Close the statement and connection
   */
  def closeConnection(ps: PreparedStatement, conn: Connection) {
    if (ps != null) {
      try {
        ps.close()
      } catch {
        case e: Exception =>
          log.error("Failed to close PreparedStatement! " + e.getMessage(), e)
      }
    }
    if (conn != null) {
      try {
        conn.close()
      } catch {
        case e: Exception =>
          log.error("Failed to close Connection! " + e.getMessage(), e)
      }
    }
  }
}
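A quick usage sketch of the pool, assuming the offsets table created above already exists; the query and parameter values here are illustrative:
// Borrow a connection from the pool, read one partition's offset, then release resources
val conn = JDBCConnPool.getConnection()
val ppst = conn.prepareStatement(
  "select `offset` from topic_par_group_offset where groupid = ? and topic = ? and `partition` = ?")
ppst.setString(1, "group_test")
ppst.setString(2, "test")
ppst.setInt(3, 0)
val rs = ppst.executeQuery()
while (rs.next()) {
  println("offset = " + rs.getLong("offset"))
}
JDBCConnPool.closeConnection(ppst, conn)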
- Kafka producer: KafkaProducerTest
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, Producer, ProducerRecord}

object KafkaProducerTest {
  def main(args: Array[String]): Unit = {
    val props: Properties = new Properties()
    props.put("bootstrap.servers", ConfigConstants.kafkaBrokers)
    props.put("batch.size", ConfigConstants.batchSize.asInstanceOf[Integer])
    props.put("linger.ms", ConfigConstants.lingerMs.asInstanceOf[Integer])
    props.put("buffer.memory", ConfigConstants.bufferMemory.asInstanceOf[Integer])
    props.put("key.serializer", ConfigConstants.kafkaKeySer)
    props.put("value.serializer", ConfigConstants.kafkaValueSer)
    val producer: Producer[String, String] = new KafkaProducer[String, String](props)
    val startTime: Long = System.currentTimeMillis()
    for (i <- 1 to 100) {
      producer.send(new ProducerRecord[String, String](ConfigConstants.kafkaTopics, "Spark", Integer.toString(i)))
    }
    println("Elapsed time: " + (System.currentTimeMillis() - startTime))
    producer.close()
  }
}
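Keep in mind that producer.send is asynchronous, so the elapsed time printed above mostly measures how quickly records are handed to the producer's internal buffer; producer.close() then blocks until everything buffered has been flushed. If per-record delivery confirmation is wanted, a callback can be attached. The sketch below uses the standard Callback and RecordMetadata types from org.apache.kafka.clients.producer and is not part of the original producer:
// Sketch: log the partition and offset each record lands on
producer.send(new ProducerRecord[String, String](ConfigConstants.kafkaTopics, "Spark", "1"),
  new Callback {
    // invoked once the broker acknowledges (or rejects) the record
    override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = {
      if (exception != null) exception.printStackTrace()
      else println(s"partition=${metadata.partition()} offset=${metadata.offset()}")
    }
  })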
- Reading and saving offsets:
This object reads offsets from, and writes offsets to, external storage, namely MySQL and Redis.
import java.sql.ResultSet
import scala.collection.mutable
import org.apache.kafka.common.TopicPartition

object OffsetReadAndSave {
  /**
   * Get offsets from MySQL
   *
   * @param groupid
   * @param topic
   * @return
   */
  def getOffsetMap(groupid: String, topic: String): mutable.Map[TopicPartition, Long] = {
    val conn = JDBCConnPool.getConnection()
    val selectSql = "select * from topic_par_group_offset where groupid = ? and topic = ?"
    val ppst = conn.prepareStatement(selectSql)
    ppst.setString(1, groupid)
    ppst.setString(2, topic)
    val result: ResultSet = ppst.executeQuery()
    // offsets keyed by topic-partition
    val topicPartitionOffset = mutable.Map[TopicPartition, Long]()
    while (result.next()) {
      val topicPartition: TopicPartition = new TopicPartition