1、分类:
消息传递语义有:
至少一次语义(at-least-once)、
最多一次语义(at-most-once)、
精确一次语义(exactly-once)。
其中at-least-once和at-most-once如下图:
2、详解
exactly-once:
1. 幂等写入(idempotent writes)
需要设置好唯一主键等,比如用redis、mysql
再比如在主键不容易获取的场景下,可以每次往同一个目录覆盖写数据
注:在软件开发领域,幂等写入即为同样的请求被执行一次与连续执行多次的效果是一样的,服务器的状态也是一样的,实际上就是接口的可重复调用(包括时间和空间上两个维度)。
/**
 * Exactly-once semantics through idempotent writes.
 *
 * Every batch is first written to MySQL and only afterwards are its offsets
 * saved. If the job dies after the MySQL write but before the offsets are
 * saved, the replay after restart does not break exactly-once: the rows are
 * upserted on the `orderid` primary key, so writing them again leaves the
 * table in the same state.
 */
object KafkaOffsetIdempotent {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
    val processingInterval = 2
    val brokers = "node01:9092,node02:9092,node03:9092"
    val topic = "mytopic1"
    // Create direct kafka stream with brokers and topics
    val topicsSet = topic.split(",").toSet
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "smallest")
    /*
    1. Create the MySQL test database
       create database mytest;
    2. Create the table
       create table myorders(name varchar(100), orderid varchar(100) primary key);
    3. Create the topic: mytopic1
       kafka-topics.sh --zookeeper node01:2181 --create --topic mytopic1 --partitions 3 --replication-factor 1
    4. Send data to mytopic1 in the format "name,orderid", e.g. abc,3
    */
    val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))
    val groupName = "group1"
    // NOTE(review): MyKafkaUtils is defined elsewhere; presumably the stream
    // resumes from the offsets stored for `groupName` -- confirm.
    val messages = MyKafkaUtils.createMyDirectKafkaStream(
      ssc, kafkaParams, topicsSet, groupName)
    val jdbcUrl = "jdbc:mysql://node03:3306/mytest"
    val jdbcUser = "root"
    val jdbcPassword = "root"
    // orderid is the primary key: if a row with the same orderid already
    // exists its name is updated, otherwise a new row is inserted. That is
    // what makes the write idempotent. Values are bound as parameters rather
    // than spliced into the SQL text, so quotes in a message cannot break or
    // inject into the statement.
    val upsertSql =
      "insert into myorders(name, orderid) values (?, ?) ON DUPLICATE KEY UPDATE name=?"

    messages.foreachRDD(rdd => {
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.map(record => record._2).foreachPartition(partition => {
        val conn = DriverManager.getConnection(jdbcUrl, jdbcUser, jdbcPassword)
        try {
          // One statement per partition, reused for every record.
          val pstmt = conn.prepareStatement(upsertSql)
          try {
            partition.foreach(msg => {
              // A record without a second comma-separated field still fails
              // the task here, exactly as before.
              val fields = msg.split(",")
              val name = fields(0)
              val orderid = fields(1)
              pstmt.setString(1, name)
              pstmt.setString(2, orderid)
              pstmt.setString(3, name)
              pstmt.execute()
            })
          } finally {
            pstmt.close()
          }
        } finally {
          // Release the connection even when a record fails.
          conn.close()
        }
      })
      // Offsets are handed over only after every partition has been written.
      MyKafkaUtils.saveOffsets(offsetRanges, groupName)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
2. 事务控制
保证数据和offset在同一个事务里面,比如用mysql
这样需要事务存储的支持
/**
 * Exactly-once semantics through transaction control:
 *
 *   transaction {
 *     save the data
 *     save the offset
 *   }
 *
 * Setup:
 * 1. Create the MySQL test database
 *      create database mytest;
 * 2. Create the topic: mytopic1
 *      kafka-topics.sh --zookeeper node01:2181 --create --topic mytopic1 --partitions 3 --replication-factor 1
 * 3. Create the tables
 *      -- stores the offsets
 *      create table mytopic(topic varchar(200), partid int, offset bigint);
 *      -- stores the data
 *      create table mydata(name varchar(200), id int);
 *    Initialise the offset table:
 *      insert into mytopic(topic, partid, offset) values('mytopic1',0,0);
 *      insert into mytopic(topic, partid, offset) values('mytopic1',1,0);
 *      insert into mytopic(topic, partid, offset) values('mytopic1',2,0);
 * 4. Send data to mytopic1 in the format "name,id", e.g. abc,3
 * 5. Add the dependency to the pom file
 *      <dependency>
 *        <groupId>org.scalikejdbc</groupId>
 *        <artifactId>scalikejdbc_2.10</artifactId>
 *        <version>2.2.1</version>
 *      </dependency>
 */
object KafkaOffsetTransanction {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test").setMaster("local[2]")
    val processingInterval = 2
    val brokers = "node01:9092,node02:9092,node03:9092"
    val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers, "auto.offset.reset" -> "smallest")
    val ssc = new StreamingContext(sparkConf, Seconds(processingInterval))
    val driver = "com.mysql.jdbc.Driver"
    val jdbcUrl = "jdbc:mysql://node03:3306/mytest"
    val jdbcUser = "root"
    val jdbcPassword = "root"
    // Load the JDBC driver
    Class.forName(driver)
    // Set up the connection pool.
    // NOTE(review): this runs in the driver JVM only. With master local[2] the
    // tasks share that JVM; on a real cluster the executors would presumably
    // need their own pool initialisation -- confirm before deploying.
    ConnectionPool.singleton(jdbcUrl, jdbcUser, jdbcPassword)
    // Read the starting offsets from MySQL through scalikejdbc. The topics and
    // partitions to consume are exactly the rows of the mytopic table.
    val fromOffsets = DB.readOnly { implicit session =>
      sql"select topic, partid, offset from mytopic".
        map { r =>
          TopicAndPartition(r.string(1), r.int(2)) -> r.long(3)
        }.list.apply().toMap
    }
    val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.topic, mmd.message())
    val messages = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, fromOffsets, messageHandler)
    messages.foreachRDD(rdd => {
      // Offsets of all partitions of this RDD. Read once on the driver so the
      // per-partition closure captures only this array, not the RDD itself.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition(partition => {
        // Offset range of the partition this task is processing
        val pOffsetRange = offsetRanges(TaskContext.get.partitionId)
        // localTx -- scalikejdbc's transaction block; an exception rolls back
        // both the inserted data and the offset update.
        DB.localTx { implicit session =>
          // Store the data in the mydata table
          // (scalikejdbc's batch insert could be used here instead)
          partition.foreach(msg => {
            val fields = msg._2.split(",")
            val name = fields(0)
            val id = fields(1)
            sql"""insert into mydata(name,id) values (${name},${id})""".execute().apply()
          })
          // Advance the offset in the mytopic table, but only if the stored
          // offset is still where this batch started. If the batch was already
          // committed (e.g. a re-executed task), no row matches and the
          // transaction is rolled back instead of inserting the data twice.
          val updatedRows =
            sql"""update mytopic set offset = ${pOffsetRange.untilOffset} where topic =
            ${pOffsetRange.topic} and partid = ${pOffsetRange.partition} and offset = ${pOffsetRange.fromOffset}""".update.apply()
          // An empty range writes nothing and leaves the offset unchanged, so
          // a zero update count is harmless there.
          if (updatedRows != 1 && pOffsetRange.fromOffset != pOffsetRange.untilOffset) {
            throw new IllegalStateException(
              s"Offset mismatch for ${pOffsetRange.topic}/${pOffsetRange.partition}: " +
                s"expected stored offset ${pOffsetRange.fromOffset}, updated $updatedRows row(s)")
          }
        }
      })
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
3. 自己实现Exactly-once
将offset和数据绑定在一起保存(例如写入同一条记录或同一个文件)等