一、Business Scenario
How do we guarantee data quality when a Kafka consumer task hits an exception? In my past experience, to achieve exactly-once processing, a MySQL table is used to record the partition and offset the program was handling when it failed. When the task restarts, it first checks whether the MySQL table holds such a failure record; if it does, it reads the offset of the corresponding partition from the table and resets the consumer's position accordingly. The example below uses a Kafka consumer to read data from Kafka, runs it through ETL, and writes the results to HBase.
二、Code in Practice
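The consumer below keeps its recovery state in a MySQL table called shop.kafka_topic_info. The table definition is not shown here, so the snippet below is only a sketch reconstructed from the columns the code reads and writes (type, topics, partitions, offsets, create_date, update_date); the column types and the unique key are assumptions, written as a DDL string in the same style the code uses for its embedded SQL.

// Hypothetical schema of the offset-tracking table, reconstructed from the queries below.
object KafkaTopicInfoSchema {
  // The consumer's ON DUPLICATE KEY UPDATE implies a unique key over something like
  // (type, topics, partitions); the real key definition is an assumption.
  val ddl: String =
    """
      |CREATE TABLE IF NOT EXISTS shop.kafka_topic_info (
      |  type        TINYINT      NOT NULL,
      |  topics      VARCHAR(128) NOT NULL,
      |  partitions  INT          NOT NULL,
      |  offsets     BIGINT       NOT NULL,
      |  create_date DATETIME     NOT NULL,
      |  update_date DATETIME     NOT NULL,
      |  UNIQUE KEY uk_type_topics_partitions (type, topics, partitions)
      |)
    """.stripMargin
}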
package com.qsq.report.consumer

import java.sql.ResultSet
import java.text.SimpleDateFormat
import java.util
import java.util.{Date, Properties}
import com.qsq.config.LoadConfig
import com.qsq.utils.hbase.HbaseClientObj
import com.qsq.utils.jdbc.C3p0Pools
import com.qsq.utils.JsonUtils
import com.qsq.utils.constant.Constants
import kafka.common.{OffsetAndMetadata, TopicAndPartition}
import kafka.consumer._
import scala.collection.mutable.ArrayBuffer
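// Note: kafka.consumer._ and kafka.common._ are the old Scala "high-level" consumer API,
// which tracks offsets through ZooKeeper; it has since been deprecated and removed
// (around Kafka 2.0), so this code assumes an older kafka client dependency (0.8/0.9/0.10 era).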
object MyConsumer {

  def main(args: Array[String]): Unit = {
    val HBASE_A_RT_CREDIT = "bee:a_user"
    val dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")

    // Kafka parameters for the old high-level (ZooKeeper-based) consumer
    val props = new Properties()
    val zk = LoadConfig.getProperties(Constants.START_ENV_REALTIME, "kafka.zookeeper.quorum")
    props.put("zookeeper.connect", zk)
    props.put("group.id", "call_group")
    props.put("auto.offset.reset", "largest")
    props.put("fetch.message.max.bytes", "50000000")
    props.put("replica.fetch.max.bytes", "50000000")
    val config = new ConsumerConfig(props)
    // create the consumer connector
    val consumer = kafka.consumer.Consumer.createJavaConsumerConnector(config)
    val hashMap = new util.HashMap[TopicAndPartition, OffsetAndMetadata]()
    val conn = C3p0Pools.getConnection()

    // look up recovery records in the MySQL table
    val res: ResultSet = C3p0Pools.query(
      conn,
      """ SELECT * FROM shop.kafka_topic_info WHERE topics = ? AND type = 1 """,
      Array("u-rall"))

    // rows here mean a previous run failed and left its positions behind
    while (res.next()) {
      println("restore topic    : " + res.getString("topics"))
      println("restore partition: " + res.getInt("partitions"))
      println("restore offset   : " + res.getLong("offsets"))
      hashMap.put(
        TopicAndPartition(res.getString("topics"), res.getInt("partitions")),
        OffsetAndMetadata(res.getLong("offsets")))
    }
    conn.close()

    if (!hashMap.isEmpty) {
      println("restoring offsets ---------------------- ")
      consumer.commitOffsets(hashMap, true)
    }
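    // Committing the restored offsets here, before any message streams are created, makes the
    // high-level consumer resume fetching from the recorded positions instead of falling back
    // to auto.offset.reset. Because the table stores the offset of the last successfully
    // processed message, that one record may be delivered again after a restart; the HBase put
    // below is keyed by id, so re-writing it is harmless.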
    registerShutdownHook()

    // consume with 3 threads
    run(3)
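    // run() asks the connector for numThread KafkaStreams of topic "u-rall" and consumes each
    // stream on its own thread: parse the JSON message, write it to HBase, then upsert the
    // message's partition/offset into MySQL so a later restart can resume from there.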
    def run(numThread: Int) = {
      println("run----------------------")
      val topicMap = new util.HashMap[String, Integer]()
      topicMap.put("u-rall", numThread)
      val decoder = new kafka.serializer.StringDecoder(null)
      val topicStreams = consumer.createMessageStreams(topicMap, decoder, decoder)
      val consumerStreams = topicStreams.values().iterator()
      while (consumerStreams.hasNext) {
        val streams: util.List[KafkaStream[String, String]] = consumerStreams.next()
        (0 until streams.size()).foreach(i => {
          val stream = streams.get(i).iterator
          new Thread(new Runnable {
            override def run(): Unit = {
              while (stream.hasNext()) {
                val mam = stream.next
                val message: String = mam.message()
                try {
                  if (message.nonEmpty) {
                    val jsonMsgObj = JsonUtils.getObjectFromJson(message)
                    val id = jsonMsgObj.getOrDefault("id", "").toString
                    val identity = jsonMsgObj.getOrDefault("identity", "").toString

                    // build the HBase columns and write the raw message, keyed by id
                    val dataMsg = ArrayBuffer[(String, AnyRef)]()
                    dataMsg += (("id", id))
                    dataMsg += (("identity", identity))
                    dataMsg += (("data", message))
                    dataMsg += (("create_time", dateFormat.format(new Date())))
                    HbaseClientObj.getInstance().init(HBASE_A_RT_CREDIT)
                    HbaseClientObj.getInstance().put(id, "cf", dataMsg)

                    // record where this message came from
                    val partition: Int = mam.partition
                    println("partition = " + partition + " time: " + dateFormat.format(new Date()))
                    val offset: Long = mam.offset
                    println("offset = " + offset + " time: " + dateFormat.format(new Date()))
                    val topic: String = mam.topic
                    println("topic = " + topic + " time: " + dateFormat.format(new Date()))

                    try {
                      // upsert the latest topic / partition / offset into MySQL
                      C3p0Pools.execute(
                        """
                          |INSERT INTO shop.kafka_topic_info
                          |( type, topics, partitions, offsets, create_date, update_date )
                          |VALUES
                          |( '1', ?, ?, ?, NOW(), NOW() )
                          |ON DUPLICATE KEY UPDATE partitions = VALUES(partitions), offsets = VALUES(offsets), update_date = NOW()
                        """.stripMargin, Array(topic, partition, offset))
                    } catch {
                      case e: Exception =>
                        println(s"failed to save offset to MySQL: ${e}")
                    }
                  }
                } catch {
                  case e: Exception =>
                    e.printStackTrace()
                    println(s"failed to consume message: ${e}")
                }
              }
            }
          }).start()
        })
      }
    }
    def release(): Unit = {
      try {
        println("release consumer...")
        consumer.shutdown()
      } catch {
        case e: Exception => println(s"failed to release consumer: ${e}")
      }
    }

    def registerShutdownHook(): Unit = {
      Runtime.getRuntime.addShutdownHook(new Thread() {
        override def run(): Unit = {
          release()
        }
      })
    }

    Thread.sleep(10000)
  }
}
三、Summary
The advantage of using the Kafka consumer directly is that it is fairly lightweight: when the data volume is under control, it uses few resources. Recording the offsets of failed runs in MySQL does add some extra overhead, but it makes the data more reliable, and it also lets you start consuming from any offset you specify, which is convenient and flexible.
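Because the consumer commits whatever positions it finds in shop.kafka_topic_info on startup, "start from an arbitrary offset" boils down to seeding that table by hand before launching the job. The sketch below only illustrates that idea: it reuses the C3p0Pools helper and the assumed column layout from section 二, and the partition and offset values are example placeholders.

import com.qsq.utils.jdbc.C3p0Pools

// Seed the recovery table with a hand-picked position, then start MyConsumer:
// on startup it commits this offset and consumption resumes from there.
object SeedOffset {
  def main(args: Array[String]): Unit = {
    val topic = "u-rall"
    val partition = 0      // partition to reposition (example value)
    val offset = 123456L   // desired starting offset (example value)
    C3p0Pools.execute(
      """
        |INSERT INTO shop.kafka_topic_info
        |( type, topics, partitions, offsets, create_date, update_date )
        |VALUES
        |( '1', ?, ?, ?, NOW(), NOW() )
        |ON DUPLICATE KEY UPDATE offsets = VALUES(offsets), update_date = NOW()
      """.stripMargin, Array(topic, partition, offset))
  }
}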