In earlier posts we already covered two solutions: "SparkStreaming + Kafka exactly-once, option 1: MySQL transactions" and "SparkStreaming + Kafka exactly-once, option 2: Redis pipeline".
This is the third option, "SparkStreaming + Kafka exactly-once, option 3: HBase row-level transactions with Phoenix". A single HBase row is transactional: all cells in the row are written together, so the write either fully succeeds or fully fails.
We can therefore add an OFFSET column family to the HBase table and write the Kafka offset together with each record. That guarantees that consuming the data and recording the offset succeed or fail as a single unit.
There is another advantage: with HBase the data does not need to be collected back to the driver before it is written. The first two approaches both pull the results to the driver and then rely on a transaction or a pipeline there; with HBase the writes run directly on the executors.
So for large batches HBase is the better fit, while MySQL and Redis only suit jobs where each batch is small.
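The core trick fits in a few lines. Here is a minimal sketch of it (the STREAMING table and the DATA/OFFSET column families match the Phoenix view defined later in this post; the writeRecordWithOffset helper is an illustrative name of my own):

import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Connection, Put}
import org.apache.hadoop.hbase.util.Bytes

object AtomicWriteSketch {
  // One Put = one row = one atomic mutation: the business data and the
  // Kafka offset land in the same row, so they commit together or not at all.
  def writeRecordWithOffset(conn: Connection, rowKey: String, money: Double, offset: Long): Unit = {
    val table = conn.getTable(TableName.valueOf("STREAMING"))
    try {
      val put = new Put(Bytes.toBytes(rowKey))
      put.addColumn(Bytes.toBytes("DATA"), Bytes.toBytes("ORDER"), Bytes.toBytes(money))     // the data cell
      put.addColumn(Bytes.toBytes("OFFSET"), Bytes.toBytes("OFFSET"), Bytes.toBytes(offset)) // the offset cell
      table.put(put) // single-row writes in HBase are atomic
    } finally {
      table.close()
    }
  }
}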
Main method
package com.ws.sparkstreaming.kafkahbase

import java.util

import com.google.gson.Gson
import com.ws.sparkstreaming.utils.{HbaseUtils, OffsetUtils}
import org.apache.hadoop.hbase.TableName
import org.apache.hadoop.hbase.client.{Connection, Put, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, TaskContext}
import org.slf4j.{Logger, LoggerFactory}

object WordCountJoinKafkaHbaseManagerOffset {

  private val logger: Logger = LoggerFactory.getLogger(this.getClass.getName)

  def main(args: Array[String]): Unit = {
    val appname = "ssc-kafka-hbase"
    val group = "group1"
    val topics = Array("orders").toIterable

    val conf = new SparkConf().setAppName(appname).setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(5))
    ssc.sparkContext.setLogLevel("WARN")

    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "dream1:9092,dream2:9092,dream3:9092", // Kafka broker addresses
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer", // key/value deserializers
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "group.id" -> group, // consumer group
      "auto.offset.reset" -> "earliest", // where to start when no stored offset exists [latest, earliest]
      "enable.auto.commit" -> "false" // do not auto-commit offsets (default is true); we track them in HBase ourselves
    )

    // Restore the offsets recorded by previous batches so the job resumes exactly where it stopped
    val lastoffset = OffsetUtils.selectOffsetFromHbase(appname, group)

    val dstream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, lastoffset)
    )

    dstream.foreachRDD((kafkaRdd: RDD[ConsumerRecord[String, String]]) => {
      println("------------------------------------" + System.currentTimeMillis() + "------------------------------------")
      if (!kafkaRdd.isEmpty()) {
        // Every record is a JSON string; grab the offset ranges first, then parse the payloads
        val ranges: Array[OffsetRange] = kafkaRdd.asInstanceOf[HasOffsetRanges].offsetRanges
        val lines: RDD[String] = kafkaRdd.map(_.value())
        val ordersAndErrors: RDD[MyOrder] = lines.mapPartitions((pt: Iterator[String]) => {
          val gson = new Gson()
          pt.map(line => {
            try {
              gson.fromJson(line, classOf[MyOrder])
            } catch {
              case _: Exception =>
                logger.error(line + " -- failed to parse")
                null // emit null so the bad record is filtered out (re-using a shared var here would re-emit the previous order)
            }
          })
        })
        // Drop the records that failed to parse
        val orders: RDD[MyOrder] = ordersAndErrors.filter(_ != null)

        orders.foreachPartition(pt1 => {
          if (pt1.nonEmpty) {
            // ****************** Look up this partition's OffsetRange by partition id, on the executor side ******************
            val range: OffsetRange = ranges(TaskContext.get.partitionId())
            var conn: Connection = null
            var table: Table = null
            try {
              // Get an HBase connection
              conn = HbaseUtils.getConnection()
              table = conn.getTable(TableName.valueOf("STREAMING"))
              val puts: util.ArrayList[Put] = new util.ArrayList[Put]()
              pt1.foreach(order => {
                val put = new Put(Bytes.toBytes(order.gid))
                put.addColumn(Bytes.toBytes("DATA"), Bytes.toBytes("ORDER"), Bytes.toBytes(order.money))
                // On the partition's last record, attach the offset to the same Put as the data.
                // A single HBase row is atomic (all cells succeed or all fail), which is what
                // guarantees exactly-once here. See the Phoenix view DDL below.
                if (!pt1.hasNext) {
                  put.addColumn(Bytes.toBytes("OFFSET"), Bytes.toBytes("APP_GID"), Bytes.toBytes(appname + "_" + group))
                  put.addColumn(Bytes.toBytes("OFFSET"), Bytes.toBytes("TOPIC_PARTITION"), Bytes.toBytes(range.topic + "_" + range.partition))
                  put.addColumn(Bytes.toBytes("OFFSET"), Bytes.toBytes("OFFSET"), Bytes.toBytes(range.untilOffset))
                }
                puts.add(put)
                // Flush in small batches to keep memory bounded
                if (puts.size() % 10 == 0) {
                  table.put(puts)
                  puts.clear()
                }
              })
              // Flush whatever is left, including the Put carrying the offset
              table.put(puts)
            } catch {
              case e: Exception =>
                e.printStackTrace()
            } finally {
              if (table != null) {
                table.close()
              }
              if (conn != null) {
                conn.close()
              }
            }
          }
        })
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
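The listing references a MyOrder class that is not shown in the post; here is a minimal reconstruction matching the JSON payloads below (Gson populates the fields by reflection, so the field names must match the JSON keys):

package com.ws.sparkstreaming.kafkahbase

// Hypothetical reconstruction of the order bean: "gid" becomes the HBase
// row key, "money" is stored in the DATA.ORDER column as a double.
case class MyOrder(gid: String, money: Double)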
Fetching the historical offsets

// OFFSET appears to be a reserved word in Phoenix, so it must be quoted
package com.ws.sparkstreaming.utils

import java.sql.DriverManager

import org.apache.kafka.common.TopicPartition

import scala.collection.mutable

object OffsetUtils {

  def selectOffsetFromHbase(appname: String, group: String): Map[TopicPartition, Long] = {
    val offsets = new mutable.HashMap[TopicPartition, Long]()
    val conn = DriverManager.getConnection("jdbc:phoenix:dream1,dream2,dream3:2181")
    try {
      // "OFFSET" has to be quoted, otherwise Phoenix fails to parse the query
      val ps = conn.prepareStatement("select TOPIC_PARTITION,max(\"OFFSET\") as \"OFFSET\" from STREAMING where APP_GID = ? group by TOPIC_PARTITION")
      ps.setString(1, appname + "_" + group)
      val resultSet = ps.executeQuery()
      while (resultSet.next()) {
        val topicPartition = resultSet.getString("TOPIC_PARTITION").split("_")
        // the column is UNSIGNED_LONG, so read it as a long
        val offset = resultSet.getLong("OFFSET")
        offsets(new TopicPartition(topicPartition(0), topicPartition(1).toInt)) = offset
      }
    } finally {
      conn.close()
    }
    offsets.toMap
  }
}
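Note that on the very first run the view is empty and the map comes back empty; KafkaUtils then falls back to auto.offset.reset ("earliest" in the params above). A small hypothetical harness to check what would be restored before starting the job:

package com.ws.sparkstreaming.utils

// Hypothetical sanity check: print the offsets that would be restored
// for a given app/group, the same way the streaming job does on startup.
object OffsetCheck {
  def main(args: Array[String]): Unit = {
    val offsets = OffsetUtils.selectOffsetFromHbase("ssc-kafka-hbase", "group1")
    if (offsets.isEmpty) {
      println("no stored offsets yet, consumer will use auto.offset.reset")
    } else {
      offsets.foreach { case (tp, off) => println(s"$tp -> $off") }
    }
  }
}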
HBase connection

// HBase does not ship a connection pool
package com.ws.sparkstreaming.utils

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}

object HbaseUtils {

  def getConnection(): Connection = synchronized {
    val conf: Configuration = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "dream1:2181,dream2:2181,dream3:2181")
    ConnectionFactory.createConnection(conf)
  }
}
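Because Connection is heavyweight and thread-safe, a variation worth considering (my sketch, not part of the original code) is caching one connection per executor JVM instead of opening a new one in every partition; tasks then only open and close the lightweight Table:

package com.ws.sparkstreaming.utils

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory}

// Sketch: one shared connection per executor JVM. Scala's lazy val
// initialization is thread-safe, so concurrent tasks share one instance.
object HbaseConnectionCache {
  lazy val connection: Connection = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "dream1:2181,dream2:2181,dream3:2181")
    ConnectionFactory.createConnection(conf)
  }
}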
HBase table (Phoenix view)
create VIEW "STREAMING"(PK VARCHAR PRIMARY KEY,"DATA"."ORDER" UNSIGNED_DOUBLE,"OFFSET"."APP_GID" VARCHAR,"OFFSET"."TOPIC_PARTITION" VARCHAR,"OFFSET"."OFFSET" UNSIGNED_LONG);
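A Phoenix VIEW maps onto an HBase table that already exists, so the underlying table with the DATA and OFFSET column families has to be created first, e.g. from the HBase shell (assuming default table settings):

create 'STREAMING', 'DATA', 'OFFSET'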
0: jdbc:phoenix:> select * from streaming;
+------+--------+-------------------------+------------------+---------+
| PK | ORDER | APP_GID | TOPIC_PARTITION | OFFSET |
+------+--------+-------------------------+------------------+---------+
| g11 | 666.3 | ssc-kafka-hbase_group1 | orders_2 | 1 |
| g12 | 335.3 | ssc-kafka-hbase_group1 | orders_1 | 1 |
| g13 | 615.3 | ssc-kafka-hbase_group1 | orders_0 | 1 |
| g14 | 211.3 | ssc-kafka-hbase_group1 | orders_2 | 2 |
| g15 | 366.3 | ssc-kafka-hbase_group1 | orders_1 | 2 |
| g16 | 566.3 | ssc-kafka-hbase_group1 | orders_0 | 2 |
| g17 | 66.3 | ssc-kafka-hbase_group1 | orders_2 | 4 |
| g18 | 166.3 | ssc-kafka-hbase_group1 | orders_1 | 3 |
| g19 | 16.3 | ssc-kafka-hbase_group1 | orders_0 | 3 |
+------+--------+-------------------------+------------------+---------+
9 rows selected (0.151 seconds)
0: jdbc:phoenix:>
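To see what the job would restore after a restart, the same aggregation that OffsetUtils runs can be issued from sqlline; for the sample rows above it should report orders_0 -> 3, orders_1 -> 3 and orders_2 -> 4:

select TOPIC_PARTITION,max("OFFSET") as "OFFSET" from STREAMING where APP_GID = 'ssc-kafka-hbase_group1' group by TOPIC_PARTITION;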
Kafka producer
[root@dream1 ~]# kafka-console-producer.sh --broker-list dream1:9092,dream2:9092,dream3:9092 --topic orders
>{"gid": "g01","money": 19.99}
>{"gid": "g02","money": 29.99}
>{"gid": "g03","money": 39.99}
>
Test data
{"gid": "g01","money": 19.99}
{"gid": "g02","money": 29.99}
{"gid": "g03","money": 39.99}
{"gid": "g04","money": 39.99}
{"gid": "g05","money": 49.99}
{"gid": "g06","money": 59.99}
{"gid": "g07","money": 89.99}