Big Data - Code Development (1)
Spark Streaming: consuming Kafka data while maintaining offsets manually in HBase
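The program below does four things: on startup it reads any previously saved offsets from an HBase table; it then builds a Kafka direct stream that resumes from those offsets (or falls back to auto.offset.reset when none exist); each batch's GPS records are written to an HBase data table; and finally the batch's end offsets are written back to the offset table. Because offsets are persisted only after the data itself, the pipeline delivers each record at least once.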
package com.travel.programApp
import java.util.Date
import java.util.regex.Pattern
import com.travel.common.{ConfigUtil, Constants, DateUtils}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{Cell, CellUtil, HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.{Admin, Connection, ConnectionFactory, Get, Put, Result, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, ConsumerStrategy, HasOffsetRanges, KafkaUtils, LocationStrategies, OffsetRange}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.collection.mutable
/**
 * A Spark Streaming program that consumes data from Kafka
 * and maintains its offsets manually in HBase.
 */
object StreamingKafka {
  def main(args: Array[String]): Unit = {
    val brokers = "node01:9092,node02:9092,node03:9092"
    val topics = Array("topic1", "topic2")
    val group: String = "consum_group"
    // Kafka parameters: auto-commit is disabled because offsets are tracked manually in HBase
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    // local[1] is enough here: the Kafka direct stream does not need a dedicated receiver thread
    val sparkConf: SparkConf = new SparkConf().setMaster("local[1]").setAppName("StreamingKafka")
    val sparkSession: SparkSession = SparkSession.builder().config(sparkConf).getOrCreate()
    val sparkContext: SparkContext = sparkSession.sparkContext
    sparkContext.setLogLevel("WARN")
    val streamingContext = new StreamingContext(sparkContext, Seconds(1))
    // Build the direct stream, resuming from any offsets previously saved in HBase.
    // Note: `topics` is only used to look up saved offsets; the actual subscription
    // uses the regex pattern passed here.
    val result: InputDStream[ConsumerRecord[String, String]] = getStreamingContextFromHBase(streamingContext, kafkaParams, topics, group, "(.*)gps_topic")
    result.foreachRDD(eachRdd => {
      if (!eachRdd.isEmpty()) {
        eachRdd.foreachPartition(eachPartition => {
          // One HBase connection per partition, closed when the partition is done
          val connection: Connection = getHbaseConn
          eachPartition.foreach(record => {
            saveToHBase(connection, record)
          })
          connection.close()
        })
      }
      // Persist the offsets only after the batch's data has been written, giving
      // at-least-once semantics. untilOffset is already exclusive (the next offset
      // to consume), so it is stored as-is; adding 1 would skip a record.
      val ranges: Array[OffsetRange] = eachRdd.asInstanceOf[HasOffsetRanges].offsetRanges
      for (eachRange <- ranges) {
        saveBatchOffset(group, eachRange.topic, eachRange.partition.toString, eachRange.untilOffset)
      }
    })
    streamingContext.start()
    streamingContext.awaitTermination()
  }
  def getHbaseConn: Connection = {
    try {
      val config: Configuration = HBaseConfiguration.create()
      // ZooKeeper quorum addresses
      config.set("hbase.zookeeper.quorum", "node01,node02,node03")
      config.set("hbase.zookeeper.property.clientPort", "2181")
      // HMaster address
      config.set("hbase.master", "node01:60000")
      ConnectionFactory.createConnection(config)
    } catch {
      case exception: Exception =>
        // Fail fast instead of returning null, which would only defer the error
        // to the first use of the connection
        throw new RuntimeException("Failed to create HBase connection", exception)
    }
  }
  def getStreamingContextFromHBase(streamingContext: StreamingContext, kafkaParams: Map[String, Object], topics: Array[String], group: String, matchPattern: String): InputDStream[ConsumerRecord[String, String]] = {
    val connection: Connection = getHbaseConn
    val admin: Admin = connection.getAdmin
    val savedOffsets: collection.Map[TopicPartition, Long] = getOffsetFromHBase(connection, admin, topics, group)
    // Resume from offsets saved in HBase if any exist; otherwise the start position
    // falls back to auto.offset.reset ("latest")
    val consumerStrategy: ConsumerStrategy[String, String] = if (savedOffsets.nonEmpty) {
      ConsumerStrategies.SubscribePattern[String, String](Pattern.compile(matchPattern), kafkaParams, savedOffsets)
    } else {
      ConsumerStrategies.SubscribePattern[String, String](Pattern.compile(matchPattern), kafkaParams)
    }
    val result: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(streamingContext, LocationStrategies.PreferConsistent, consumerStrategy)
    admin.close()
    connection.close()
    result
  }
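  // Offset storage layout in HBase (written by saveBatchOffset, read back here):
  //   rowkey    = <group>:<topic>              e.g. "consum_group:topic1"
  //   qualifier = <group>:<topic>:<partition>  e.g. "consum_group:topic1:0"
  //   value     = the offset to resume from, stored as a string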
  def getOffsetFromHBase(connection: Connection, admin: Admin, topics: Array[String], group: String): collection.Map[TopicPartition, Long] = {
    // Create the offset table on first use; the Admin handle is owned and closed by the caller
    if (!admin.tableExists(TableName.valueOf(Constants.HBASE_OFFSET_STORE_TABLE))) {
      val offsetTableDesc = new HTableDescriptor(TableName.valueOf(Constants.HBASE_OFFSET_STORE_TABLE))
      offsetTableDesc.addFamily(new HColumnDescriptor(Constants.HBASE_OFFSET_FAMILY_NAME))
      admin.createTable(offsetTableDesc)
    }
    val table = connection.getTable(TableName.valueOf(Constants.HBASE_OFFSET_STORE_TABLE))
    val offsets = new mutable.HashMap[TopicPartition, Long]()
    for (eachTopic <- topics) {
      val get = new Get((group + ":" + eachTopic).getBytes())
      val result: Result = table.get(get)
      for (cell <- result.rawCells()) {
        // Qualifier: group:topic:partition
        val topicPartition: String = Bytes.toString(CellUtil.cloneQualifier(cell))
        // Cell value: the saved offset
        val offsetValue: String = Bytes.toString(CellUtil.cloneValue(cell))
        // Split the qualifier to recover the topic and partition
        val parts: Array[String] = topicPartition.split(":")
        offsets.put(new TopicPartition(parts(1), parts(2).toInt), offsetValue.toLong)
      }
    }
    table.close()
    offsets
  }
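  // GPS record format in the Chengdu feed (comma-separated):
  //   driverId,orderId,timestamp,lng,lat
  // Records containing "end" mark the end of an order and are not persisted.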
  def saveToHBase(connection: Connection, eachLine: ConsumerRecord[String, String]): ConsumerRecord[String, String] = {
    val line: String = eachLine.value()
    val fields: Array[String] = line.split(",")
    // Chengdu data carries at least five comma-separated fields
    if (fields.length > 4) {
      // Skip end-of-order markers; everything else is persisted
      if (!line.contains("end")) {
        val driverId = fields(0)  // driver ID
        val orderId = fields(1)   // order ID
        val timestamp = fields(2) // timestamp in seconds
        val lng = fields(3)       // longitude
        val lat = fields(4)       // latitude
        val rowkey = orderId + "_" + timestamp
        val put = new Put(rowkey.getBytes())
        put.addColumn(Constants.DEFAULT_FAMILY.getBytes(), "CITYCODE".getBytes(), Constants.CITY_CODE_CHENG_DU.getBytes())
        put.addColumn(Constants.DEFAULT_FAMILY.getBytes(), "DRIVERID".getBytes(), driverId.getBytes())
        put.addColumn(Constants.DEFAULT_FAMILY.getBytes(), "ORDERID".getBytes(), orderId.getBytes())
        put.addColumn(Constants.DEFAULT_FAMILY.getBytes(), "TIMESTAMP".getBytes(), timestamp.getBytes())
        // The source timestamp is in seconds; appending "000" converts it to milliseconds
        put.addColumn(Constants.DEFAULT_FAMILY.getBytes(), "TIME".getBytes(), DateUtils.formateDate(new Date((timestamp + "000").toLong), "yyyy-MM-dd HH:mm:ss").getBytes())
        put.addColumn(Constants.DEFAULT_FAMILY.getBytes(), "LNG".getBytes(), lng.getBytes())
        put.addColumn(Constants.DEFAULT_FAMILY.getBytes(), "LAT".getBytes(), lat.getBytes())
        val table: Table = connection.getTable(TableName.valueOf(Constants.HTAB_GPS))
        table.put(put)
        table.close()
      }
    }
    eachLine
  }
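  // Writes one partition's end offset; called once per OffsetRange per batch.
  // A fresh connection per call keeps the function self-contained, at the cost
  // of some connection churn.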
  def saveBatchOffset(group: String, topic: String, partition: String, offset: Long): Unit = {
    val conn: Connection = getHbaseConn
    val table: Table = conn.getTable(TableName.valueOf(Constants.HBASE_OFFSET_STORE_TABLE))
    val rowkey = group + ":" + topic
    val columnName = group + ":" + topic + ":" + partition
    val put = new Put(rowkey.getBytes())
    put.addColumn(Constants.HBASE_OFFSET_FAMILY_NAME.getBytes(), columnName.getBytes(), offset.toString.getBytes())
    table.put(put)
    table.close()
    conn.close()
  }
}
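The listing depends on a com.travel.common package that is not shown above. The sketch below is only a minimal stand-in for the shape the code assumes; the actual table names, column family names, and city code are project configuration, and the values here are placeholders.

package com.travel.common

import java.text.SimpleDateFormat
import java.util.Date

object Constants {
  // Placeholder values; the real project supplies its own names
  val HBASE_OFFSET_STORE_TABLE = "hbase_offset_store"
  val HBASE_OFFSET_FAMILY_NAME = "f1"
  val DEFAULT_FAMILY = "f1"
  val CITY_CODE_CHENG_DU = "CHENG_DU"
  val HTAB_GPS = "htab_gps"
}

object ConfigUtil {
  // The real implementation presumably resolves keys from a config file;
  // an identity lookup is enough for this sketch
  def getConfig(key: String): String = key
}

object DateUtils {
  // Matches the formateDate(date, pattern) call used in saveToHBase
  def formateDate(date: Date, pattern: String): String =
    new SimpleDateFormat(pattern).format(date)
}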