Preface:
Understanding which blocks of code run on the driver and which run on the executors is key to optimizing Spark Streaming jobs: the body of foreachRDD executes on the driver once per batch, while the functions passed to foreachPartition and foreach execute on the executors.
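The classic pitfall this division causes: anything referenced inside rdd.foreach or rdd.foreachPartition is serialized and shipped to the executors, so a JDBC connection opened in driver-side code cannot be used there. A minimal sketch of the mistake (dstream stands for any Kafka DStream; the connection stands for any non-serializable resource):

dstream.foreachRDD { rdd =>
  val conn = MySQLUtils.getConn() //driver side: conn lives in the driver JVM
  rdd.foreach { record =>
    //executor side: the closure captures conn, which is not serializable,
    //so the job fails with "Task not serializable"
  }
  conn.close()
}

The full example below avoids this by opening one connection per partition inside foreachPartition.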
import com.cmsz.utils.MySQLUtils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
object DemoTest {
  def main(args: Array[String]): Unit = {
    //for local testing add .setMaster("local[*]"); remove it when submitting to a cluster
    val sparkConf = new SparkConf().setMaster("local[*]").setAppName("RiskControl2_RealTimeEngine")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    val topicSet = "in_topic".trim.split(",", -1).toSet
    val groupID = "test" //consumer group id; in a real job this would typically come from args
    val kafkaParams = Map[String, String](
      "bootstrap.servers" -> "hadoop122:9092",
      "group.id" -> groupID,
      "key.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      "value.deserializer" -> "org.apache.kafka.common.serialization.StringDeserializer",
      //offsets are committed manually via commitAsync below, so disable auto-commit
      "enable.auto.commit" -> "false"
    )
    //driver side: create the direct input DStream from Kafka
    val messages: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topicSet, kafkaParams))
    messages.foreachRDD { rdd: RDD[ConsumerRecord[String, String]] =>
      //driver side: operating on the RDD object itself; read the Kafka offset ranges this batch covers
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition { records: Iterator[ConsumerRecord[String, String]] =>
        //executor side: operating on one partition of the RDD
        //create connections (MySQL, Phoenix, Kafka, ...) once per partition, NOT per record,
        //and send the data to the external system from here
        val conn = MySQLUtils.getConn()
        try {
          records.foreach { record =>
            //process each record and write it to the database or other external system
          }
        } finally {
          conn.close() //always release the connection, even if processing throws
        }
      }
      //driver side: commit the offsets back to Kafka once this batch's output is done
      messages.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
    //without start() the job never runs; block until it is stopped
    ssc.start()
    ssc.awaitTermination()
  }
}
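MySQLUtils comes from the author's own com.cmsz.utils package and its implementation is not shown; a minimal sketch of what getConn might look like, assuming plain JDBC (the URL, user, and password are placeholder assumptions, not values from the original):

import java.sql.{Connection, DriverManager}

object MySQLUtils {
  //hypothetical connection settings; adjust for your environment
  private val url = "jdbc:mysql://hadoop122:3306/test"
  private val user = "root"
  private val password = "123456"

  def getConn(): Connection = DriverManager.getConnection(url, user, password)
}

In production, a per-executor connection pool (HikariCP, Druid, etc.) is preferable to opening a raw connection for every partition.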
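The per-record body is intentionally left empty above. One plausible shape for it, assuming a hypothetical events(msg) table (the table and column names are illustrative only, and conn and records refer to the variables in the partition block):

val stmt = conn.prepareStatement("INSERT INTO events(msg) VALUES (?)")
records.foreach { record =>
  stmt.setString(1, record.value())
  stmt.addBatch() //accumulate instead of one network round trip per record
}
stmt.executeBatch()
stmt.close()

Batching with addBatch/executeBatch keeps the per-record loop cheap.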
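One last point about the commit placement: commitAsync runs after the partition writes, so the job is at-least-once. If the application dies between writing a batch and committing its offsets, those records are replayed on restart, so the external writes should be idempotent (for example, upserts keyed on a unique record id).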