1. pom.xml
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.49</version>
</dependency>
<dependency>
    <groupId>org.scalikejdbc</groupId>
    <artifactId>scalikejdbc_2.11</artifactId>
    <version>2.5.0</version>
</dependency>
<dependency>
    <groupId>org.scalikejdbc</groupId>
    <artifactId>scalikejdbc-config_2.11</artifactId>
    <version>2.5.0</version>
</dependency>
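These coordinates cover only the Kafka integration, the MySQL driver, and scalikejdbc; the code in section 2 also needs spark-core and spark-streaming on the classpath. If they are not already declared elsewhere in the project, they would look roughly like this (assuming Scala 2.11 and the same ${spark.version} property):
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>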
2. Implementation
import java.util.Properties
import com.qs.stream.utils._
import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import scalikejdbc.config.DBs
import scalikejdbc.{DB, SQL}
/**
 * @author Duncan
 * @date 2020-07-28 14:20
 * @version 1.0
 */
object Kafka2SparkStreaming {
  def main(args: Array[String]): Unit = {
    val appName = ReadProperty.getConfigData("spark.consumer.appName")
    val servers = ReadProperty.getConfigData("spark.consumer.bootstrap.servers")
    val groupId = ReadProperty.getConfigData("spark.consumer.group.id")
    val topic = ReadProperty.getConfigData("spark.consumer.topics")
    val conf = new SparkConf().setAppName(appName).setMaster(ReadProperty.getConfigData("spark.master.uri"))
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(ReadProperty.getConfigData("window.slide.time").toLong))
    val kafkaParams = Map[String, Object](
      // Addresses used to bootstrap the connection to the cluster
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> servers,
      // Deserializer classes for keys and values
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      // Identifies which consumer group this consumer belongs to
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      // Offsets are managed manually, so disable auto commit
      "enable.auto.commit" -> (false: java.lang.Boolean),
      // Max interval between polls: 120 s in milliseconds (broker default is 300 s)
      "max.poll.interval.ms" -> "120000",
      // earliest = consume from the beginning, latest = consume only new records
      ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest")
    // Broadcast a Kafka producer wrapper (KafkaSink) so records could be written back to Kafka
    // from the executors; it is not used further in this example
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.put("bootstrap.servers", ReadProperty.getConfigData("spark.consumer.bootstrap.servers"))
        p.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        p.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        p
      }
      sc.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }
    // First read the offset information from MySQL with scalikejdbc
    // ====================tb_kafka_offset====================================
    // +------------+------------------+------------+------------+-------------+
    // | topic      | groupid          | partitions | fromoffset | untiloffset |
    // +------------+------------------+------------+------------+-------------+
    // The MySQL table layout is shown above; read the "topic", "partitions" and "untiloffset"
    // columns and build fromOffsets: Map[TopicPartition, Long]
    DBs.setup()
    val fromdbOffset: Map[TopicPartition, Long] =
      DB.readOnly { implicit session =>
        SQL(s"select * from `tb_kafka_offset` where groupid = '${groupId}'")
          .map(rs => (new TopicPartition(rs.string("topic"), rs.int("partitions")), rs.long("untiloffset")))
          .list().apply()
      }.toMap
    // If the MySQL table holds no offsets, start a new subscription; otherwise resume from the stored offsets
    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = if (fromdbOffset.isEmpty) {
      println("No offsets stored in MySQL, starting a fresh subscription...")
      KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topic.split(",").toList, kafkaParams))
    } else {
      println("Resuming from the offsets stored in MySQL...")
      KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent,
        ConsumerStrategies.Assign[String, String](fromdbOffset.keys, kafkaParams, fromdbOffset))
    }
    /**
     * Note from the Spark documentation:
     * "Note that the typecast to HasOffsetRanges will only succeed if it is done in the first method called
     * on the result of createDirectStream, not later down a chain of methods. Be aware that the one-to-one
     * mapping between RDD partition and Kafka partition does not remain after any methods that shuffle or
     * repartition, e.g. reduceByKey() or window()."
     */
    // For each micro-batch: read the offset ranges, write the data to MySQL, then persist the offsets
    kafkaStream.foreachRDD(rdd => {
      val offsetRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // Print the offset information
      offsetRanges.foreach(x => {
        println(s"---${x.topic},${x.partition},${x.fromOffset},${x.untilOffset}---")
      })
      // Each Kafka record value holds space-separated entries; each entry is a comma-separated tuple
      val result: RDD[(String, Int)] = rdd.flatMap(_.value().split(" ")).map((_, 1))
      result.foreachPartition { it =>
        if (it.nonEmpty) { // Iterator.size would exhaust the iterator, so test with nonEmpty
          // Write the data to MySQL
          val conn = ConnectPoolUtil.getConnection
          // Commit manually
          conn.setAutoCommit(false)
          val stmt = conn.createStatement()
          it.foreach(words => {
            // Split each entry into its six fields: imsi, code, gid, lac, x, y
            val word: Array[String] = words._1.split(",")
            println("word============" + word(0) + word(1))
            stmt.addBatch("insert into tb_kafka_to_mysql(imsi, code, gid, lac, x, y) values('" + word(0) + "','" + word(1) + "','" + word(2) + "','" + word(3) + "','" + word(4) + "','" + word(5) + "')")
          })
          stmt.executeBatch()
          conn.commit()
          conn.close()
          println("==========================insert data end=============================")
        }
      }
      // Persist the offsets to MySQL with scalikejdbc
      DB.autoCommit { implicit session =>
        for (or <- offsetRanges) {
          println("==========================INSERT OFFSET INTO MYSQL=============================")
          SQL("replace into `tb_kafka_offset`(topic,groupid,partitions,fromoffset,untiloffset) values (?,?,?,?,?)")
            .bind(or.topic, ReadProperty.getConfigData("spark.consumer.group.id"), or.partition, or.fromOffset, or.untilOffset)
            .update().apply()
        }
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }
}
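The implementation above imports KafkaSink and ConnectPoolUtil from com.qs.stream.utils, but the original listing does not include them. Below is a minimal sketch of KafkaSink, assuming the usual broadcast-a-lazy-producer pattern; everything beyond the KafkaSink[String, String](props) call used above is an assumption.
import java.util.Properties
import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}

// Serializable wrapper: only the factory function is shipped to the executors,
// the KafkaProducer itself is created lazily on first use on each executor
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
  lazy val producer: KafkaProducer[K, V] = createProducer()

  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))

  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))
}

object KafkaSink {
  def apply[K, V](config: Properties): KafkaSink[K, V] = {
    val createProducerFunc = () => {
      val producer = new KafkaProducer[K, V](config)
      // Flush buffered records when the executor JVM shuts down
      sys.addShutdownHook {
        producer.close()
      }
      producer
    }
    new KafkaSink(createProducerFunc)
  }
}
ConnectPoolUtil is also not shown. The sketch below simply opens a plain JDBC connection per call and reuses the connection settings from section 5; a production version would wrap a real pool such as DBCP, HikariCP or Druid.
import java.sql.{Connection, DriverManager}

// Stand-in for a real connection pool: opens a plain JDBC connection per call,
// using the same MySQL settings as the scalikejdbc config in section 5
object ConnectPoolUtil {
  private val url = "jdbc:mysql://0.0.0.0:3306/datalv"
  private val user = "root"
  private val password = "root"

  Class.forName("com.mysql.jdbc.Driver")

  def getConnection: Connection = DriverManager.getConnection(url, user, password)
}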
3. Create the database table
-- Table that stores the Kafka offsets
create table tb_kafka_offset(
  topic       varchar(32),
  groupid     varchar(50),
  partitions  int,
  fromoffset  bigint,
  untiloffset bigint,
  primary key(topic, groupid, partitions)
);
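The insert statement in foreachPartition also writes to a data table, tb_kafka_to_mysql, which is not defined in the original; a plausible definition (all column types are assumptions) is:
-- Data table written by foreachPartition (column types are assumed)
create table tb_kafka_to_mysql(
  imsi varchar(32),
  code varchar(32),
  gid  varchar(32),
  lac  varchar(32),
  x    varchar(32),
  y    varchar(32)
);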
4. Configuration reader utility class
import com.typesafe.config.ConfigFactory
import org.apache.commons.lang3.StringUtils

object ReadProperty {
  // Typesafe Config loads application.conf / application.properties from the classpath
  val load = ConfigFactory.load()

  def getConfigData(key: String, defaultValue: String = ""): String = {
    // getString throws if the key is missing, so check hasPath first
    if (load.hasPath(key)) {
      val value = load.getString(key)
      if (StringUtils.isNotEmpty(value)) value else defaultValue
    } else {
      defaultValue
    }
  }
}
5. Configuration file
# JDBC settings
db.default.driver = "com.mysql.jdbc.Driver"
db.default.url="jdbc:mysql://0.0.0.0:3306/datalv"
db.default.user="root"
db.default.password="root"
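The spark.* and window.slide.time keys read by ReadProperty.getConfigData are not listed in the original; assuming they live in the same configuration file, they could look like this (all values are placeholders):
# Spark / Kafka consumer settings (keys taken from the calls in section 2, values are placeholders)
spark.master.uri="local[2]"
spark.consumer.appName="Kafka2SparkStreaming"
spark.consumer.bootstrap.servers="host1:9092,host2:9092"
spark.consumer.group.id="kafka2mysql-group"
spark.consumer.topics="topicA,topicB"
window.slide.time="5"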