The holiday is almost here, so I'll skip the small talk and go straight to the code.

The main job class:
package com.action

import com.conf.{ConfigManager, ConstantsInterface}
import com.until.LocalKafkaUntils
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * @Author: king
  * @Datetime: 2018/12/27
  * @Desc: Consume Kafka messages with Spark Streaming and write them to MongoDB.
  */
object Kafka2SparkStreaming2MongoDB {
  def main(args: Array[String]): Unit = {
    // create the SparkSession (the Spark SQL entry point)
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Kafka2SparkStream2Mongo")
      .getOrCreate()
    // create the StreamingContext with a 1-second batch interval
    val ssc = new StreamingContext(spark.sparkContext, Seconds(1))
    // Kafka configuration
    val kafkaParams = LocalKafkaUntils.getKafkaParams(
      ConstantsInterface.KAFKA_BOOTSTRAP_LIST, "spark_to_mongo")
    // receive the Kafka data (getSteam reads the topic list from the config itself)
    val stream: InputDStream[ConsumerRecord[String, String]] =
      LocalKafkaUntils.getSteam(ssc, kafkaParams)
    // schema of the documents written to MongoDB
    val schemaString = "field1 field2 field3"
    val fields = schemaString.split(" ").map(fieldName =>
      StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)
    // MongoDB configuration
    val url = ConfigManager.getProperty("mongodb.uri")
    val dbName = ConfigManager.getProperty("mongodb.dbname")
    val mongoDbOptions: Map[String, String] = Map(
      "spark.mongodb.output.uri" -> url.concat(dbName),
      "spark.mongodb.output.replaceDocument" -> "false"
    )
    // parse each micro-batch as JSON and append it to MongoDB
    stream.map(_.value()).foreachRDD { rdd =>
      val mongoDF = spark.read.schema(schema).json(rdd)
      mongoDF.write
        .format("com.mongodb.spark.sql.DefaultSource")
        .mode("append")
        .options(mongoDbOptions)
        .save()
    }
    ssc.start()
    ssc.awaitTermination()
  }
}
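The job above depends on a `ConfigManager` and `ConstantsInterface` from `com.conf` that the post never shows. Here is a minimal sketch of what they might look like, assuming a `config.properties` file on the classpath; the property names and constant values are assumptions, not the original code.

package com.conf

import java.util.Properties

// Sketch only: the real helpers are not shown in the post.
object ConstantsInterface {
  // broker list passed straight to getKafkaParams (assumed value)
  val KAFKA_BOOTSTRAP_LIST = "localhost:9092"
  // name of the property holding the comma-separated topic list (assumed key)
  val KAFKA_TOPICS = "kafka.topics"
}

object ConfigManager {
  // load config.properties once from the classpath
  private val props = new Properties()
  private val in = getClass.getClassLoader.getResourceAsStream("config.properties")
  if (in != null) {
    props.load(in)
    in.close()
  }

  def getProperty(key: String): String = props.getProperty(key)
}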
The Spark Streaming + Kafka utility class:
package com.until

import com.conf.{ConfigManager, ConstantsInterface}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.TaskContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{HasOffsetRanges, KafkaUtils, OffsetRange}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

/**
  * @Author: king
  * @Datetime: 2018/10/11
  * @Desc: Kafka helpers for Spark Streaming: consumer params, topics, direct stream, offsets.
  */
object LocalKafkaUntils {
  /**
    * Build the Kafka consumer configuration.
    *
    * @return kafkaParams
    */
  def getKafkaParams(brokers: String, groupId: String): Map[String, Object] = {
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "group.id" -> groupId,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    kafkaParams
  }

  /**
    * Read the topic list from the configuration.
    */
  def getKafkaTopics(): Set[String] = {
    ConfigManager.getProperty(ConstantsInterface.KAFKA_TOPICS).split(",").toSet
  }

  /**
    * Create the direct stream.
    *
    * @param ssc         the StreamingContext
    * @param kafkaParams the Kafka consumer configuration
    * @return stream
    */
  def getSteam(ssc: StreamingContext, kafkaParams: Map[String, Object])
  : InputDStream[ConsumerRecord[String, String]] = {
    val topicSet = getKafkaTopics()
    KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topicSet, kafkaParams))
  }

  /**
    * Print the offset range processed by each partition.
    */
  def getOffsets(stream: InputDStream[ConsumerRecord[String, String]]): Unit = {
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.foreachPartition { _ =>
        val o: OffsetRange = offsetRanges(TaskContext.get.partitionId)
        println(s"${o.topic} ${o.partition} ${o.fromOffset} ${o.untilOffset}")
      }
    }
  }
}
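Since `enable.auto.commit` is set to false, `getOffsets` only prints the offset ranges; nothing ever commits them back to Kafka, so a restarted job falls back to `auto.offset.reset`. A common pattern with the spark-streaming-kafka-0-10 integration is to commit the ranges yourself with `commitAsync` once the batch has been processed. A minimal sketch, not part of the original post; `processAndCommit` is a hypothetical name and the actual batch work (e.g. the MongoDB write from the main class) is elided:

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

object OffsetCommitExample {
  // Commit each batch's offset ranges back to Kafka after the batch
  // has been handled, so a restart resumes from the committed position.
  def processAndCommit(stream: InputDStream[ConsumerRecord[String, String]]): Unit = {
    stream.foreachRDD { rdd =>
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // ... process rdd here (e.g. write the batch to MongoDB) ...
      // commitAsync is only available on the original direct stream,
      // so it is called on `stream`, not on a transformed DStream
      stream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }
  }
}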