object HighKafkaSource2 {
  /**
   * Demo of the Kafka direct stream WITHOUT checkpoint recovery.
   *
   * A brand-new StreamingContext is created on every launch, so saved
   * offsets are never restored: each run starts from the latest offset
   * (the Kafka default; configurable via auto.offset.reset).
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("high1").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(3))
    //ssc.checkpoint("./A1")
    // Consumer parameters; `val` because the map is never reassigned.
    val params = Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop103:9092,hadoop104:9092,hadoop105:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> "big1015",
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer",
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"
      // To read from the beginning, switch the consumer group and set:
      // ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "smallest"
    )
    // Direct (receiver-less) stream of (key, value) pairs from topic "highkafka".
    val inputDs: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc,
      params,
      Set[String]("highkafka"))
    // print() has a side effect, so keep the parentheses.
    inputDs.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
如果想从最早的 offset 开始读，需要换一个消费者组，并且设置
ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "smallest"，即从头开始读（默认只读最新的）。
但上面这种写法的 offset 并不能保存，因为每次都是 new 了一个新的 StreamingContext，恢复时找不到之前保存的 offset。
package day5.KafkaSource
import java.sql.{Connection, PreparedStatement}
import kafka.serializer.StringDecoder
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}
import utils.C3p0Utils
import scala.collection.mutable
/**
* @author wade
* @create 2019-03-12 18:42
*/
object HighKafkaSource {
  /**
   * Demo of the Kafka direct stream WITH checkpoint recovery: offsets
   * survive restarts because the StreamingContext itself is restored
   * from the checkpoint directory when one exists.
   */
  def main(args: Array[String]): Unit = {
    // getActiveOrCreate restores the context from "./A1" if a checkpoint
    // exists, otherwise builds a fresh one via the factory below.
    val ssc: StreamingContext =
      StreamingContext.getActiveOrCreate("./A1", getStreamContex)
    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * Factory for the StreamingContext. Everything that touches the ssc
   * (sources, transformations, outputs) must be defined inside this
   * function so it is rebuilt correctly on checkpoint recovery.
   */
  def getStreamContex(): StreamingContext = {
    val conf: SparkConf = new SparkConf().setAppName("high1").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(3))
    ssc.checkpoint("./A1")
    // Consumer parameters; `val` because the map is never reassigned.
    val params = Map[String, String](
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "hadoop103:9092,hadoop104:9092,hadoop105:9092",
      ConsumerConfig.GROUP_ID_CONFIG -> "big1015"
      // key.deserializer / value.deserializer belong to the NEW consumer API;
      // the old SimpleConsumer behind this KafkaUtils rejects them with
      // "Property ... is not valid" warnings, so they stay commented out.
      // ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG-> "org.apache.kafka.common.serialization.StringDeserializer",
      // ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"
    )
    val inputDs: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc,
      params,
      Set[String]("highkafka"))
    inputDs.foreachRDD(rdd => {
      rdd.foreachPartition(ts => {
        // Open the connection inside the partition so nothing needs to be
        // serialized and shipped from the driver to the executors.
        val conn: Connection = C3p0Utils.getConnection
        val sql = "INSERT INTO staff VALUES (NULL,?,'male') "
        val ps: PreparedStatement = conn.prepareStatement(sql)
        try {
          // Batch the message values (t._2) and flush once per partition.
          ts.foreach(t => {
            val value: String = t._2
            println(value)
            ps.setString(1, value)
            ps.addBatch()
          })
          ps.executeBatch()
        } finally {
          // Always release JDBC resources, even if the batch insert fails;
          // the original code leaked both on any exception.
          ps.close()
          conn.close()
        }
      })
    })
    ssc
  }
}
//问题2 、
// WARN [Executor task launch worker for task 1] - Property key.deserializer is not valid
// WARN [Executor task launch worker for task 1] - Property value.deserializer is not valid
// 原因：key.deserializer / value.deserializer 是新消费者 API 的配置项，
// 而这里的 KafkaUtils.createDirectStream 底层用的是老的 SimpleConsumer，
// 不认识这两个属性，所以打出 WARN；去掉这两项配置即可消除告警。