/**
 * Builds a streaming DataFrame backed by the Kafka source, with the binary
 * key/value columns cast to UTF-8 strings.
 *
 * Note on option naming: options prefixed with "kafka." (e.g.
 * kafka.max.partition.fetch.bytes) are passed through to the underlying Kafka
 * consumer, while un-prefixed options (startingOffsets, maxOffsetsPerTrigger,
 * failOnDataLoss) are interpreted by the Structured Streaming Kafka source
 * itself — the two namespaces must not be mixed up.
 *
 * @return a streaming DataFrame with columns `key` and `value`, both STRING
 */
override def stream(): DataFrame = {
  val kafkaConf = new KafkaConfiguration
  val spark = GxSparkSession().session()

  spark
    .readStream
    .format("kafka")
    .option("subscribe", kafkaConf.intelligentDrivingTopic) // topic for intelligent-driving vehicles
    .option("kafka.bootstrap.servers", kafkaConf.bootstrapServers) // Kafka broker list
    .option("startingOffsets", kafkaConf.autoOffsetReset) // where to start when no checkpoint exists (config-driven)
    .option("maxOffsetsPerTrigger", kafkaConf.maxOffsetsPerTrigger) // rate limit: max offsets consumed per micro-batch
    .option("kafka.max.partition.fetch.bytes", kafkaConf.maxPartitionFetchBytes) // max bytes fetched per partition per request
    .option("failOnDataLoss", kafkaConf.failOnDataLoss) // whether the query fails when data is lost (config-driven)
    .load()
    // Kafka delivers key/value as binary; cast both to strings.
    // The former trailing .select("key", "value") was a redundant no-op
    // projection (selectExpr already yields exactly these columns) and is removed.
    .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
}
如上面代码所示,传给 Kafka 客户端的参数要加“kafka.”前缀,例如 max.partition.fetch.bytes 应写作 kafka.max.partition.fetch.bytes
因为 Structured Streaming 的 Kafka source 实现正是依据该前缀,把 Structured Streaming 自身的参数与透传给 Kafka consumer 的参数区分开的