Spark Structured Streaming Continuous模式（一）_structured streaming continuous mode-CSDN博客

本文链接：https://blog.csdn.net/lxhandlbb/article/details/80754381

Spark Structured Streaming 的 Continuous 模式是 Spark 2.3 引入的一种持续计算模型。相比于之前的微批处理（Micro-batch）模式，它可以将端到端延迟降低到 10ms 以内。

首先还是展示一个example:


import java.util
import java.util.UUID

import com.alibaba.fastjson.JSON

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import scala.collection.mutable


/**
 * Minimal Structured Streaming job running with a continuous trigger.
 *
 * It subscribes to the Kafka topic `orders` on `localhost:9092`, casts each record's
 * `offset` and `value` to strings, and forwards the record payload (`value`) to the
 * Kafka topic `result` on the same broker.
 */
object ContinuousStructuredKafkaWordCount {

  /**
   * Starts the streaming query and blocks until it terminates.
   *
   * @param args command-line arguments; only `args(3)` is read. When present it is used as
   *             the checkpoint directory, otherwise a fresh `/tmp/temporary-<uuid>` path
   *             is generated.
   */
  def main(args: Array[String]): Unit = {

    // Fourth argument (if supplied) overrides the throw-away checkpoint directory.
    val checkpointLocation =
      args.lift(3).getOrElse(s"/tmp/temporary-${UUID.randomUUID}")

    // NOTE(review): nothing in this job touches Hive; enableHiveSupport() looks unnecessary
    // and requires the Hive classes on the classpath -- confirm before removing.
    val spark = SparkSession
      .builder
      .master("local[3]")
      .appName("ContinuousStructuredKafkaWordCount")
      .enableHiveSupport()
      .getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    // The previous `.option("groupId", ...)` was dropped: it is not an option understood by
    // the Kafka source (consumer settings need the `kafka.` prefix, and Spark manages the
    // consumer group id itself), so it had no effect.
    val lines = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "orders")
      .load()
      // Kafka delivers `value` as a byte array, so cast it to a string. The columns are
      // aliased here directly: the earlier chained `toDF("offset", "value").toDF("value", "id")`
      // renamed by position and ended up labelling the offset as `value`, which made the
      // sink below emit offsets instead of the message payload.
      .selectExpr("CAST(offset AS STRING) AS id", "CAST(value AS STRING) AS value")

    // The Kafka sink takes the record payload from the column named `value`.
    val output = lines
      .selectExpr("value")
      .writeStream
      .format("kafka")
      .outputMode("append")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("topic", "result")
      .option("checkpointLocation", checkpointLocation)
      // Continuous trigger; "1 second" is the checkpoint interval. Swapping this line for the
      // commented-out ProcessingTime trigger switches the query back to micro-batch mode.
      .trigger(Trigger.Continuous("1 second"))
      // .trigger(Trigger.ProcessingTime("5 seconds"))
      .start()

    output.awaitTermination()
  }

}