Spark Streaming: Real-Time Streaming Code Examples (Implementing WordCount)

Dependencies

<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-core_2.11</artifactId>
  <version>2.4.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-streaming_2.11</artifactId>
  <version>2.4.5</version>
  <!--<scope>provided</scope>-->
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming-kafka-0-10 -->
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
  <version>2.4.5</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
  <groupId>org.apache.spark</groupId>
  <artifactId>spark-sql_2.11</artifactId>
  <version>2.4.5</version>
</dependency>
<dependency>
  <groupId>com.fasterxml.jackson.core</groupId>
  <artifactId>jackson-databind</artifactId>
  <version>2.6.6</version>
</dependency>
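
If the project is built with sbt instead of Maven, a roughly equivalent dependency list would look like the sketch below (an assumption: the project's scalaVersion is 2.11.x, so that %% resolves to the _2.11 artifacts used above):

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core"                 % "2.4.5",
  "org.apache.spark" %% "spark-streaming"            % "2.4.5",
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % "2.4.5",
  "org.apache.spark" %% "spark-sql"                  % "2.4.5",
  "com.fasterxml.jackson.core" % "jackson-databind"  % "2.6.6"
)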

Collecting from files

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val conf = new SparkConf().setMaster("local[2]").setAppName("demo1")
val ssc = new StreamingContext(conf, Seconds(5))   // 5-second batch interval

// Monitor the data/file directory and treat each new file as a batch of text lines
val fileDStream = ssc.textFileStream("data/file")
val wordCount = fileDStream.flatMap(line => line.split("\\s+"))
    .map((_, 1))
    .reduceByKey(_ + _)

wordCount.print()
ssc.start()
ssc.awaitTermination()
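
Note that textFileStream only picks up files that appear in the monitored directory after the application has started. A minimal sketch for producing test input (the paths are assumptions matching the data/file directory above): write the file somewhere else first, then move it into the monitored directory in one step so it shows up as a new, complete file.

import java.nio.file.{Files, Paths, StandardCopyOption}

val tmp = Paths.get("data/tmp/words.txt")                          // staging path (assumed)
Files.createDirectories(tmp.getParent)
Files.write(tmp, "hello spark hello streaming".getBytes("UTF-8"))
Files.createDirectories(Paths.get("data/file"))
// Move the finished file into the monitored directory atomically (same filesystem assumed)
Files.move(tmp, Paths.get("data/file/words.txt"), StandardCopyOption.ATOMIC_MOVE)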

Custom receiver

import java.io.{BufferedReader, InputStreamReader}
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

class MyReceiver(host: String, port: Int) extends Receiver[String](StorageLevel.MEMORY_ONLY) {
  var socket: java.net.Socket = null

  def receive(): Unit = {
    socket = new java.net.Socket(host, port)
    val reader = new BufferedReader(new InputStreamReader(socket.getInputStream, "UTF-8"))
    var line: String = null
    // readLine() returns null when the connection is closed
    while ( {line = reader.readLine(); line != null} ) {
      if (line.equals("end")) {
        return
      } else {
        this.store(line)   // hand the line over to Spark Streaming
      }
    }
  }

  override def onStart(): Unit = {
    // Receive on a separate thread so onStart() returns immediately
    new Thread(new Runnable {
      override def run(): Unit = {
        receive()
      }
    }).start()
  }

  override def onStop(): Unit = {
    if (socket != null) {
      socket.close()
      socket = null
    }
  }
}

object MyReceiverDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("demo3")
    val ssc = new StreamingContext(conf, Seconds(5))
    val receiverStream = ssc.receiverStream(new MyReceiver("192.168.136.10", 9999))
    val streamCount = receiverStream.flatMap(_.split("\\s+"))
      .map((_, 1))
      .reduceByKey(_ + _)
    streamCount.print()
    ssc.start()
    ssc.awaitTermination()
  }
}
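
To test the receiver, start a socket server on the host and port configured above (192.168.136.10:9999) before launching the job, for example with netcat (assuming netcat is available on that host), and type lines of words into it; sending the line end makes the receive loop return:

nc -lk 9999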

Collecting data from Kafka

Stateless

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

    val conf = new SparkConf().setMaster("local[2]").setAppName("spark-kafka")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Kafka connection parameters
    val kafkaParams: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG -> "spark-kafka")
    )
    // Receive the producer's messages
    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      // Location strategy: distribute partitions consistently across the available executors
      LocationStrategies.PreferConsistent,
      // Alternatives: LocationStrategies.PreferBrokers, LocationStrategies.PreferFixed()
      // Subscribe to a fixed set of topics
      ConsumerStrategies.Subscribe(Set("sparkDemo"), kafkaParams)
    )
    // Implement the requirement: word count
    val wordCount = kafkaStream.flatMap(v => v.value().toString.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)
    wordCount.print()
    // Start the streaming job
    ssc.start()
    ssc.awaitTermination()

Produce messages with the console producer:

 kafka-console-producer.sh --topic sparkDemo --broker-list 192.168.136.10:9092

Stateful

A checkpoint directory must be set.

    val conf = new SparkConf().setMaster("local[2]").setAppName("demo3")
    val ssc = new StreamingContext(conf, Seconds(5))
    // Set the checkpoint directory (required for stateful operations)
    ssc.checkpoint("checkpoint")
    // Kafka connection parameters
    val kafka: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "kafkaGroup1")
    )
    // Receive the producer's data
    val stream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("kafkaWindow1"), kafka)
    )
    // Implement the requirement
    val msg = stream.flatMap(v => v.value().toString.split("\\s+"))
      .map((_, 1))
    // Stateful counting: merge each key's new values into its running total
    val sum: DStream[(String, Int)] = msg.updateStateByKey {
      case (seq, buffer) => {
        val allSum: Int = buffer.getOrElse(0) + seq.sum
        Option(allSum)
      }
    }
    sum.print()
    // Start the streaming job
    ssc.start()
    ssc.awaitTermination()

Produce messages:

 kafka-console-producer.sh --topic kafkaWindow1 --broker-list 192.168.136.10:9092

Compared with the stateless version, the stateful version accumulates history: when counting words, words from earlier batches are included in the running totals.
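
The update function above can also be written with explicit parameter types, which makes the (Seq[V], Option[S]) => Option[S] contract of updateStateByKey easier to see. A sketch with the same logic, reusing the msg DStream from the code above:

    // new values for a key in the current batch, plus that key's previous running total
    val updateFunc: (Seq[Int], Option[Int]) => Option[Int] =
      (newValues, runningCount) => Some(runningCount.getOrElse(0) + newValues.sum)

    val total: DStream[(String, Int)] = msg.updateStateByKey(updateFunc)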

Window functions

window

Non-overlapping windows: the new window has no overlap with the previous one.

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}


    val conf = new SparkConf().setMaster("local[2]").setAppName("demo1")
    val ssc = new StreamingContext(conf, Seconds(5))

    val sparkKafka: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "window1")
    )
    val KafkaWindow: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("windowStream"), sparkKafka)
    )
    val wordCount = KafkaWindow.flatMap(v => v.value().toString.split("\\s+"))
      .map((_, 1))
      // The window length must be a multiple of the batch interval set above (5s);
      // with no slide interval given, the slide defaults to the batch interval, so windows do not overlap
      .window(Seconds(5))
      .reduceByKey(_ + _)

    wordCount.print()
    ssc.start()
    ssc.awaitTermination()

Produce messages:

kafka-console-producer.sh --topic windowStream --broker-list 192.168.136.10:9092

Overlapping windows: the window length (4s) is longer than the slide interval (2s), so consecutive windows share data.

    val conf = new SparkConf().setMaster("local[2]").setAppName("demo1")
    val ssc = new StreamingContext(conf, Seconds(2))

    val sparkKafka: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "window1")
    )
    val KafkaWindow: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("windowStream"), sparkKafka)
    )
    val wordCount = KafkaWindow.flatMap(v => v.value().toString.split("\\s+"))
      .map((_, 1))
      // Window length and slide interval must both be multiples of the batch interval set above (2s)
      .window(Seconds(4), Seconds(2))
      .reduceByKey(_ + _)

    wordCount.print()
    ssc.start()
    ssc.awaitTermination()

Produce messages:

kafka-console-producer.sh --topic windowStream --broker-list 192.168.136.10:9092

countByWindow

Counts the number of elements in the current time window.
A checkpoint directory must be set.

    val conf = new SparkConf().setMaster("local[2]").setAppName("demo2")
    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("checkpoint")
    val sparkKafka: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "Window2")
    )
    val kafkamsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("windowStream"), sparkKafka)
    )
    val res = kafkamsg.flatMap(v => v.value().toString.split("\\s+"))
      .map((_, 1))
      .countByWindow(Seconds(4), Seconds(2))

    res.print()
    ssc.start()
    ssc.awaitTermination()

Produce messages:

kafka-console-producer.sh --topic windowStream --broker-list 192.168.136.10:9092

As messages arrive, countByWindow only reports the total number of words in the window, not per-word counts; for example, a window containing the lines hello spark and hello would yield 3.

countByValueAndWindow

Counts, for each distinct element value, how many times it appears in the current time window.
A checkpoint directory must be set.

    val conf = new SparkConf().setMaster("local[2]").setAppName("demo3")
    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("checkpoint")
    val sparkKafka: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "Window3")
    )
    val kafkaMsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("windowStream"), sparkKafka)
    )
    val res: DStream[(String, Long)] = kafkaMsg.flatMap(v => v.value().toString.split("\\s+"))
      .countByValueAndWindow(Seconds(4), Seconds(2))
    res.print()
    ssc.start()
    ssc.awaitTermination()

Produce messages:

kafka-console-producer.sh --topic windowStream --broker-list 192.168.136.10:9092

reduceByWindow

First forms a new DStream from the elements of the current window of the calling DStream, then runs the reduce over those window elements.

    val conf = new SparkConf().setMaster("local[2]").setAppName("demo4")
    val ssc = new StreamingContext(conf, Seconds(2))

    val kafkaStream: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "window4")
    )

    val kafkaMsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("windowStream"), kafkaStream)
    )
    val wordcount = kafkaMsg.flatMap(_.value().toString.split("\\s+"))
      .reduceByWindow(_ + ":" + _, Seconds(8), Seconds(4))
    wordcount.print()

    ssc.start()
    ssc.awaitTermination()
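
Because the stream elements here are the words themselves, the reduce function _ + ":" + _ simply joins every word in the window with colons. To get a plain count of the words in each window instead, one could map each word to 1 first; a sketch using the same kafkaMsg stream:

    val windowCount = kafkaMsg.flatMap(_.value().toString.split("\\s+"))
      .map(_ => 1)                                  // one count per word
      .reduceByWindow(_ + _, Seconds(8), Seconds(4))
    windowCount.print()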

reduceByKeyAndWindow

Aggregates values by key over the window.

    val conf = new SparkConf().setMaster("local[2]").setAppName("demo5")
    val ssc = new StreamingContext(conf, Seconds(2))
    ssc.checkpoint("checkpoint")
    val kafkaStream = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "window5")
    )

    val kafkaMsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("sparkDemo"), kafkaStream)
    )
    val res: DStream[(String, Int)] = kafkaMsg.flatMap(_.value().toString.split("\\s+"))
      .map((_, 1))
      .reduceByKeyAndWindow((x: Int, y: Int) => {
        x + y
      }, Seconds(4), Seconds(2))
      // The overload below also takes an inverse-reduce function and requires a checkpoint (see the sketch below)
      // .reduceByKeyAndWindow((x: Int, y: Int) => {x + y}, (x: Int, y: Int) => {x - y}, Seconds(4), Seconds(2))
    res.print()

    ssc.start()
    ssc.awaitTermination()

Producing messages works the same as before (note that this example subscribes to the sparkDemo topic), so it is not repeated here.
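
The commented-out overload in the code above passes a second, inverse-reduce function. With it, Spark computes each new window incrementally: it adds the counts of batches entering the window and subtracts the counts of batches leaving it, instead of re-reducing the whole window, which is why this variant needs the checkpoint that was set. A sketch of that variant, using the same kafkaMsg stream:

    val res2: DStream[(String, Int)] = kafkaMsg.flatMap(_.value().toString.split("\\s+"))
      .map((_, 1))
      .reduceByKeyAndWindow(
        (x: Int, y: Int) => x + y,   // merge counts of batches entering the window
        (x: Int, y: Int) => x - y,   // remove counts of batches leaving the window
        Seconds(4), Seconds(2))
    res2.print()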

transform

Supports arbitrary RDD-to-RDD transformations on each batch.

    // Additional imports needed here: java.text.SimpleDateFormat and org.apache.spark.rdd.RDD
    val conf = new SparkConf().setMaster("local[2]").setAppName("transform")
    val ssc = new StreamingContext(conf, Seconds(2))

    val sparkKafka: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "transform1")
    )
    val kafkamsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("sparkDemo"), sparkKafka))

    val wordCount: DStream[((String, String), Int)] = kafkamsg.transform((rdd, timestamp) => {
      val format: SimpleDateFormat = new SimpleDateFormat("yyyyMMdd HH:mm:ss")
      val time: String = format.format(timestamp.milliseconds)
      val value: RDD[((String, String), Int)] = rdd.flatMap(x => x.value().toString.split("\\s+"))
        .map(x => ((x, time), 1))
        .reduceByKey(_ + _)
        .sortBy(_._2, false)
      value
    })

    wordCount.print()
    ssc.start()
    ssc.awaitTermination()

transform source code:
The argument is a user-defined function whose return value is an RDD; transform itself returns a DStream.

def transform[U: ClassTag](transformFunc: (RDD[T], Time) => RDD[U]): DStream[U] = ssc.withScope {
    // because the DStream is reachable from the outer object here, and because
    // DStreams can't be serialized with closures, we can't proactively check
    // it for serializability and so we pass the optional false to SparkContext.clean
    val cleanedF = context.sparkContext.clean(transformFunc, false)
    val realTransformFunc = (rdds: Seq[RDD[_]], time: Time) => {
      assert(rdds.length == 1)
      cleanedF(rdds.head.asInstanceOf[RDD[T]], time)
    }
    new TransformedDStream[U](Seq(this), realTransformFunc)
  }

SQLContext

Use SQL statements to process the stream.

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
object streamSQLDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("streamsql")
    val ssc = new StreamingContext(conf,Seconds(2))

    val kafkaStream: Map[String, String] = Map(
      (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.136.10:9092"),
      (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer"),
      (ConsumerConfig.GROUP_ID_CONFIG, "streamsql")
    )
    val kafkamsg: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe(Set("sparkDemo"), kafkaStream)
    )
    val wordcount: DStream[Row] = kafkamsg.transform(rdd => {
      val sqlContext = SQLContextSinleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._
      val value = rdd.flatMap(x => x.value().toString.split("\\s+")).map((_, 1))
      value.toDF("name", "cn")
        .createOrReplaceTempView("tbword")
      val frame = sqlContext.sql("select name,count(cn) from tbword group by name")
      frame.rdd
    })
    wordcount.print()
    ssc.start()
    ssc.awaitTermination()

  }
}

object SQLContextSinleton {
  @transient private var instance: SQLContext = _
  def getInstance(sc: SparkContext): SQLContext = {
    synchronized {
      if (instance == null) {
        instance = new SQLContext(sc)
      }
    }
    instance
  }
}
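
The same lazily-initialized singleton can also be written against the newer SparkSession API, which supersedes SQLContext in Spark 2.x. A sketch of such an alternative (not the original code), mirroring the pattern used above:

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object SparkSessionSingleton {
  @transient private var instance: SparkSession = _
  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession.builder.config(sparkConf).getOrCreate()
    }
    instance
  }
}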

One more example; this approach prints the results in tabular form:

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Demo_SQLWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("demo").setMaster("local[*]")
    val ssc = new StreamingContext(conf, Seconds(5))
    // The SparkSession must be created after the ssc
    val spark = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    val lines = ssc.socketTextStream("hadoop01", 7777)
    val words = lines.flatMap(_.split("\\s+"))
    words.foreachRDD(rdd => {
      if (rdd.count() != 0) {
        // Turn each batch into a DataFrame and query it with SQL
        val df = rdd.map(x => Word(x)).toDF()
        df.createOrReplaceTempView("tb_word")
        spark.sql("select word,count(1) from tb_word group by word")
          .show()
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }

  case class Word(word: String)
}
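
To feed this example, start a socket server on hadoop01 port 7777 before launching the job, for example with netcat (assuming it is available on that host), and type lines of words into it:

nc -lk 7777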

