Getting Started with Spark Streaming: Small Example Programs (Practice)

Stream processing of socket data: word count


import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}


/**
  * Spark Streaming: receive socket data over TCP and perform a word count
  */
object SparkStreamingTCP {

  def main(args: Array[String]): Unit = {

    // Configure SparkConf
    // local[2]: one thread for receiving data, one for computation
    val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("sparkstreamingTCP")
    // Build the SparkContext
    val sc: SparkContext = new SparkContext(conf)

    // Set the log level
    sc.setLogLevel("WARN")
    // Build the StreamingContext with the batch interval:
    // each batch covers 5 seconds of data
    val scc = new StreamingContext(sc,Seconds(5))
    // Register the host and port to listen on for incoming data
    val lines: ReceiverInputDStream[String] = scc.socketTextStream("sunjunwei.com",9999)
    // Split each line into words
    val words: DStream[String] = lines.flatMap(line => line.split(" "))
    // Map each word to (word, 1)
    val wordAndOne: DStream[(String, Int)] = words.map(word => (word,1))
    // Aggregate the counts by key
    val result: DStream[(String, Int)] = wordAndOne.reduceByKey((x, y) => x+y)

    // Print the results of each batch
    result.print()

    // Start the streaming job
    scc.start()
    // Wait for the job to terminate
    scc.awaitTermination()
  }
}
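
To feed this example with data, a simple approach is to run nc -lk 9999 on the host the program connects to (here sunjunwei.com) and type space-separated words; each 5-second batch then prints the counts for the words received in that batch only.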

Stream processing of socket data: word count with results accumulated across batches


import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}


/**
  * Spark Streaming: receive socket data and perform a word count whose results accumulate across batches
  */
object SparkStreamingTCPTotal {

  // On Windows, point hadoop.home.dir at a local Hadoop installation that provides winutils
  System.setProperty("hadoop.home.dir","E:/x3/hadoop-2.9.2")

  def main(args: Array[String]): Unit = {

    // Configure SparkConf
    // local[2]: one thread for receiving data, one for computation (at least two threads are required)
    val conf: SparkConf = new SparkConf().setMaster("local[2]").setAppName("sparkStreamingTCPTotal")
    // Build the SparkContext
    val sc: SparkContext = new SparkContext(conf)
    // Set the log level
    sc.setLogLevel("WARN")
    // Build the StreamingContext with a 5-second batch interval
    val streamingContext: StreamingContext = new StreamingContext(sc,Seconds(5))

    // updateStateByKey needs a checkpoint directory to store state between batches
    streamingContext.checkpoint("./")
    // Register the host and port to listen on for incoming data
    val lines: ReceiverInputDStream[String] = streamingContext.socketTextStream("sunjunwei.com",9999)
    // Split each line into words
    val words: DStream[String] = lines.flatMap(_.split(" "))
    // Map each word to (word, 1)
    val wordAndOne: DStream[(String, Int)] = words.map((_,1))
    // Accumulate word counts across batches
    val result: DStream[(String, Int)] = wordAndOne.updateStateByKey(updateFunction)
    // Print the results of each batch
    result.print()
    // Start the streaming job
    streamingContext.start()
    // Wait for the job to terminate
    streamingContext.awaitTermination()
  }

  // newValues: all the 1s collected for a given word in the current batch
  // runningCount: the previously saved total for that word across all earlier batches
  def updateFunction(newValues: Seq[Int],runningCount: Option[Int]): Option[Int] ={
    val newCount = newValues.sum + runningCount.getOrElse(0)
    Some(newCount)
  }
}
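
A quick way to sanity-check updateFunction is to call it directly with hand-picked values (the numbers below are hypothetical):

// Suppose "spark" appeared 3 times in the current batch and 5 times in earlier batches
SparkStreamingTCPTotal.updateFunction(Seq(1, 1, 1), Some(5))   // => Some(8)
// A word seen for the first time has no saved state yet
SparkStreamingTCPTotal.updateFunction(Seq(1), None)            // => Some(1)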

Window operations: word count over a fixed time window


import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Spark Streaming window operation: count word occurrences within a time window
  */
object SparkStreamingTCPWindow {

  System.setProperty("hadoop.home.dir","E:/x3/hadoop-2.9.2")

  def main(args: Array[String]): Unit = {

    // Configure SparkConf
    val conf: SparkConf = new SparkConf().setAppName("spark_window").setMaster("local[2]")
    // Build the SparkContext
    val sc: SparkContext = new SparkContext(conf)
    // Set the log level
    sc.setLogLevel("WARN")
    // Build the StreamingContext with a 5-second batch interval
    val context: StreamingContext = new StreamingContext(sc,Seconds(5))
    // Register the host and port to listen on for incoming data
    val lines: ReceiverInputDStream[String] = context.socketTextStream("sunjunwei.com",9999)
    // Split each line into words
    val words: DStream[String] = lines.flatMap(_.split(" "))
    // Map each word to (word, 1)
    val word: DStream[(String, Int)] = words.map((_,1))
    // reduceByKeyAndWindow parameters:
    // windowDuration: length of the window. With a 5-second batch interval, a 10-second window covers the 2 most recent batch RDDs.
    // slideDuration: how far the window slides, i.e. how often the windowed computation runs.
    val result: DStream[(String, Int)] = word.reduceByKeyAndWindow((a:Int, b:Int)=>a+b,Seconds(10),Seconds(10))

    result.print()

    context.start()
    context.awaitTermination()
  }
}
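
With windowDuration and slideDuration both set to 10 seconds, the windows sit back to back and never overlap. As a minimal variation (assuming the same 5-second batch interval), sliding every 5 seconds produces overlapping windows; both durations must be multiples of the batch interval:

// Each window covers the last 10 seconds and is recomputed every 5 seconds
val sliding: DStream[(String, Int)] = word.reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(10), Seconds(5))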

Window operations: top words over a fixed time window


import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}


/**
  * Spark Streaming window operation: find the hottest words within a time window
  */
object SparkStreamingTCPWindowHotWords {

  def main(args: Array[String]): Unit = {

    // Configure SparkConf
    val conf: SparkConf = new SparkConf().setAppName("hot_words").setMaster("local[2]")
    // Build the SparkContext
    val sc: SparkContext = new SparkContext(conf)
    // Set the log level
    sc.setLogLevel("WARN")
    // Build the StreamingContext with a 5-second batch interval
    val scc: StreamingContext = new StreamingContext(sc,Seconds(5))
    // Register the host and port to listen on for incoming data
    val lines: ReceiverInputDStream[String] = scc.socketTextStream("sunjunwei.com",9999)
    // Split each line into words
    val words: DStream[String] = lines.flatMap(_.split(" "))
    // Map each word to (word, 1)
    val word: DStream[(String, Int)] = words.map((_,1))
    // reduceByKeyAndWindow parameters:
    // windowDuration: length of the window. With a 5-second batch interval, a 10-second window covers the 2 most recent batch RDDs.
    // slideDuration: how far the window slides, i.e. how often the windowed computation runs.
    val result: DStream[(String, Int)] = word.reduceByKeyAndWindow((a:Int, b:Int)=>a+b,Seconds(10),Seconds(10))
    val data = result.transform(rdd => {
      // Sort by count in descending order and take the top 3
      val sortRdd: RDD[(String, Int)] = rdd.sortBy(t => t._2, false)
      val sortResult: Array[(String, Int)] = sortRdd.take(3)
      println("************************print top 3 begin*************************")
      sortResult.foreach(println)
      println("************************ print top 3 end *************************")
      sortRdd
    })

    data.print()

    scc.start()
    scc.awaitTermination()
  }
}
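
Note that the transform above prints the top 3 on the driver but returns the full sorted RDD, so data.print() can still show up to 10 entries. One possible variation (a sketch, not part of the original example) keeps only the top 3 in the output DStream:

// take(3) collects the top rows to the driver; parallelize turns the small array back into an RDD
val top3: DStream[(String, Int)] = result.transform(rdd =>
  rdd.sparkContext.parallelize(rdd.sortBy(_._2, false).take(3))
)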

Integrating with Kafka: receiver mode


import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Spark Streaming Kafka integration: receiver-based API with a stateful word count
  */
object SparkStreamingKafkaReceiver {

  System.setProperty("hadoop.home.dir","E:/x3/hadoop-2.9.2")
  def main(args: Array[String]): Unit = {

    // Configure SparkConf
    val conf: SparkConf = new SparkConf().setAppName("kafka_receiver").setMaster("local[4]")
    // Build the SparkContext
    val sc: SparkContext = new SparkContext(conf)
    // Set the log level
    sc.setLogLevel("WARN")

    // Build the StreamingContext with a 5-second batch interval
    val ssc: StreamingContext = new StreamingContext(sc,Seconds(5))
    // Checkpoint directory, required by updateStateByKey
    ssc.checkpoint("./kafka-receiver")

    // Receiver mode: one receiver thread maps to one partition of the topic.
    // To use several receivers and union them into a single DStream, uncomment:
   /* val stream = (1 to 3).map(x => {
      KafkaUtils.createStream(ssc,"sunjunwei.com:2181","group01",Map("spark_kafka1" -> 3))
    })
    val inputStream = ssc.union(stream)*/

    // Single receiver reading the topic spark_kafka1 through ZooKeeper
    val inputstream = KafkaUtils.createStream(ssc, "sunjunwei.com:2181", "group_01", Map("spark_kafka1" -> 1))

    // The stream yields (key, message) pairs; take the message, split into words, and accumulate counts
    val result = inputstream.map(tuple => tuple._2).flatMap(_.split(" ")).map((_,1)).updateStateByKey(updateFunc)
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }

  // aa: all the 1s for a given word in the current batch; bb: the previously saved running total
  def updateFunc(aa:Seq[Int],bb:Option[Int]): Option[Int] ={
    Some(aa.sum + bb.getOrElse(0))
  }
}
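
With the receiver-based API used here, consumer offsets are tracked in ZooKeeper under the consumer group (group_01 above), and the checkpoint directory is required because of updateStateByKey; local[4] leaves enough threads for the receiver in addition to the batch computation.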

Integrating with Flume: poll mode

import java.net.InetSocketAddress

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.flume.FlumeUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Spark Streaming integration with Flume: poll mode (Spark pulls data from Flume)
  */
object SparkStreamingFlumePoll {
  System.setProperty("hadoop.home.dir", "E:/x3/hadoop-2.9.2")
  // newValues: all the 1s collected for a given word in the current batch
  // runningCount: the previously saved total for that word across all earlier batches
  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val newCount = runningCount.getOrElse(0) + newValues.sum
    Some(newCount)
  }


  def main(args: Array[String]): Unit = {
    // Configure SparkConf
    val sparkConf = new SparkConf().setAppName("SparkStreaming_Flume_Poll").setMaster("local[2]")
    // Build the SparkContext
    val sc = new SparkContext(sparkConf)
    sc.setLogLevel("WARN")
    // Build the StreamingContext with a 5-second batch interval
    val scc = new StreamingContext(sc, Seconds(5))

    // Checkpoint directory, required by updateStateByKey
    scc.checkpoint("./")
    // Flume agent address(es); several agents can be polled at once (see the sketch after this example)
   // val address = Seq(new InetSocketAddress("sunjunwei.com",8888))
    // Pull data from the Flume agent
    val flumeStream = FlumeUtils.createPollingStream(scc,"sunjunwei.com",8888,StorageLevel.MEMORY_AND_DISK)

    // The data sits in the event body; convert it to a String
    val lineStream = flumeStream.map(x=>new String(x.event.getBody.array()))
    // Word count accumulated across batches
    val result = lineStream.flatMap(_.split(" ")).map((_,1)).updateStateByKey(updateFunction)

    result.print()
    scc.start()
    scc.awaitTermination()
  }
}
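
Poll mode assumes the Flume agent is configured with the Spark sink (org.apache.spark.streaming.flume.sink.SparkSink) so that Spark can pull events from it. To poll several agents at once, createPollingStream also accepts a sequence of addresses; a minimal sketch based on the commented-out address list above (the second host name is a placeholder):

val addresses = Seq(
  new InetSocketAddress("sunjunwei.com", 8888),
  new InetSocketAddress("another-agent.example.com", 8888)  // hypothetical second agent
)
val multiAgentStream = FlumeUtils.createPollingStream(scc, addresses, StorageLevel.MEMORY_AND_DISK)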

Integrating with Flume: push mode

import java.net.InetSocketAddress

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.flume.{FlumeUtils, SparkFlumeEvent}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Spark Streaming integration with Flume: push mode (Flume pushes data to Spark)
  */
object SparkStreamingFlumePush {
  System.setProperty("hadoop.home.dir", "E:/x3/hadoop-2.9.2")
  // newValues: all the 1s collected for a given word in the current batch
  // runningCount: the previously saved total for that word across all earlier batches
  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = {
    val newCount = runningCount.getOrElse(0) + newValues.sum
    Some(newCount)
  }


  def main(args: Array[String]): Unit = {
    // Configure SparkConf
    val sparkConf = new SparkConf().setAppName("SparkStreaming_Flume_Push").setMaster("local[2]")
    // Build the SparkContext
    val sc = new SparkContext(sparkConf)
    // Build the StreamingContext with a 5-second batch interval
    val scc = new StreamingContext(sc, Seconds(5))
    // Set the log level
    sc.setLogLevel("WARN")
    // Checkpoint directory, required by updateStateByKey
    scc.checkpoint("./")
    // Flume pushes data to this application.
    // Use the IP address of the machine this application runs on; it must match the Flume configuration.
    val flumeStream = FlumeUtils.createStream(scc,"192.168.0.110",8888,StorageLevel.MEMORY_AND_DISK)

    // The data sits in the event body; convert it to a String
    val lineStream = flumeStream.map(x=>new String(x.event.getBody.array()))
    // Word count accumulated across batches
    val result = lineStream.flatMap(_.split(" ")).map((_,1)).updateStateByKey(updateFunction)

    result.print(30)
    scc.start()
    scc.awaitTermination()
  }
}
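
In push mode the Flume agent's Avro sink points at the address passed to FlumeUtils.createStream, so this application must already be running and listening on 192.168.0.110:8888 before the Flume agent starts pushing events; otherwise the sink cannot connect.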
