Spark Series --- Spark Streaming in Detail

 

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

/**
  * Spark Streaming word count over network socket data.
  * The entry point of a Spark Streaming program is StreamingContext.
  * Listens to the data arriving on port 9999 of the bigdata01 host
  * and computes the counts once per batch interval (e.g. every 2 seconds).
  */
object _01SparkStreamingWordCountNetWorkOps {
    def main(args: Array[String]): Unit = {

        if(args == null || args.length < 3) {
            println(
                """Parameter Errors! Usage: <batchInterval> <host> <port>
                  |batchInterval:       how often a streaming batch is launched
                  |host:                hostname to listen to
                  |port:                port to listen to
                """.stripMargin)
            System.exit(-1)
        }
        // pattern matching to extract the arguments
        val Array(batchInterval, host, port) = args
        val conf = new SparkConf()
                    .setAppName("WordCountNetWork")
                    .setMaster("local[*]")
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
        // load external data ---> socket (network) data
        val linesDStream: ReceiverInputDStream[String] = ssc.socketTextStream(host, port.toInt)

        val wordsDStream: DStream[String] = linesDStream.flatMap(line => {
            line.split("\\s+")
        })
        val rbkDStream: DStream[(String, Int)] = wordsDStream.map(word => (word, 1)).reduceByKey(_+_)
        rbkDStream.print()
        ssc.start()            // must be called, otherwise the job will never run
        ssc.awaitTermination()
    }
}
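To try this example locally, one would typically first start a text source on the target host, for example with netcat (nc -lk 9999 on bigdata01), then run the program with arguments such as 2 bigdata01 9999; each 2-second batch then prints the word counts of whatever was typed into the netcat session.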

Case 2: Spark Streaming reading newly added files from an HDFS directory
(1) Source code
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

object _02SparkStreamingWCHDFSOps {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.spark-project").setLevel(Level.WARN)
        if(args == null || args.length < 2) {
            println(
                """Parameter Errors! Usage: <batchInterval> <inputpath>
                  |batchInterval:   batchInterval
                  |inputpath:       inputpath
                """.stripMargin)
            System.exit(-1)
        }

        val Array(batchInterval, inputpath) = args

        val conf = new SparkConf()
                    .setMaster("local[*]")
                    .setAppName("SparkStreamingWCHDFS")
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))

        // monitor the input directory and read each newly added file as a stream of lines
        val fileStream: DStream[String] = ssc.textFileStream(inputpath)
        val retDStream: DStream[(String, Int)] =
            fileStream.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_+_)
        retDStream.print()
        ssc.start()
        ssc.awaitTermination()
    }
}
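Note that textFileStream only picks up files that appear in the monitored directory after the job has started, and the files should be moved into that directory atomically (for example written elsewhere and then renamed in), since changes made to a file after it has been picked up are not read.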

The transform operator is mainly used to relate a DStream to other datasets (an RDD, Dataset, or DataFrame) in scenarios where the DStream API offers no direct operation for it, for example when a DStream needs to be joined with an RDD.
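As a minimal sketch of the pattern (someDStream and staticRDD are illustrative names; the full blacklist case below shows it end to end), transform exposes each micro-batch as an RDD so that ordinary RDD operations such as join become available:

    // someDStream: DStream[(K, V)], staticRDD: RDD[(K, W)] -- illustrative pair-typed names
    val joinedDStream = someDStream.transform(batchRDD => batchRDD.join(staticRDD))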
Case: online blacklist filtering
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

/**
  * Online advertisement blacklist filtering.
  *     The blacklist is relatively static data that has to be matched against
  *     the records continuously flowing in (e.g. from Kafka), so that records
  *     hitting the blacklist are filtered out in order to protect user interests.
  *     Here the blacklist is assumed to be a static result set loaded as an RDD,
  *     while the incoming data is a streaming DStream, so what we need is a
  *     join between an RDD and a DStream.
  *
  *     DStream provides no such API directly, so we use the transform operation.
  */
object _03SparkStreamingRealTimeBlacklistFilterOps {
    def main(args: Array[String]): Unit = {

        val conf = new SparkConf()
                        .setAppName("RealTimeBlacklistFilter")
                        .setMaster("local[*]")
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(2))

        // build the blacklist
        val blacklist = List(
            ("27.19.74.143", true),
            ("110.52.250.126", true)
        )

        val blacklistRDD: RDD[(String, Boolean)] = sc.parallelize(blacklist)

        /**
          * We filter by ip here; the input lines look like:
          * 27.19.74.143##2016-05-30 17:38:20##GET /static/image/common/faq.gif HTTP/1.1##200##1127
          * 110.52.250.126##2016-05-30 17:38:20##GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1##200##1292
          */
        val lines = ssc.socketTextStream("bigdata01", 9999)

        val ip2InfoDStream: DStream[(String, String)] = lines.map(line => {
            val fields = line.split("##")
            (fields(0), line.substring(line.indexOf("##") + 2))
        })
        // Blacklist filtering: records whose ip exists both in blacklistRDD and in
        // ip2InfoDStream must be dropped, i.e. the records for which the join finds
        // a match are exactly the ones to filter out.
        val filteredDStream: DStream[(String, String)] = ip2InfoDStream.transform(rdd => {
            if(!rdd.isEmpty()) {
                /*
                    left outer join: every row of the left side is kept; the right side
                    is present only where it matches, otherwise it is None.
                    In this example, rows that match on both sides are the blacklisted
                    records that should be filtered out.
                 */
                val joinedRDD: RDD[(String, (String, Option[Boolean]))] = rdd.leftOuterJoin(blacklistRDD)

                joinedRDD.filter{case (ip, (line, option)) => {
                    option.isEmpty    // keep only the records that did NOT match the blacklist
                }}.map{case (ip, (line, option)) => (ip, line)}
            } else {
                rdd.sparkContext.emptyRDD[(String, String)]
            }
        })
        filteredDStream.print()

        ssc.start()
        ssc.awaitTermination()
    }
}
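One point worth noting about this pattern: the function passed to transform is evaluated once per batch interval when the job for that batch is generated, so a relatively static dataset such as the blacklist RDD can even be refreshed between batches if it needs to change over time.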

updateStateByKey
    The updateStateByKey operator expresses, for a given key, the state accumulated up to the present. That state is built from the previous state and the new values arriving in the current batch. To implement the state update, two things must be done (a minimal sketch follows the list):
1. Define the state.
2. Define the state update function.
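A minimal sketch of these two steps, assuming a DStream of (String, Int) pairs (pairDStream and updateFunc are illustrative names; the full example follows below):

    // 1. the state: a running Int total per key, wrapped in Option
    // 2. the update function: fold the current batch's values into the previous state
    val updateFunc: (Seq[Int], Option[Int]) => Option[Int] =
        (currentValues, previousState) => Some(currentValues.sum + previousState.getOrElse(0))
    // pairDStream.updateStateByKey(updateFunc)   // requires ssc.checkpoint(...) to be set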
Case: compute the total sales amount of old李商城 (old-Li's mall) from the beginning up to the present.
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

/**
  * updateStateByKey
  * Total sales amount of old李商城 up to the present.
  */
object _04SparkStreamingUpdateStateByKeyOps {
    def main(args: Array[String]): Unit = {
        Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
        Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
        Logger.getLogger("org.spark-project").setLevel(Level.WARN)
        val conf = new SparkConf()
            .setAppName("UpdateStateByKey")
            .setMaster("local[*]")
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(2))
        // directory used to store the historical state (checkpoint)
        ssc.checkpoint("file:///E:/data/spark/streaming/usb")

        // each input line has a single column: the amount
        val numDS = ssc.socketTextStream("bigdata01", 9999)

        val amountDS = numDS.map(num => ("totalAmount", num.toInt))
        val rbkDS: DStream[(String, Int)] = amountDS.reduceByKey(_+_)    // data of the current batch

        // merge the historical state with the data of the current batch
        val usbDS: DStream[(String, Int)] = rbkDS.updateStateByKey((current: Seq[Int], history: Option[Int]) => {
            Option[Int](current.sum + history.getOrElse(0))
        })

        usbDS.foreachRDD((rdd, bTime) => {
            if(!rdd.isEmpty()) {
                println("-------------------------------------------")
                println(s"Time: $bTime")
                rdd.foreach{case (key, amt) => {
                    println("Total sales amount of old李商城 so far: " + amt)
                }}
                println("-------------------------------------------")
            }
        })

        ssc.start()
        ssc.awaitTermination()
    }
}
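A small practical note: the example checkpoints to a local file:/// path, which is convenient for local testing; on a cluster the checkpoint directory would normally point to a fault-tolerant location such as an HDFS path, since updateStateByKey relies on checkpointing to persist and recover its state.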

Window operations: related code
Example: every 4 seconds, compute statistics over the data produced during the past 6 seconds, with a batch interval of 2 s. The window length (6 s) and the sliding interval (4 s) must both be integer multiples of the batch interval.
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

object _01SparkStreamingWindowOps {
    def main(args: Array[String]): Unit = {
        if(args == null || args.length < 3) {
            println(
                """Parameter Errors! Usage: <batchInterval> <host> <port>
                  |batchInterval:       how often a streaming batch is launched
                  |host:                hostname to listen to
                  |port:                port to listen to
                """.stripMargin)
            System.exit(-1)
        }
        // pattern matching to extract the arguments
        val Array(batchInterval, host, port) = args
        val conf = new SparkConf()
            .setAppName("StreamingWindow")
            .setMaster("local[2]")
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
        ssc.checkpoint("file:///E:/data/spark/streaming/window-ck")
        // load external data ---> socket (network) data
        val linesDStream: ReceiverInputDStream[String] = ssc.socketTextStream(host, port.toInt)

        val wordsDStream: DStream[String] = linesDStream.flatMap(line => {
            line.split("\\s+")
        })
        val rbkDStream: DStream[(String, Int)] = wordsDStream
            .map(word => (word, 1))
            .reduceByKeyAndWindow(
                (v1: Int, v2: Int) => v1 + v2,
                Seconds(batchInterval.toLong * 3),  // window length: 6 s with a 2 s batch interval
                Seconds(batchInterval.toLong * 2))  // sliding interval: 4 s with a 2 s batch interval
        rbkDStream.print()
        ssc.start()                // must be called, otherwise the job will never run
        ssc.awaitTermination()     // blocks until the streaming context is stopped or fails
        ssc.stop(false, true)      // stopSparkContext = false, stopGracefully = true
    }
}
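For larger windows there is also an overload of reduceByKeyAndWindow that takes an inverse reduce function, so that each slide only adds the batches entering the window and subtracts those leaving it instead of re-reducing the whole window. A minimal sketch, continuing with the wordsDStream and batchInterval from the example above (this variant does require checkpointing, which is already enabled there):

    val incRbkDStream: DStream[(String, Int)] = wordsDStream
        .map(word => (word, 1))
        .reduceByKeyAndWindow(
            (v1: Int, v2: Int) => v1 + v2,    // reduce: values entering the window
            (v1: Int, v2: Int) => v1 - v2,    // inverse reduce: values leaving the window
            Seconds(batchInterval.toLong * 3),
            Seconds(batchInterval.toLong * 2))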

15. Spark Streaming custom Receiver
import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.receiver.Receiver

object _04SparkStreamingCustomReceiverOps {
    def main(args: Array[String]): Unit = {
        if(args == null || args.length < 3) {
            println(
                """Parameter Errors! Usage: <batchInterval> <host> <port>
                  |batchInterval:       how often a streaming batch is launched
                  |host:                hostname to listen to
                  |port:                port to listen to
                """.stripMargin)
            System.exit(-1)
        }
        // pattern matching to extract the arguments
        val Array(batchInterval, host, port) = args
        val conf = new SparkConf()
            .setAppName("StreamingCustomReceiver")
            .setMaster("local[2]")
        val sc = new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
        // load external data through the custom receiver defined below
        val linesDStream: ReceiverInputDStream[String] =
            ssc.receiverStream(new MyNetWorkReceiver(host, port.toInt))
        val wordsDStream: DStream[String] = linesDStream.flatMap(line => {
            line.split("\\s+")
        })
        val rbkDStream: DStream[(String, Int)] = wordsDStream
            .map(word => (word, 1))
            .reduceByKeyAndWindow(
                (v1: Int, v2: Int) => v1 + v2,
                Seconds(batchInterval.toLong * 3),  // window length
                Seconds(batchInterval.toLong * 2))  // sliding interval
        rbkDStream.print()
        ssc.start()                // must be called, otherwise the job will never run
        ssc.awaitTermination()     // blocks until the streaming context is stopped or fails
        ssc.stop(false, true)      // stopSparkContext = false, stopGracefully = true
    }
}

/**
  * Custom receiver that listens on a network socket.
  * @param host          hostname to connect to
  * @param port          port to connect to
  * @param storageLevel  storage level used when storing the received data
  */
class MyNetWorkReceiver(host: String,
                        port: Int,
                        storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER)
    extends Receiver[String](storageLevel) {

    @volatile private var socket: Socket = _

    /**
      * All resources needed by this receiver must be initialized in this method.
      */
    override def onStart(): Unit = {
        socket = new Socket(host, port)
        val thread = new Thread() {
            override def run(): Unit = {
                val br = new BufferedReader(new InputStreamReader(socket.getInputStream))
                // note: the Java idiom `while ((line = br.readLine()) != null)` does not
                // work in Scala because assignment returns Unit, so read then test instead
                var line = br.readLine()
                while (line != null) {
                    store(line)
                    line = br.readLine()
                }
            }
        }
        thread.setDaemon(true)
        thread.start()
    }

    /**
      * Release the resources acquired in onStart.
      */
    override def onStop(): Unit = {
        if (socket != null) {
            socket.close()
            socket = null
        }
    }
}
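A note on the master setting used above: each receiver occupies one core for the lifetime of the application, so when running locally the master must be local[n] with n greater than the number of receivers (hence local[2] here); otherwise only the receiver thread runs and no batches ever get processed.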
