import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

/**
 * Spark Streaming word count over network socket data.
 * The entry point of a Spark Streaming program is StreamingContext.
 * Listens to the data on port 9999 of the host bigdata01
 * and computes the statistics every 2 seconds.
 */
object _01SparkStreamingWordCountNetWorkOps {
def main(args: Array[String]): Unit = {
if(args == null || args.length < 3) {
println(
"""Parameter Errors! Usage: <batchInterval> <host> <port>
|batchInterval: how often to start a streaming batch
|host: hostname of the socket server
|port: port of the socket server
""".stripMargin)
System.exit(-1)
}
// pattern matching (extractor) on the args array
val Array(batchInterval, host, port) = args
val conf = new SparkConf()
.setAppName("WordCountNetWork")
.setMaster("local[*]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
// load external data: a socket text stream
val linesDStream:ReceiverInputDStream[String] =
ssc.socketTextStream(host, port.toInt)
// linesDStream.print()
val wordsDStream:DStream[String] = linesDStream.flatMap(line => {
line.split("\\s+")
})
val rbkDStream:DStream[(String, Int)] = wordsDStream
.map(word => (word, 1))
.reduceByKey(_ + _)
rbkDStream.print()
ssc.start() // must be called, otherwise the job will not run
ssc.awaitTermination()
}
}
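To try this example, first start a plain socket server on bigdata01, e.g. with nc -lk 9999, and type some words into it; every batchInterval seconds the program prints the word counts of that batch.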
Case 2: Spark Streaming reading newly added files from an HDFS directory
(1) Source code
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

object _02SparkStreamingWCHDFSOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
if(args == null || args.length < 2) {
println(
"""Parameter Errors!Usage: <batchInterval> <inputpath>
|batchInterval: batchInterval
|inputpath: inputpath
""".stripMargin)
System.exit(-1)
}
val Array(batchInterval, inputpath) = args
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("SparkStreamingWCHDFS")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
val fileStream:DStream[String] = ssc.textFileStream(inputpath)
val retDStream:DStream[(String, Int)] =
fileStream.flatMap(_.split("\\s+")).map((_, 1)).reduceByKey(_+_)
retDStream.print()
ssc.start()
ssc.awaitTermination()
}
}
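Note that textFileStream only processes files that appear in the monitored directory after the job has started, and each file is read once, when it is first seen: files that already exist at startup, or that are appended to in place, are ignored, so new data should be written elsewhere and then atomically moved into the directory.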
The transform operator exists for joining a DStream with other datasets (RDD, Dataset, DataFrame) in scenarios where the DStream API has no direct operation, for example joining a DStream with an RDD.
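A minimal sketch of the idea (the names events and staticRDD are hypothetical): transform exposes each micro-batch as an RDD, so any RDD-to-RDD operation, including a join against a static RDD, becomes available.

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream

// Join every micro-batch of a keyed DStream against a static, pre-loaded RDD.
def joinWithStatic(events: DStream[(String, String)],
staticRDD: RDD[(String, Boolean)]): DStream[(String, (String, Option[Boolean]))] =
events.transform(rdd => rdd.leftOuterJoin(staticRDD))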
Case study: online blacklist filtering
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

/**
 * Online ad blacklist filtering.
 * The blacklist is relatively static data that must be matched against the
 * records continuously flowing in from Kafka, in order to filter out
 * blacklisted records and protect the users' interests.
 * The blacklist is assumed to be a static result set loaded as an RDD,
 * while the data arrives as a streaming DStream, so we need to join
 * an RDD with a DStream.
 * DStream offers no direct API for this, so we have to use transform.
 */
object _03SparkStreamingRealTimeBlacklistFilterOps {
def main(args: Array[String]): Unit = {
val conf = new SparkConf()
.setAppName("RealTimeBlacklistFilter")
.setMaster("local[*]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(2))
// build the blacklist
val blacklist = List(
("27.19.74.143", true),
("110.52.250.126", true)
)
val blacklistRDD:RDD[(String, Boolean)] = sc.parallelize(blacklist)
/**
 * We filter by IP here. Sample input lines:
 * 27.19.74.143##2016-05-30 17:38:20##GET /static/image/common/faq.gif HTTP/1.1##200##1127
 * 110.52.250.126##2016-05-30 17:38:20##GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1##200##1292
 */
val lines = ssc.socketTextStream("bigdata01", 9999)
val ip2InfoDStream:DStream[(String, String)] = lines.map(line => {
val fields = line.split("##")
(fields(0), line.substring(line.indexOf("##") + 2))
})
// For the blacklist filtering, records present in both blacklistRDD and
// ip2InfoDStream must be dropped, i.e. records for which the join finds a match.
val filteredDStream:DStream[(String, String)] =
ip2InfoDStream.transform(rdd => {
if(!rdd.isEmpty()) {
/*
 * left outer join: every row of the left table is kept; a row of the
 * right table shows up only when it matches, otherwise None.
 * In this case the rows that match in both tables are exactly the
 * blacklisted records, which must be filtered out.
 */
val joinedRDD:RDD[(String, (String, Option[Boolean]))] =
rdd.leftOuterJoin(blacklistRDD)
joinedRDD.filter{case (ip, (line, option)) =>
option.isEmpty // keep only records with no blacklist match
}.map{case (ip, (line, option)) => (ip, line)}
} else {
rdd.sparkContext.emptyRDD[(String, String)]
}
})
filteredDStream.print()
ssc.start()
ssc.awaitTermination()
}
}
updateStateByKey
The updateStateByKey operator expresses a running aggregate: for each key, the state accumulated up to the current batch. That state is built from the previous state and the new values of the current batch. Implementing the update requires two steps:
1. define the state
2. define the state update function (a sketch follows below)
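A minimal sketch of such an update function, assuming (as in the example below) that the state per key is a running Int total:

// current: the values seen for this key in the current batch
// history: the state accumulated over previous batches; None for a new key
// returning None instead would remove the key from the state
def updateFunc(current: Seq[Int], history: Option[Int]): Option[Int] =
Some(current.sum + history.getOrElse(0))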
Case study: compute the cumulative merchandise sales of the old李商城 shop up to now.
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

/**
 * updateStateByKey:
 * compute the cumulative merchandise sales of old李商城 up to now.
 */
object _04SparkStreamingUpdateStateByKeyOps {
def main(args: Array[String]): Unit = {
Logger.getLogger("org.apache.hadoop").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.spark-project").setLevel(Level.WARN)
val conf = new SparkConf()
.setAppName("RealTimeBlacklistFilter")
.setMaster("local[*]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(2))
// directory holding the checkpointed historical state
ssc.checkpoint("file:///E:/data/spark/streaming/usb")
// the input has a single column: the sale amount
val numDS = ssc.socketTextStream("bigdata01", 9999)
val amountDS = numDS.map(num => ("totalAmount", num.toInt))
val rbkDS:DStream[(String, Int)] = amountDS.reduceByKey(_+_) // the current batch only
// merge the historical state with the current batch
val usbDS:DStream[(String, Int)] =
rbkDS.updateStateByKey((current:Seq[Int], history:Option[Int]) => {
Option[Int](current.sum + history.getOrElse(0))
})
usbDS.foreachRDD((rdd, bTime) => {
if(!rdd.isEmpty()) {
println("-------------------------------------------")
println(s"Time: $bTime")
rdd.foreach{case (key, amt) =>
println(s"Cumulative sales of old李商城 so far: $amt")
}
println("-------------------------------------------")
}
})
ssc.start()
ssc.awaitTermination()
}
}
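Because updateStateByKey carries the full state forward from batch to batch, setting a checkpoint directory through ssc.checkpoint(...) is mandatory here; in production it would normally point at a reliable store such as HDFS rather than a local file:/// path.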
Window operations: related code
Example: every 4 seconds, count the data produced during the last 6 seconds, with a batchInterval of 2s (so the window length is 3 batches and the sliding interval is 2 batches).
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}

object _01SparkStreamingWindowOps {
def main(args: Array[String]): Unit = {
if(args == null || args.length < 3) {
println(
"""Parameter Errors! Usage: <batchInterval> <host> <port>
|batchInterval: how often to start a streaming batch
|host: hostname of the socket server
|port: port of the socket server
""".stripMargin)
System.exit(-1)
}
// pattern matching (extractor) on the args array
val Array(batchInterval, host, port) = args
val conf = new SparkConf()
.setAppName("StreamingWindow")
.setMaster("local[2]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
ssc.checkpoint("file:///E:/data/spark/streaming/window-ck")
// load external data: a socket text stream
val linesDStream:ReceiverInputDStream[String] =
ssc.socketTextStream(host, port.toInt)
// linesDStream.print()
val wordsDStream:DStream[String] = linesDStream.flatMap(line => {
line.split("\\s+")
})
val rbkDStream:DStream[(String, Int)] = wordsDStream
.map(word => (word, 1))
.reduceByKeyAndWindow(
(v1:Int, v2:Int) => v1 + v2,
Seconds(batchInterval.toLong * 3), //window length
Seconds(batchInterval.toLong * 2))//sliding interval
rbkDStream.print()
ssc.start() // must be called, otherwise the job will not run
ssc.awaitTermination() // blocks until the streaming context is stopped or terminates abnormally
ssc.stop(false, true) // stopSparkContext = false, stopGracefully = true
}
}
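A general rule for window operations: both the window length and the sliding interval must be integer multiples of the batch interval, which is why the code above derives them as batchInterval * 3 (6s) and batchInterval * 2 (4s).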
15. Spark Streaming custom Receiver
import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.receiver.Receiver

object _04SparkStreamingCustomReceiverOps {
def main(args: Array[String]): Unit = {
if(args == null || args.length < 3) {
println(
"""Parameter Errors! Usage: <batchInterval> <host> <port>
|batchInterval: how often to start a streaming batch
|host: hostname of the socket server
|port: port of the socket server
""".stripMargin)
System.exit(-1)
}
// pattern matching (extractor) on the args array
val Array(batchInterval, host, port) = args
val conf = new SparkConf()
.setAppName("StreamingCustomReceiver")
.setMaster("local[2]")
val sc = new SparkContext(conf)
val ssc = new StreamingContext(sc, Seconds(batchInterval.toLong))
// load external data through the custom socket receiver
val linesDStream:ReceiverInputDStream[String] =
ssc.receiverStream(new MyNetWorkReceiver(host, port.toInt))
// linesDStream.print()
val wordsDStream:DStream[String] = linesDStream.flatMap(line => {
line.split("\\s+")
})
val rbkDStream:DStream[(String, Int)] = wordsDStream
.map(word => (word, 1))
.reduceByKeyAndWindow(
(v1:Int, v2:Int) => v1 + v2,
Seconds(batchInterval.toLong * 3), //window length
Seconds(batchInterval.toLong * 2))//sliding interval
rbkDStream.print()
ssc.start() // must be called, otherwise the job will not run
ssc.awaitTermination() // blocks until the streaming context is stopped or terminates abnormally
ssc.stop(false, true) // stopSparkContext = false, stopGracefully = true
}
}
/**
 * Listens on a network socket.
 * @param host hostname to connect to
 * @param port port to connect to
 * @param storageLevel storage level used for the received blocks
 */
class MyNetWorkReceiver(host: String,
port: Int,
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER)
extends Receiver[String](storageLevel) {
// kept as a field so that onStop() can close it
private var socket: Socket = _
/**
 * All resources this receiver needs must be initialized in this method.
 * onStart() has to return quickly, so the blocking read loop runs on its own thread.
 */
override def onStart(): Unit = {
socket = new Socket(host, port)
val thread = new Thread() {
override def run(): Unit = {
val br = new BufferedReader(new InputStreamReader(socket.getInputStream))
// read line by line until the stream is closed
var line = br.readLine()
while (line != null) {
store(line) // hand each received line over to Spark
line = br.readLine()
}
}
}
thread.setDaemon(true)
thread.start()
}
override def onStop(): Unit = {
// release the connection when the receiver is stopped
if (socket != null) {
socket.close()
}
}
}
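Two design points are worth noting: store() is the Receiver API that hands received records to Spark, which buffers them according to the StorageLevel passed to the constructor, and marking the reader thread as a daemon keeps it from blocking JVM shutdown when the receiver is stopped.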