pom文件
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven descriptor for the "spark-streaming" module, a child of the com.smj:test parent project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<!-- Parent coordinates. This module declares no groupId or version of its own,
     so both are inherited from the parent. -->
<parent>
<artifactId>test</artifactId>
<groupId>com.smj</groupId>
<version>1.0-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>spark-streaming</artifactId>
<dependencies>
<!-- Spark Streaming 2.2.2. The "_2.11" suffix selects the artifact built for Scala 2.11,
     so the module's Scala sources must be compiled with a 2.11.x compiler.
     NOTE(review): no Scala compiler plugin or scala-library dependency is declared here;
     presumably they are configured in the parent POM - verify. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.2.2</version>
</dependency>
</dependencies>
</project>
源码
package wordcount
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}
/**
 * Socket-based Spark Streaming word count.
 *
 * Connects to a TCP text source, splits every received line on commas and, for each
 * batch interval, prints how many times each token occurred within that batch
 * (counts are per batch, not cumulative).
 *
 * Command line: `<batchInterval> <hostname> <port>`, where `batchInterval` is a positive
 * number of seconds and `port` is a TCP port in the range 1-65535. Any arguments after
 * the third are ignored.
 */
object SparkStreamingWordCount {

  /** Message printed to stdout when the command-line arguments are rejected. */
  private val Usage: String =
    """
      |Parameter Errors! Usage: <batchInterval> <hostname> <port>
      |""".stripMargin

  /**
   * Validates and converts the raw command-line arguments.
   *
   * @param args raw program arguments; may be `null`
   * @return `Some((batchSeconds, hostname, port))` when at least three arguments are
   *         present, the first is a positive integer and the third is a valid TCP port;
   *         `None` otherwise
   */
  private def parseArgs(args: Array[String]): Option[(Long, String, Int)] =
    Option(args).flatMap {
      // `_*` tolerates surplus arguments; an exact three-element pattern would throw a
      // MatchError for four or more arguments.
      case Array(biStr, hostname, port, _*) =>
        for {
          seconds <- scala.util.Try(biStr.toLong).toOption if seconds > 0
          portNumber <- scala.util.Try(port.toInt).toOption
          if portNumber > 0 && portNumber <= 65535
        } yield (seconds, hostname, portNumber)
      case _ => None
    }

  /**
   * Program entry point: builds the streaming job and blocks until it terminates.
   *
   * Prints the usage message and exits with status -1 when the arguments are invalid.
   *
   * @param args `<batchInterval> <hostname> <port>`
   */
  def main(args: Array[String]): Unit = {
    val (batchSeconds, hostname, port) = parseArgs(args).getOrElse {
      println(Usage)
      sys.exit(-1)
    }

    /*
     * A StreamingContext needs at least a SparkConf and a batch duration.
     * The batch duration is the interval between two job submissions: the data received
     * during each interval is turned into one batch (an RDD) and processed, i.e. the
     * computation runs once per interval.
     *
     * The master defaults to local[2] only when none was supplied externally (for example
     * through spark-submit), so the same jar also runs on a cluster. Two local threads are
     * used because the socket receiver permanently occupies one of them.
     */
    val conf = new SparkConf()
      .setAppName("SparkStreamingWordCount")
      .setIfMissing("spark.master", "local[2]")
    val batchInterval: Duration = Seconds(batchSeconds)
    val ssc = new StreamingContext(conf, batchInterval)

    /*
     * Load the external data as a DStream, the streaming programming model.
     * Received data is persisted with MEMORY_AND_DISK_SER_2 (serialized, replicated twice):
     * stream data is produced and consumed continuously, so if a node fails the data can
     * hardly be fetched from the source again; hence it is persisted and replicated.
     */
    val linesDStream: ReceiverInputDStream[String] =
      ssc.socketTextStream(hostname, port, StorageLevel.MEMORY_AND_DISK_SER_2)

    // Transformations on a DStream are written just like the corresponding RDD operations.
    val retDStream: DStream[(String, Int)] =
      linesDStream
        .flatMap(_.split(","))
        .map((_, 1))
        .reduceByKey(_ + _)
    retDStream.print()

    // Start the streaming computation.
    ssc.start()
    println("------ print ------")
    // Block the main thread so the streaming job keeps running until it is stopped.
    ssc.awaitTermination()
  }
}