Dependencies
Choose the artifact suffix that matches your Scala version:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.12</artifactId>
    <version>3.1.1</version>
</dependency>
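For sbt projects, the equivalent coordinate would look roughly like the line below; this is a sketch assuming Scala 2.12 and the same Spark version as the Maven snippet (%% appends the Scala binary suffix to the artifact name automatically).

// build.sbt sketch (assumption: Scala 2.12, Spark 3.1.1, matching the Maven snippet above)
libraryDependencies += "org.apache.spark" %% "spark-streaming" % "3.1.1"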
1. Word Count
1.1 Spark processing code
package ace.gjh.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Duration, StreamingContext}

/**
 * Spark Streaming - word count<br>
 * Data source: nc (netcat)
 *
 * @author ACE_GJH
 * @date 2021/5/5
 */
object StreamingWordCount {
  def main(args: Array[String]): Unit = {
    // Create the Spark Streaming configuration
    val conf = new SparkConf()
      .setAppName("Streaming-WordCount")
      .setMaster("local[*]")
    // Batch interval of 2 seconds
    val context = new StreamingContext(conf, new Duration(2000))
    // Set the log level to reduce console noise
    context.sparkContext.setLogLevel("ERROR")
    // Read lines from the socket source
    val stream = context.socketTextStream("localhost", 9999)
    stream
      .flatMap(_.split(" "))       // split each line into words
      .map((_, 1))                 // pair each word with a count of 1
      .groupByKey()                // group the 1s by word
      .map(t => (t._1, t._2.size)) // count occurrences per word
      .print(20)                   // print up to 20 results per batch
    // Start the streaming application
    context.start()
    // Block until the application is terminated
    context.awaitTermination()
  }
}
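As a side note, the same per-batch count can be written with reduceByKey, which sums the counts per word directly instead of grouping all the 1s and taking the group size. Below is a minimal sketch of that variant; the object name StreamingWordCountReduce and its app name are illustrative placeholders, not part of the original code.

package ace.gjh.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sketch: same word count using reduceByKey instead of groupByKey.
// reduceByKey sums the per-word counts without collecting each word's
// full group of 1s, which keeps less data in the shuffle.
object StreamingWordCountReduce {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Streaming-WordCount-Reduce") // placeholder name
      .setMaster("local[*]")
    val context = new StreamingContext(conf, Seconds(2))
    context.sparkContext.setLogLevel("ERROR")

    context.socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _) // sum counts per word within each batch
      .print(20)

    context.start()
    context.awaitTermination()
  }
}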
1.2 Data source
nc -lk 9999
This starts a netcat listener on port 9999 (-l to listen, -k to keep listening after a client disconnects); the Spark application connects to it and treats each line you type as input.
2. Word Count Leaderboard
2.1 Spark processing code
package ace.gjh.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}

/**
 * Spark Streaming - word count leaderboard over a sliding window<br>
 * Data source: nc (netcat)
 *
 * @author ACE_GJH
 * @date 2021/5/5
 */
object StreamingWordCountAndWindow {
  def main(args: Array[String]): Unit = {
    // Create the Spark Streaming configuration
    val conf = new SparkConf()
      .setAppName("Streaming-WordCount")
      .setMaster("local[*]")
    // Batch interval of 1 second
    val context = new StreamingContext(conf, new Duration(1000))
    // Set the log level to reduce console noise
    context.sparkContext.setLogLevel("ERROR")
    // Read lines from the socket source
    val stream = context.socketTextStream("localhost", 9999)
    stream
      .flatMap(_.split(" "))       // split each line into words
      .map((_, 1))                 // pair each word with a count of 1
      .window(Seconds(15))         // only consider the last 15 seconds of data
      .groupByKey()                // group the 1s by word
      .map(t => (t._1, t._2.size)) // count occurrences per word in the window
      .foreachRDD(rdd => {
        // Sort by count in descending order and keep the top 3
        val tuples = rdd
          .sortBy(_._2, false, 1)
          .take(3)
        for (elem <- tuples) {
          println(elem._1 + " " + elem._2)
        }
      })
    // Start the streaming application
    context.start()
    // Block until the application is terminated
    context.awaitTermination()
  }
}
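A related alternative is reduceByKeyAndWindow, which computes the per-word counts over the window in a single step rather than windowing the raw pairs and grouping them afterwards. The sketch below is only an assumption-based variant (the object and app names are placeholders); it should produce the same top-3 ranking as the code above.

package ace.gjh.streaming

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sketch: 15-second top-3 word ranking using reduceByKeyAndWindow.
object StreamingTopWordsWindow {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("Streaming-TopWords-Window") // placeholder name
      .setMaster("local[*]")
    val context = new StreamingContext(conf, Seconds(1))
    context.sparkContext.setLogLevel("ERROR")

    context.socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .map((_, 1))
      // per-word counts over the last 15 seconds, sliding every batch
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(15))
      .foreachRDD { rdd =>
        rdd.sortBy(_._2, ascending = false, numPartitions = 1)
          .take(3)
          .foreach { case (word, count) => println(word + " " + count) }
      }

    context.start()
    context.awaitTermination()
  }
}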
2.2 Data source
nc -lk 9999