package spark.SparkStreaming.file
import org.apache.spark.SparkContext
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
/**
 * Description: counts word occurrences in files that arrive in real time under a
 * given HDFS directory (note: only files created after the job starts are picked
 * up, not pre-existing ones).
 */
object test {
//Spark recommends an explicit main() method over extending scala.App,
//whose delayed initialization may not work correctly with Spark
def main(args: Array[String]): Unit = {
//Build a local SparkSession; its SparkContext backs the StreamingContext below
val spark: SparkSession = SparkSession.builder()
.appName(test.getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
val sc: SparkContext = spark.sparkContext
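//Micro-batch interval: input is grouped and processed every 2 seconds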
val ssc: StreamingContext = new StreamingContext(sc, Seconds(2))
//Build the DStream pipeline and print the word counts of each batch
ssc.textFileStream("hdfs://mini1:9000/spark-streaming/wc") //← the HDFS directory to monitor
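//Tokenize each line on runs of whitespace and drop empty tokens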
.flatMap(_.split("\\s+"))
.filter(_.nonEmpty)
.map((_, 1))
.reduceByKey(_ + _) //sum the 1s per word; without this the stream prints pairs, not counts
.print(100) //show up to 100 results for each batch
//Start the Spark Streaming application
ssc.start()
//Wait for termination (required; otherwise main() returns and the job stops immediately)
ssc.awaitTermination()
}
}
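/*
 * A minimal way to exercise this job (the file name below is illustrative): while
 * the job is running, create a NEW file in the monitored directory, e.g.
 *   hdfs dfs -put words.txt hdfs://mini1:9000/spark-streaming/wc/
 * textFileStream only processes files created in the directory after the job
 * starts, matching the "real-time files, not historical files" note above.
 */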