package spark
/**
 * Monitor data arriving over the network, filter the words in the stream against a blacklist held in a
 * broadcast variable, and use an accumulator to count how many records were filtered out.
 */
import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.Accumulator
/**
 * Singleton object WordBlacklist, used to lazily register the broadcast variable.
 */
object WordBlacklist {
  @volatile private var instance: Broadcast[Seq[String]] = null
  def getInstance(sc: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized { // double-checked locking
        if (instance == null) {
          val wordBlacklist = Seq("hello", "world") // the blacklisted words
          instance = sc.broadcast(wordBlacklist)    // once created, the broadcast variable is shipped to every executor
        }
      }
    }
    instance
  }
}
/**
 * Singleton object DroppedWordsCounter, used to lazily register the accumulator.
 */
object DroppedWordsCounter {
  @volatile private var instance: Accumulator[Int] = null
  def getInstance(sc: SparkContext): Accumulator[Int] = {
    if (instance == null) { // create the accumulator only if it does not exist yet
      synchronized {
        if (instance == null) {
          instance = sc.accumulator(0) // counts how many occurrences of blacklisted words were dropped
        }
      }
    }
    instance
  }
}
object Streaming_AccumulatorAndBroadcast {
  // Encapsulates the business logic and creates the StreamingContext.
  def createContext(ip: String, port: Int, outputPath: String, checkpointDirectory: String) // takes four parameters
    : StreamingContext = {
    // If "Creating new context" is not printed, the StreamingContext was recovered from the checkpoint directory instead of being created here.
    println("Creating new context")
    // Create the StreamingContext with a batch interval of 5 seconds.
    val sparkConf = new SparkConf().setAppName("Streaming_AccumulatorAndBroadcast").setMaster("local[2]")
    val ssc = new StreamingContext(sparkConf, Seconds(5))
    ssc.checkpoint(checkpointDirectory)
    // Create a socket stream on ip:port; lines are delimited by '\n', and the words within a line by ','.
    val lines = ssc.socketTextStream(ip, port)
    val words = lines.flatMap(_.split(","))
    val wordCounts = words.map((_, 1)).reduceByKey(_ + _) // map each word to a (word, 1) pair and sum the counts per key
    // e.g. wordCounts = {RDD1={(hello,2),(spark,1)}, RDD2={(world,1),(spark,3)}, ...}
    wordCounts.foreachRDD { rdd =>
      // Get or register the blacklist broadcast variable: Broadcast(Seq("hello", "world")).
      // rdd.sparkContext returns the SparkContext this RDD belongs to.
      val blacklist = WordBlacklist.getInstance(rdd.sparkContext)
      // Get or register the dropped-words accumulator, initialised to 0.
      val droppedWordsCounter = DroppedWordsCounter.getInstance(rdd.sparkContext)
      // Filter the words against the blacklist, counting the dropped occurrences with droppedWordsCounter.
      val filteredRdd = rdd.filter { case (word, count) =>
        if (blacklist.value.contains(word)) { // the word is "hello" or "world"
          droppedWordsCounter.add(count)
          println("the word: " + word + " is deleted " + count + " times")
          false // drop this pair
        } else {
          true  // keep this pair
        }
      }
      filteredRdd.saveAsTextFile(outputPath) // save the filtered counts
      filteredRdd.foreach(println)           // also print them
      println("the accumulator is " + droppedWordsCounter.value + "!!!!!!!!!!!!!") // print the accumulator's running total
    }
    ssc // return the new StreamingContext
  }
  def main(args: Array[String]) {
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)
    if (args.length != 4) {
      System.err.println("Your arguments were " + args.mkString("[", ", ", "]"))
      System.exit(1)
    }
    // The corresponding directories must be created in advance!
    val Array(ip, port, outputPath, checkpointDirectory) = args
    // Recover the StreamingContext from the checkpoint directory, or create a new one if no checkpoint exists.
    val ssc = StreamingContext.getOrCreate(checkpointDirectory,
      () => createContext(ip, port.toInt, outputPath, checkpointDirectory))
    ssc.start()
    ssc.awaitTermination()
  }
}
When running the program, the four run parameters (ip, port, output path and checkpoint path, separated by spaces) have to be configured manually in the IDE.
As in the earlier post, the simulator and its source file are used to send data on port 9999 (a rough sketch of such a simulator is given below).
When the main program is started, the simulator connects and begins sending data; the main program listens on the socket, processes each batch, and prints the filtered word counts together with the accumulator total.
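The simulator from the earlier post is not reproduced here. As a rough stand-in, the following sketch opens a server socket and writes comma-separated lines of words, matching the split(",") used in the job; the object name WordSimulator, the word list, the five-words-per-line layout and the one-second emit rate are assumptions for illustration, only port 9999 comes from the text above.

import java.io.PrintWriter
import java.net.ServerSocket

object WordSimulator {
  def main(args: Array[String]): Unit = {
    val words = Seq("hello", "world", "spark", "streaming") // assumed word list
    val server = new ServerSocket(9999)        // the port the streaming job connects to
    println("Waiting for a connection on port 9999 ...")
    val socket = server.accept()               // block until the streaming job connects
    val out = new PrintWriter(socket.getOutputStream, true)
    val rand = new scala.util.Random
    while (true) {
      // emit one comma-separated line per second, e.g. "hello,spark,world,hello,streaming"
      val line = Seq.fill(5)(words(rand.nextInt(words.length))).mkString(",")
      out.println(line)
      Thread.sleep(1000)
    }
  }
}

With such a simulator running, a possible set of run parameters (the paths are only examples) would be: localhost 9999 /tmp/streaming/output /tmp/streaming/checkpoint.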