本文主要介绍如何使用 Spark Streaming 的 transform 操作，实时过滤黑名单用户的广告点击日志。
import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds

/**
 * Real-time blacklist filtering of an ad-click log stream with Spark Streaming.
 *
 * Text lines are read from a socket, keyed by their second space-separated
 * field, left-outer-joined against a static blacklist RDD inside `transform`,
 * and only the lines whose key is not flagged in the blacklist are printed.
 *
 * @author jhp
 */
object TransformBlacklist {

  /**
   * Pairs a raw log line with its join key (the second space-separated field).
   *
   * @param adsClickLog one raw line received from the socket
   * @return `Some((key, line))` when the line has at least two fields,
   *         `None` otherwise, so a malformed line is dropped instead of
   *         throwing `ArrayIndexOutOfBoundsException` and failing the batch
   */
  private def keyByUser(adsClickLog: String): Option[(String, String)] =
    adsClickLog.split(" ") match {
      case Array(_, user, _*) => Some((user, adsClickLog))
      case _                  => None
    }

  /**
   * Builds the streaming pipeline, starts it and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Default to a local master only when none was supplied externally
    // (e.g. via spark-submit --master); a hard-coded setMaster would override it.
    val conf = new SparkConf()
      .setIfMissing("spark.master", "local[2]")
      .setAppName("TransformBlacklist")

    // Create the streaming context with a 5-second batch interval.
    val ssc = new StreamingContext(conf, Seconds(5))

    // Static blacklist as (key, isBlacklisted) pairs, turned into an RDD for joining.
    val blacklist = Array(("tom", true))
    val blacklistRDD = ssc.sparkContext.parallelize(blacklist, 5)

    val adsClickLogDStream = ssc.socketTextStream("spark1", 9999)

    // Key every well-formed line by its second field; malformed lines are skipped.
    val userAdsClickLogDStream = adsClickLogDStream
      .flatMap(adsClickLog => keyByUser(adsClickLog).iterator)

    // transform exposes each batch's RDD so it can be joined with the blacklist RDD.
    val validAdsClickLogDStream = userAdsClickLogDStream.transform { userAdsClickLogRDD =>
      userAdsClickLogRDD
        .leftOuterJoin(blacklistRDD)
        // Keys absent from the blacklist join to None and are treated as not blacklisted.
        .filter { case (_, (_, blacklisted)) => !blacklisted.getOrElse(false) }
        // Keep only the original log line.
        .map { case (_, (adsClickLog, _)) => adsClickLog }
    }

    validAdsClickLogDStream.print()

    // Start the computation and wait for it to stop.
    ssc.start()
    ssc.awaitTermination()
  }
}