package com.bynear.spark_Streaming;
import com.google.common.base.Optional;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
/**
 * Spark Streaming example (2018/5/15): the {@code transform} operation applied to
 * real-time blacklist filtering of ad-click logs.
 *
 * <p>Users click ads on the site and every click is billed. Clicks coming from
 * blacklisted users (click-fraud accounts) must be dropped before billing. Each
 * input line uses the simplified format {@code "date username"}.
 */
public class Transform {

    /**
     * Extracts the username (the second whitespace-separated field) from a click-log line.
     *
     * <p>The line is trimmed and split on runs of whitespace, so extra spaces or tabs
     * between the fields cannot produce an empty username that would slip past the
     * blacklist join.
     *
     * @param log raw log line, expected as {@code "date username"}
     * @return the username, or {@code null} when the line is {@code null} or has fewer
     *         than two fields
     */
    private static String extractUsername(String log) {
        if (log == null) {
            return null;
        }
        String[] fields = log.trim().split("\\s+");
        return fields.length >= 2 ? fields[1] : null;
    }

    /**
     * Reads click logs from a socket on localhost:9999 in 5-second batches, removes the
     * clicks of blacklisted users and prints the remaining log lines.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // NOTE(review): the master is hard-coded to local[2]; a master set in code is
        // expected to take precedence over the one passed to spark-submit, so confirm
        // this is intended before running the jar on a cluster.
        SparkConf conf = new SparkConf().setAppName("transtrom").setMaster("local[2]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaStreamingContext jssc = new JavaStreamingContext(jsc, Durations.seconds(5));

        // Raw click-log lines, one "date username" record per line.
        JavaReceiverInputDStream<String> adsClickLogDStream = jssc.socketTextStream("localhost", 9999);

        // Blacklist RDD of (username, true) pairs; built once and joined against every batch.
        List<Tuple2<String, Boolean>> blacklistData = new ArrayList<Tuple2<String, Boolean>>();
        blacklistData.add(new Tuple2<String, Boolean>("tom", true));
        final JavaPairRDD<String, Boolean> blacklistRDD = jsc.parallelizePairs(blacklistData);

        // Drop malformed lines (empty, or without a username field) up front so that a
        // single bad record cannot fail the batch with an ArrayIndexOutOfBoundsException.
        JavaDStream<String> wellFormedLogDStream = adsClickLogDStream.filter(
                new Function<String, Boolean>() {
                    @Override
                    public Boolean call(String log) throws Exception {
                        return extractUsername(log) != null;
                    }
                });

        // Re-key every line as (username, "date username") so that each batch RDD can be
        // joined with the blacklist RDD on the username.
        JavaPairDStream<String, String> userAdsClickLogDStream = wellFormedLogDStream.mapToPair(
                new PairFunction<String, String, String>() {
                    @Override
                    public Tuple2<String, String> call(String log) throws Exception {
                        return new Tuple2<String, String>(extractUsername(log), log);
                    }
                });

        // transform exposes the RDD behind each batch, which allows RDD-to-RDD operations
        // (join, filter, map) against the static blacklist RDD.
        JavaDStream<String> validAdsClickLogDStream = userAdsClickLogDStream.transform(
                new Function<JavaPairRDD<String, String>, JavaRDD<String>>() {
                    /**
                     * Filters one batch.
                     *
                     * @param userAdsClickLogRDD the batch as (username, "date username") pairs
                     * @return the log lines whose user is not blacklisted
                     */
                    @Override
                    public JavaRDD<String> call(JavaPairRDD<String, String> userAdsClickLogRDD) throws Exception {
                        // Left outer join keeps every click; the Optional flag is present
                        // only for users that have an entry in the blacklist.
                        JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> joinedRDD =
                                userAdsClickLogRDD.leftOuterJoin(blacklistRDD);

                        // Keep a click unless its user has a blacklist flag that is true.
                        JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> allowedRDD = joinedRDD.filter(
                                new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, Boolean>() {
                                    @Override
                                    public Boolean call(Tuple2<String, Tuple2<String, Optional<Boolean>>> tuple)
                                            throws Exception {
                                        Optional<Boolean> blacklisted = tuple._2()._2();
                                        return !(blacklisted.isPresent() && blacklisted.get());
                                    }
                                });

                        // Project back to the original "date username" log line.
                        return allowedRDD.map(
                                new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, String>() {
                                    @Override
                                    public String call(Tuple2<String, Tuple2<String, Optional<Boolean>>> tuple)
                                            throws Exception {
                                        return tuple._2()._1();
                                    }
                                });
                    }
                });

        validAdsClickLogDStream.print();

        try {
            jssc.start();
            jssc.awaitTermination();
        } finally {
            // Stop the context even when awaitTermination rethrows a failure from a batch.
            jssc.stop();
        }
    }
}
说明:生成的jar 运行在了集群上!
结果:
nc -lk 9999 窗口输入的数据
20180515105756 zjs
20180515105757 lt
20180515105758 black
20180515105759 orang
20180515105760 apple
20180515105761 huawei
20180515105762 xiaomi
20180515105763 tom
运行程序的窗口产生的数据
20180515105757 lt
20180515105760 apple
20180515105756 zjs
20180515105761 huawei
20180515105759 orang
20180515105758 black
20180515105762 xiaomi
其中 username 为“tom”的记录被黑名单拦截了!