/**
 * Blacklist filtering implemented with Java and Spark: words that appear in a
 * blacklist are removed from an input text file via a left outer join.
 */
public class BlackListFilter {
public static void main(String[] args){
SparkConf conf = new SparkConf().setAppName("Simple Application").setMaster("local[2]");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaRDD javaRDD = sc.textFile("F:\\text\\url.txt");
//黑名单
List> blackList = Arrays.asList(new Tuple2("a", true),
new Tuple2("b", true),
new Tuple2("c", true));
//list => JavaPairRDD
JavaPairRDD blackListRDD = sc.parallelizePairs(blackList);
//lines => words => (word, 1)
JavaPairRDD wordsAndCount = javaRDD
.flatMap(new FlatMapFunction() {
@Override
public Iterator call(String s) throws Exception {
return Arrays.asList(s.split("\\s+")).iterator();
}
}).mapToPair(new PairFunction() {
@Override
public Tuple2 call(String s) throws Exception {
return new Tuple2(s, 1);
}
});
//(word, 1) leftOutJoin (word, true) => (word, (1, Option))
JavaPairRDD>> leftOuterJoin =
wordsAndCount.leftOuterJoin(blackListRDD);
//(word, (1, Option)) => filter => (word, (1, option = false)) => map => word
JavaRDD whiteList = leftOuterJoin.filter(new Function>>, Boolean>() {
@Override
public Boolean call(Tuple2>> t) throws Exception {
return t._2._2.orElse(false) ? false : true;
}
}).map(new Function>>, String>() {
@Override
public String call(Tuple2>> t) throws Exception {
return t._1;
}
});
System.out.println(whiteList.collect());
}
}