package gh.spark.SparkStreaming;
import java.util.ArrayList;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
/**
* 广告计费日志实时黑名单过滤
* @author Administrator
*
*/
public class TransformDemo {
public static void main(String[] args) throws Exception {
SparkConf conf=new SparkConf()
.setAppName("TransformDemo")
.setMaster("local[2]");
JavaStreamingContext jsc=
new JavaStreamingContext(conf,Durations.seconds(5));
//创建一份黑名单
ArrayList<Tuple2<String, Boolean>> blackList =
new ArrayList<Tuple2<String, Boolean>>();
blackList.add(new Tuple2<String, Boolean>("leo", true));
//注意,定义时需要添加final关键字
final JavaPairRDD<String, Boolean> blackRDD =
jsc.sparkContext().parallelizePairs(blackList);
JavaReceiverInputDStream<String> linesDtream =
jsc.socketTextStream("tgmaster", 9999);
//日志格式:date username,比如:2016-11-07 jack
//(jack,2016-11-07 jack)
JavaPairDStream<String, String> mapToPairDtream = linesDtream.mapToPair(new PairFunction<String, String, String>() {
private static final long serialVersionUID = 1L;
public Tuple2<String, String> call(String line) throws Exception {
return new Tuple2<String, String>(line.split(" ")[1], line);
}
});
//实时黑名单过滤
JavaDStream<String> resultDtream = mapToPairDtream.transform(new Function<JavaPairRDD<String,String>, JavaRDD<String>>() {
private static final long serialVersionUID = 1L;
public JavaRDD<String> call(JavaPairRDD<String, String> userClickLogRDD)
throws Exception {
/**
* 在此处,我们使用leftOuterJoin左外连接的方式进行join
* 左外连接之后的结果中既有黑名单人员,又有非黑名单人员
*/
JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> joinRDD =
userClickLogRDD.leftOuterJoin(blackRDD);
/**
* 接下来进行filter过滤操作
* 将黑名单用户过滤出来
*/
JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> filterRDD = joinRDD.filter(new Function<Tuple2<String,Tuple2<String,Optional<Boolean>>>, Boolean>() {
private static final long serialVersionUID = 1L;
public Boolean call(
Tuple2<String, Tuple2<String, Optional<Boolean>>> log)
throws Exception {
if(log._2._2.isPresent() && log._2._2.get()){
return false; //此时是一个黑名单用户
}
return true; //非黑名单用户
}
});
//找出非黑名单日志中的用户名username
JavaRDD<String> mapRDD = filterRDD.map(new Function<Tuple2<String,Tuple2<String,Optional<Boolean>>>, String>() {
private static final long serialVersionUID = 1L;
public String call(
Tuple2<String, Tuple2<String, Optional<Boolean>>> log)
throws Exception {
return log._2._1; //返回用名username的日志
}
});
return mapRDD;
}
});
resultDtream.print();
jsc.start();
jsc.awaitTermination();
jsc.close();
}
}
import java.util.ArrayList;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.Optional;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;
/**
* 广告计费日志实时黑名单过滤
* @author Administrator
*
*/
public class TransformDemo {
public static void main(String[] args) throws Exception {
SparkConf conf=new SparkConf()
.setAppName("TransformDemo")
.setMaster("local[2]");
JavaStreamingContext jsc=
new JavaStreamingContext(conf,Durations.seconds(5));
//创建一份黑名单
ArrayList<Tuple2<String, Boolean>> blackList =
new ArrayList<Tuple2<String, Boolean>>();
blackList.add(new Tuple2<String, Boolean>("leo", true));
//注意,定义时需要添加final关键字
final JavaPairRDD<String, Boolean> blackRDD =
jsc.sparkContext().parallelizePairs(blackList);
JavaReceiverInputDStream<String> linesDtream =
jsc.socketTextStream("tgmaster", 9999);
//日志格式:date username,比如:2016-11-07 jack
//(jack,2016-11-07 jack)
JavaPairDStream<String, String> mapToPairDtream = linesDtream.mapToPair(new PairFunction<String, String, String>() {
private static final long serialVersionUID = 1L;
public Tuple2<String, String> call(String line) throws Exception {
return new Tuple2<String, String>(line.split(" ")[1], line);
}
});
//实时黑名单过滤
JavaDStream<String> resultDtream = mapToPairDtream.transform(new Function<JavaPairRDD<String,String>, JavaRDD<String>>() {
private static final long serialVersionUID = 1L;
public JavaRDD<String> call(JavaPairRDD<String, String> userClickLogRDD)
throws Exception {
/**
* 在此处,我们使用leftOuterJoin左外连接的方式进行join
* 左外连接之后的结果中既有黑名单人员,又有非黑名单人员
*/
JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> joinRDD =
userClickLogRDD.leftOuterJoin(blackRDD);
/**
* 接下来进行filter过滤操作
* 将黑名单用户过滤出来
*/
JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> filterRDD = joinRDD.filter(new Function<Tuple2<String,Tuple2<String,Optional<Boolean>>>, Boolean>() {
private static final long serialVersionUID = 1L;
public Boolean call(
Tuple2<String, Tuple2<String, Optional<Boolean>>> log)
throws Exception {
if(log._2._2.isPresent() && log._2._2.get()){
return false; //此时是一个黑名单用户
}
return true; //非黑名单用户
}
});
//找出非黑名单日志中的用户名username
JavaRDD<String> mapRDD = filterRDD.map(new Function<Tuple2<String,Tuple2<String,Optional<Boolean>>>, String>() {
private static final long serialVersionUID = 1L;
public String call(
Tuple2<String, Tuple2<String, Optional<Boolean>>> log)
throws Exception {
return log._2._1; //返回用名username的日志
}
});
return mapRDD;
}
});
resultDtream.print();
jsc.start();
jsc.awaitTermination();
jsc.close();
}
}