作用:通过transform操作拿到DStream中每个batch对应的RDD,从而可以和其它的RDD进行join操作。
Scala版本:
package cn.spark.study.streaming
import org.apache.spark.SparkConf
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.Seconds
/**
 * Spark Streaming example of `DStream.transform`: every batch RDD of a socket
 * text stream is left-outer-joined with a static filter RDD, and records whose
 * key is flagged `true` in that filter RDD are dropped before printing.
 */
object TransformTest {

  /**
   * Builds the streaming job, starts it and blocks until it terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("Transform")
    val ssc = new StreamingContext(conf, Seconds(2))

    // Keys listed here with a `true` flag are filtered out of the stream.
    val filterArray = Array(("key1", true))
    // Distribute the filter list as an RDD so it can be joined with each batch.
    val filterRDD = ssc.sparkContext.parallelize(filterArray, 5)

    // A socket text stream is the streaming data source.
    val logDStream = ssc.socketTextStream("test", 8897)

    // Input lines are expected to look like "id1 key1": the second
    // space-separated field becomes the join key and the whole line is kept as
    // the value. Lines with fewer than two fields are skipped; indexing them
    // directly would throw ArrayIndexOutOfBoundsException and fail the batch.
    val tmpLogDStream = logDStream.flatMap { clickLog =>
      clickLog.split(" ") match {
        case Array(_, key, _*) => List((key, clickLog))
        case _                 => Nil
      }
    }

    // transform exposes the RDD behind each batch, which allows RDD-to-RDD
    // operations such as a join against filterRDD.
    val validAdsClickLogDStream = tmpLogDStream.transform { clickLogRDD =>
      clickLogRDD
        // Left outer join keeps every log line; the right side is Some(flag)
        // only for keys that are present in filterRDD.
        .leftOuterJoin(filterRDD)
        // Keep a record unless its key is flagged true in filterRDD.
        .filter { case (_, (_, flagged)) => !flagged.getOrElse(false) }
        // Restore the original log line.
        .map { case (_, (clickLog, _)) => clickLog }
    }

    // Print the surviving records of each batch.
    validAdsClickLogDStream.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Java版本:
package cn.spark.study.streaming;
import java.util.ArrayList;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import com.google.common.base.Optional;
import scala.Tuple2;
/**
 * Spark Streaming example of {@code transform}: every batch RDD of a socket
 * text stream is left-outer-joined with a static blacklist RDD, and records
 * whose key is flagged {@code true} in the blacklist are dropped before
 * printing.
 */
public class TransformBlacklist {

    /**
     * Builds the streaming job, starts it and blocks until it terminates.
     *
     * @param args command-line arguments (unused)
     */
    @SuppressWarnings("deprecation")
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setMaster("local[2]")
                .setAppName("TransformBlacklist");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));

        // Build a mock blacklist RDD: keys flagged true are filtered out.
        List<Tuple2<String, Boolean>> filterList = new ArrayList<Tuple2<String, Boolean>>();
        filterList.add(new Tuple2<String, Boolean>("tom", true));
        // Fixed: this previously passed the undefined name `blacklist`.
        final JavaPairRDD<String, Boolean> filterRDD = jssc.sc().parallelizePairs(filterList);

        JavaReceiverInputDStream<String> clickLogDStream = jssc.socketTextStream("test", 8897);

        // Skip lines with fewer than two space-separated fields; indexing them
        // below would throw ArrayIndexOutOfBoundsException and fail the batch.
        JavaDStream<String> wellFormedClickLogDStream = clickLogDStream.filter(
                new Function<String, Boolean>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Boolean call(String clickLog) throws Exception {
                        return clickLog.split(" ").length > 1;
                    }
                });

        // Convert each line into (key, line), where the key is the second
        // space-separated field, in preparation for the join.
        JavaPairDStream<String, String> tmpClickLogDStream = wellFormedClickLogDStream.mapToPair(
                new PairFunction<String, String, String>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public Tuple2<String, String> call(String tmpClickLog) throws Exception {
                        return new Tuple2<String, String>(
                                tmpClickLog.split(" ")[1], tmpClickLog);
                    }
                });

        // transform exposes the RDD behind each batch so it can be joined with
        // filterRDD, then filtered and mapped.
        JavaDStream<String> validClickLogDStream = tmpClickLogDStream.transform(
                new Function<JavaPairRDD<String, String>, JavaRDD<String>>() {
                    private static final long serialVersionUID = 1L;

                    @Override
                    public JavaRDD<String> call(JavaPairRDD<String, String> clickLogRDD)
                            throws Exception {
                        // Left outer join keeps every log line; the Optional is
                        // present only for keys that exist in filterRDD.
                        JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> joinedRDD =
                                clickLogRDD.leftOuterJoin(filterRDD);

                        // Keep a record unless its key is flagged true in filterRDD.
                        JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> resRDD =
                                joinedRDD.filter(
                                        new Function<Tuple2<String,
                                                Tuple2<String, Optional<Boolean>>>, Boolean>() {
                                            private static final long serialVersionUID = 1L;

                                            @Override
                                            public Boolean call(
                                                    Tuple2<String,
                                                            Tuple2<String, Optional<Boolean>>> tuple)
                                                    throws Exception {
                                                Optional<Boolean> flagged = tuple._2()._2();
                                                return !(flagged.isPresent() && flagged.get());
                                            }
                                        });

                        // resRDD now holds only the clicks that were not filtered
                        // out; map each record back to the original log line.
                        JavaRDD<String> resClickLogRDD = resRDD.map(
                                new Function<Tuple2<String,
                                        Tuple2<String, Optional<Boolean>>>, String>() {
                                    private static final long serialVersionUID = 1L;

                                    @Override
                                    public String call(
                                            Tuple2<String, Tuple2<String, Optional<Boolean>>> tuple)
                                            throws Exception {
                                        return tuple._2()._1();
                                    }
                                });

                        // Fixed: this previously returned the undefined name
                        // `validResClickLogRDD`.
                        return resClickLogRDD;
                    }
                });

        // Print the surviving records of each batch.
        validClickLogDStream.print();

        try {
            jssc.start();
            jssc.awaitTermination();
        } finally {
            // Release the context even if the streaming job terminates with an error.
            jssc.close();
        }
    }
}