transform以及实时黑名单过滤案例实战

package com.bynear.spark_Streaming;

import com.google.common.base.Optional;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

import java.util.ArrayList;
import java.util.List;

/**
 * Spark Streaming example (2018/5/15 9:57): the {@code transform} operation applied to
 * real-time blacklist filtering of ad-click logs.
 *
 * <p>Every input line is expected to have the simplified format {@code "date username"}.
 * Clicks made by blacklisted users are dropped; all other log lines are printed unchanged.
 */
public class Transform {
    /**
     * Builds and runs the streaming job: reads click logs from a socket, removes the
     * clicks of blacklisted users in every batch, and prints the remaining log lines.
     *
     * @param args command-line arguments (not used)
     */
    public static void main(String[] args) {
        // The socket receiver occupies one local thread, so at least two are needed
        // for batches to actually be processed (see the Spark Streaming guide).
        SparkConf conf = new SparkConf().setAppName("transtrom").setMaster("local[2]");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaStreamingContext jssc = new JavaStreamingContext(jsc, Durations.seconds(5));

        // Scenario: users click on ads on our site and every click is billed in real time.
        // Users who click fraudulently on behalf of dishonest advertisers are kept on a
        // blacklist, and any click coming from a blacklisted user must be filtered out.
        // The log format is simplified here to "date username".
        JavaReceiverInputDStream<String> adsClickLogDStream = jssc.socketTextStream("localhost", 9999);

        // Blacklist RDD of (username, true) pairs.
        List<Tuple2<String, Boolean>> blacklistData = new ArrayList<Tuple2<String, Boolean>>();
        blacklistData.add(new Tuple2<String, Boolean>("tom", true));
        final JavaPairRDD<String, Boolean> blacklistRDD = jsc.parallelizePairs(blacklistData);

        // Drop lines that do not contain both a date and a username. Without this guard
        // a blank or malformed line would raise ArrayIndexOutOfBoundsException in the
        // mapToPair step below and fail the whole streaming job.
        JavaDStream<String> wellFormedLogDStream = adsClickLogDStream.filter(new Function<String, Boolean>() {
            @Override
            public Boolean call(String log) throws Exception {
                return log.split(" ").length >= 2;
            }
        });

        // Re-key every log line as (username, "date username") so that each batch RDD
        // can be joined with the blacklist RDD.
        JavaPairDStream<String, String> userAdsClickLogDStream = wellFormedLogDStream.mapToPair(new PairFunction<String, String, String>() {
            @Override
            public Tuple2<String, String> call(String log) throws Exception {
                String[] logSplit = log.split(" ");
                // Result: (username, "date username").
                return new Tuple2<String, String>(logSplit[1], log);
            }
        });

        // transform exposes the RDD of every batch, so it can be joined with the
        // blacklist RDD and then filtered and mapped like any ordinary RDD.
        JavaDStream<String> validAdsClickLogDStream = userAdsClickLogDStream.transform(
                new Function<JavaPairRDD<String, String>, JavaRDD<String>>() {
                    // userAdsClickLogRDD is the (username, "date username") RDD of one batch.
                    @Override
                    public JavaRDD<String> call(JavaPairRDD<String, String> userAdsClickLogRDD) throws Exception {
                        // Left outer join keeps every click; the Optional is present only
                        // for usernames that appear in the blacklist.
                        // Result: (username, ("date username", Optional<flag>)).
                        JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> joinRDD = userAdsClickLogRDD.leftOuterJoin(blacklistRDD);

                        JavaPairRDD<String, Tuple2<String, Optional<Boolean>>> filterJoinRDD = joinRDD.filter(new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, Boolean>() {
                            @Override
                            public Boolean call(Tuple2<String, Tuple2<String, Optional<Boolean>>> tuple) throws Exception {
                                // Discard the click only when a blacklist flag exists and is true.
                                Optional<Boolean> blacklisted = tuple._2._2;
                                return !(blacklisted.isPresent() && blacklisted.get());
                            }
                        });

                        // Strip the join bookkeeping and return the original "date username" line.
                        JavaRDD<String> validClickLogRDD = filterJoinRDD.map(new Function<Tuple2<String, Tuple2<String, Optional<Boolean>>>, String>() {
                            @Override
                            public String call(Tuple2<String, Tuple2<String, Optional<Boolean>>> tuple) throws Exception {
                                return tuple._2._1;
                            }
                        });

                        return validClickLogRDD;
                    }
                });

        validAdsClickLogDStream.print();

        jssc.start();
        jssc.awaitTermination();
        jssc.stop();
    }
}

说明:生成的jar 运行在了集群上!
结果:
nc -lk 9999 窗口输入的数据
20180515105756 zjs
20180515105757 lt
20180515105758 black
20180515105759 orang
20180515105760 apple
20180515105761 huawei
20180515105762 xiaomi
20180515105763 tom
运行程序的窗口产生的数据
20180515105757 lt
20180515105760 apple
20180515105756 zjs
20180515105761 huawei
20180515105759 orang
20180515105758 black
20180515105762 xiaomi

其中username为“tom”的记录被黑名单拦截了!

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值