A complex Spark example in Java

First, the purpose of this Spark program: it computes a set of feature values over the HTTP request data a website receives, recomputing the features every 10 seconds over the most recent 2 hours of data. This is exactly the use case for Spark Streaming's sliding-window operations.
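Before walking through the full program, here is a minimal, self-contained sketch of the sliding-window pattern it relies on. The socket source and the per-IP counting below are illustrative placeholders only (the real job reads parsed HTTP requests from Kafka); the point is the reduceByKeyAndWindow call with a 120-minute window and a 10-second slide:

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import scala.Tuple2;

public class WindowSketch {
  public static void main(String[] args) throws InterruptedException {
    SparkConf conf = new SparkConf().setAppName("WindowSketch");
    // 10-second batch interval, matching the real program below
    JavaStreamingContext jsc = new JavaStreamingContext(conf, Durations.seconds(10));

    // hypothetical input: one client IP per line on a local socket
    JavaPairDStream<String, Long> hitsByIp = jsc.socketTextStream("localhost", 9999)
            .mapToPair(ip -> new Tuple2<>(ip, 1L));

    // every 10 seconds, emit per-IP request counts over the last 120 minutes
    JavaPairDStream<String, Long> windowed = hitsByIp.reduceByKeyAndWindow(
            (a, b) -> a + b,           // associative merge of partial counts
            Durations.minutes(120),    // window length: the last 2 hours
            Durations.seconds(10));    // slide interval: recompute every batch

    windowed.print();
    jsc.start();
    jsc.awaitTermination();
  }
}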

The data is read from the Kafka topic app, and the computed feature values are sent to the Kafka topic app_FEATURE.
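The results are sent through KafkaProducer.getInstance(), a custom singleton wrapper whose source the post does not include. Judging from its use of KeyedMessage, it wraps the old Kafka 0.8 producer API; a minimal sketch of what such a wrapper might look like (the class body below is an assumption, not the author's actual code):

import java.util.Properties;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

// hypothetical reconstruction of the singleton wrapper used by the job
public class KafkaProducer {
  private static KafkaProducer instance;
  private final Producer<String, String> producer;

  private KafkaProducer() {
    Properties props = new Properties();
    props.put("metadata.broker.list", Config.getProperty("kafka.host")); // same config source as the job
    props.put("serializer.class", "kafka.serializer.StringEncoder");
    producer = new Producer<>(new ProducerConfig(props));
  }

  public static synchronized KafkaProducer getInstance() {
    if (instance == null) {
      instance = new KafkaProducer();
    }
    return instance;
  }

  public void send(KeyedMessage<String, String> message) {
    producer.send(message);
  }
}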

The code is as follows:

public class FeatureAccumulator {
  public static void main(String[] args) {
    if (args.length != 2) {
      System.out.println("Usage: FeatureAccumulator app");
      return;
    }

    String app = args[0];

    String type = args[1]; // read but not used in the code shown here

    Feature[] features = new Feature[]{
              // IP-dimension features
              new DistinctCookieCountByIp(),
              new DistinctFingerCountByIp(),
              new DistinctUaCountByIp(),
              new DistinctUrlCountByIp(),
              new DistinctSessionCountByIp(),
              new TimeIntervalByIp(),
              new TimeIntervalStdByIp(),
              new TimeRangeByIp(),
              new UrlCountByIp(),
              new VisitMeanByIp(),
              new VisitStdByIp(),
              new NoCookieVisitCountByIp(),
              new NoFingerVisitCountByIp(),
              new PcVisitCountByIp(),
              new HttpsVisitCountByIp(),
              new ChinaVisitCountByIp(),
              new MaxIpCountByFingerByIp(),
              new SumIpCountByFingerByIp(),
              new AvgIpCountByFingerByIp(),
              new MaxIpCountByUaByIp(),
              new SumIpCountByUaByIp(),
              new AvgIpCountByUaByIp(),
              new DistinctFingerCountByIp3(),
              new DistinctIpCountByIp3(),
              new DistinctUaCountByIp3(),
              new DistinctCookieCountByIp3(),
              new DistinctTrackCountByIp3(),
              new SumIpCountBySessionByIp(),
              new AvgIpCountBySessionByIp()
      };

    SparkConf sparkConf = new SparkConf().setAppName("Feature Accumulator(" + app + ")");

    // one batch every 10 seconds
    JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(10));

    Map<String, String> props = new HashMap<>();
    props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, Config.getProperty("kafka.host"));
    props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
    props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
    props.put(ConsumerConfig.GROUP_ID_CONFIG, "my_group");
    // note: the direct (receiver-less) stream tracks offsets itself, so the
    // auto-commit and zookeeper settings below have no real effect here
    props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
    props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000");
    props.put("zookeeper.sync.time.ms", "250");

    try {
      Set<String> topics = new HashSet<String>();
      topics.add(app);

      // create the direct input stream from Kafka
      JavaPairInputDStream<String, String> stream =
              KafkaUtils.createDirectStream(
                      jsc,
                      String.class, // key type
                      String.class, // value type
                      StringDecoder.class, // key decoder
                      StringDecoder.class, // value decoder
                      props,
                      topics);

      MyJsonDataReader reader = new MyJsonDataReader();
      JavaDStream<RequestInfo> reqs = stream.map(s -> {
        RequestInfo info = reader.parseLine(s._2);
        return info;
      });

      // pair each request's client IP with a small JSON payload holding its timestamp
      JavaPairDStream<String, Serializable> reqsByIp = reqs.mapToPair(req -> {
        Map<String, Object> reqMap = new HashMap<>();
        reqMap.put("client_time", new SimpleDateFormat("yyyyMMdd HH:mm:ss").format(req.getRequestTime()));

        return new Tuple2<String, Serializable>(req.getClientIp(), JSONValue.toJSONString(reqMap));
      });

      JavaPairDStream<String, Serializable> reqsByIp2 = reqsByIp.reduceByKeyAndWindow((t1, t2) -> {
          // keep only the most recent record per IP; the fixed-width
          // yyyyMMdd HH:mm:ss timestamps sort lexicographically, so comparing
          // the JSON strings is equivalent to comparing the timestamps
          String s1 = (String) t1;
          String s2 = (String) t2;
          return s1.compareTo(s2) > 0 ? s1 : s2;
        }, Durations.minutes(120));

      JavaPairDStream<String, Serializable> ipResult = null;
      JavaPairDStream<String, Serializable> ipJoinResult = null;
      JavaPairDStream<String, Serializable> ip3Result = null;
      JavaPairDStream<String, Serializable> ip3JoinResult = null;
      // the finger/UA streams are accumulated below but never emitted in the code shown
      JavaPairDStream<String, Serializable> fpResult = null;
      JavaPairDStream<String, Serializable> fpJoinResult = null;
      JavaPairDStream<String, Serializable> uaResult = null;
      JavaPairDStream<String, Serializable> uaJoinResult = null;
      for (Feature feature : features) {
        if (!(feature instanceof DataByFeature)) {
          continue;
        }

        DataByFeature f = (DataByFeature) feature;
        JavaPairDStream<String, Serializable> result1 =
                reqs.mapToPair(req -> new Tuple2<>(f.getKey(req), f.getValue(req)));

        // windowed aggregation: on every 10-second batch, re-aggregate the
        // last 120 minutes of data for this feature
        JavaPairDStream<String, Serializable> result2 = result1.reduceByKeyAndWindow(
                (Function2<Serializable, Serializable, Serializable>) (a1, a2) -> {
                  Serializable d = f.reduce(a1, a2);
                  return d;
                }, Durations.minutes(120));

        JavaPairDStream<String, Serializable> result3 = result2.mapValues(d -> f.getResult(d));

        // two-level features: e.g. for an IP, the max over its fingers of each finger's distinct-IP count
        if (feature instanceof DataBy2Feature) {
          DataBy2Feature f2 = (DataBy2Feature) feature;
          JavaPairDStream<String, Serializable> result1_2 =
                  reqs.mapToPair(req -> new Tuple2<>(f2.getKey2(req), f2.getValue2(req)));

          // windowed aggregation over the second-level key, same 120-minute window
          JavaPairDStream<String, Serializable> result2_2 = result1_2.reduceByKeyAndWindow(
                  (Function2<Serializable, Serializable, Serializable>) (a1, a2) -> {
                    Serializable d = f2.reduce(a1, a2);
                    return d;
                  }, Durations.minutes(120));

          // distinct-IP count per second-level key (e.g. per finger); prefix
          // the key with SEP to mark these as second-level records for the join below
          JavaPairDStream<String, Serializable> result3_2 = result2_2.mapValues(d -> f2.getResult2(d)).
                  mapToPair(d -> new Tuple2<String, Serializable>(Feature.SEP + d._1(), d._2()));

          result3 = result3_2.union(result3);
          // driver-side map join: collect the second-level results into a map,
          // then look them up while rewriting each first-level record
          result3 = result3.transform(new Function<JavaPairRDD<String, Serializable>, JavaRDD<Tuple2<String, Serializable>>>() {
            @Override
            public JavaRDD<Tuple2<String, Serializable>> call(JavaPairRDD<String, Serializable> t) throws Exception {
              Map<String, Serializable> values = t.filter(d -> d._1().startsWith(Feature.SEP)).
                      mapToPair(d -> new Tuple2<String, Serializable>(d._1().substring(1), d._2())).collectAsMap();

              t = t.filter(d -> !d._1().startsWith(Feature.SEP));
              JavaRDD<Tuple2<String, Serializable>> t2 = t.map(r -> {
                List<String> list = new ArrayList<>();
                for (String v : ((String) r._2()).split(Feature.SEP)) {
                  Serializable o = values.get(v);
                  if (o != null) { // guard against second-level keys with no result yet
                    list.add(o.toString());
                  }
                }
                return new Tuple2<String, Serializable>(r._1(), f2.reduce3(list));
              });
              return t2;
            }
          }).filter(d -> !d._1().startsWith(Feature.SEP)).mapToPair(t2 -> t2);
        }

        // serialize each feature as "name=value" for the final output stage
        result3 = result3.mapValues(d -> f.getName() + "=" + d);

        // accumulate the per-dimension result streams via union
        if (f.getType() == DataByFeature.IP) {
          if (ipJoinResult == null) {
            if (ipResult == null) {
              ipResult = result3;
            } else {
              ipJoinResult = ipResult.union(result3);
            }
          } else {
            ipJoinResult = ipJoinResult.union(result3);
          }
        } else if (f.getType() == DataByFeature.IP3) {
          if (ip3JoinResult == null) {
            if (ip3Result == null) {
              ip3Result = result3;
            } else {
              ip3JoinResult = ip3Result.union(result3);
            }
          } else {
            ip3JoinResult = ip3JoinResult.union(result3);
          }
        } else if (f.getType() == DataByFeature.FINGER) {
          if (fpJoinResult == null) {
            if (fpResult == null) {
              fpResult = result3;
            } else {
              fpJoinResult = fpResult.union(result3);
            }
          } else {
            fpJoinResult = fpJoinResult.union(result3);
          }
        } else if (f.getType() == DataByFeature.UA) {
          if (uaJoinResult == null) {
            if (uaResult == null) {
              uaResult = result3;
            } else {
              uaJoinResult = uaResult.union(result3);
            }
          } else {
            uaJoinResult = uaJoinResult.union(result3);
          }
        }
      }

      JavaPairDStream<String, Serializable> ipJoinResult2 = ipJoinResult.union(reqsByIp2);
      JavaPairDStream<String, Serializable> ip3JoinResult2 = ip3JoinResult;

      // concatenate all feature values (plus the latest-request JSON) per IP;
      // the upstream streams are already windowed, so a plain per-batch
      // reduceByKey is enough here
      JavaPairDStream<String, Serializable> result4 = ipJoinResult2.reduceByKey((t1, t2) -> {
        Serializable d = t1 + Feature.SEP + t2;
        return d;
      });

      // concatenate all feature values per IP3 key in the same way
      JavaPairDStream<String, Serializable> result34 = ip3JoinResult2.reduceByKey((t1, t2) -> {
        Serializable d = t1 + Feature.SEP + t2;
        return d;
      });

      // merge the IP3 features into the matching IP records
      JavaPairDStream<String, Serializable> result3_2 = result34.
              mapToPair(d -> new Tuple2<String, Serializable>("ip3." + d._1(), d._2()));

      JavaPairDStream<String, Serializable> result5 = result3_2.union(result4);

      // same driver-side map join pattern: collect the "ip3."-prefixed
      // aggregates and append them to each matching IP record
      JavaPairDStream<String, Serializable> result6 = result5.transform(new Function<JavaPairRDD<String, Serializable>, JavaRDD<Tuple2<String, Serializable>>>() {
        @Override
        public JavaRDD<Tuple2<String, Serializable>> call(JavaPairRDD<String, Serializable> t) throws Exception {
          Map<String, Serializable> values = t.filter(d -> d._1().startsWith("ip3.")).
                  mapToPair(d -> new Tuple2<String, Serializable>(d._1().substring(4), d._2())).collectAsMap();

          JavaPairRDD<String, Serializable> t1 = t.filter(d -> !d._1().startsWith("ip3."));
          JavaRDD<Tuple2<String, Serializable>> t2 = t1.map(d -> {
            String v = (String) d._2();
            String ip = d._1();
            String ip3 = DataByFeature.getIp3(ip);
            if (values.containsKey(ip3)) {
              v += Feature.SEP + values.get(ip3);
            }
            return new Tuple2<String, Serializable>(d._1(), v);
          });
          return t2;
        }
      }).filter(d -> !d._1().startsWith("ip3.")).mapToPair(d -> d);

      result6.foreachRDD(rdd -> rdd.foreachPartition(it -> {
        // one producer lookup per partition, on the executor side
        KafkaProducer kafkaProducer = KafkaProducer.getInstance();
        while (it.hasNext()) {
          Tuple2<String, Serializable> d = it.next();

          Map<String, Object> map = new HashMap<>();
          Map<String, Object> reqMap = new HashMap<>();
          reqMap.put("client_ip", d._1());

          Map<String, Object> dataProps = new HashMap<>();
          for (String nv : d._2().toString().split(Feature.SEP)) {
            if (nv.startsWith("{")) {
              // the latest-request JSON record kept by reqsByIp2
              reqMap.putAll((Map) JSONValue.parse(nv));
            } else {
              // feature values serialized as "name=value"
              String[] parts = nv.trim().split("=", 2);
              if (parts.length == 2) {
                dataProps.put(parts[0], parts[1]);
              }
            }
          }

          map.put("req", reqMap);
          map.put("data", dataProps);
          String data = JSONValue.toJSONString(map);
          kafkaProducer.send(new KeyedMessage<String, String>(app + "_FEATURE", "D", data));
        }
      }));
      jsc.start();
      try {
        jsc.awaitTermination();
      } catch (Exception e) {
        e.printStackTrace();
      } finally {
        jsc.close();
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
}
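
The Feature class hierarchy is also not included in the post. Reconstructing only the members that the program above actually calls gives roughly the following interfaces (everything here is inferred, including the SEP value and the getIp3 logic):

import java.io.Serializable;
import java.util.List;

// hypothetical reconstruction; only members referenced by the job are listed
public interface Feature extends Serializable {
  String SEP = "\u0001"; // record separator; the real value is not shown

  String getName();
}

interface DataByFeature extends Feature {
  // feature dimensions used by the dispatch in the main loop
  int IP = 0;
  int IP3 = 1;
  int FINGER = 2;
  int UA = 3;

  int getType();
  String getKey(RequestInfo req);           // grouping key, e.g. the client IP
  Serializable getValue(RequestInfo req);   // per-request partial value
  Serializable reduce(Serializable a, Serializable b); // merge two partials
  Serializable getResult(Serializable d);   // finalize the windowed aggregate

  // static helper used when merging IP3 features into IP records;
  // assumed to return the first three octets of the address
  static String getIp3(String ip) {
    return ip.substring(0, ip.lastIndexOf('.'));
  }
}

interface DataBy2Feature extends DataByFeature {
  String getKey2(RequestInfo req);          // second-level key, e.g. a finger
  Serializable getValue2(RequestInfo req);
  Serializable getResult2(Serializable d);
  Serializable reduce3(List<String> values); // combine looked-up second-level results
}

(RequestInfo is the parsed-request type produced by MyJsonDataReader; it is not shown in the post either.)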