首先说一下这个spark程序的目的。这个程序是要求对网站收到的http请求数据计算一些特征值,要求每10秒钟计算一次最近2小时的数据的特征值。这就需要用到spark的滑动窗口运算了。
数据从Kafka中topic名为程序第一个参数(记为app)的topic读取,计算出的特征值发往topic为“app_FEATURE”(即参数值加“_FEATURE”后缀)的Kafka topic中。
代码如下:
/**
 * Spark Streaming job that computes per-IP feature values over HTTP request data.
 *
 * <p>Every 10 seconds (the micro-batch / slide interval) it aggregates the most
 * recent 120 minutes of requests read from the Kafka topic named by {@code args[0]}
 * and publishes one JSON message per client IP to the Kafka topic
 * {@code args[0] + "_FEATURE"}.
 */
public class FeatureAccumulator {

    public static void main(String[] args) {
        if (args.length != 2) {
            // BUGFIX: the usage line previously omitted the mandatory second argument.
            System.out.println("Usage: FeatureAccumulator <app> <type>");
            return;
        }
        String app = args[0];
        // NOTE(review): 'type' is validated but never used below — confirm whether it
        // should select a subset of features.
        String type = args[1];
        Feature[] features = new Feature[]{
                // IP-dimension features
                new DistinctCookieCountByIp(),
                new DistinctFingerCountByIp(),
                new DistinctUaCountByIp(),
                new DistinctUrlCountByIp(),
                new DistinctSessionCountByIp(),
                new TimeIntervalByIp(),
                new TimeIntervalStdByIp(),
                new TimeRangeByIp(),
                new UrlCountByIp(),
                new VisitMeanByIp(),
                new VisitStdByIp(),
                new NoCookieVisitCountByIp(),
                new NoFingerVisitCountByIp(),
                new PcVisitCountByIp(),
                new HttpsVisitCountByIp(),
                new ChinaVisitCountByIp(),
                new MaxIpCountByFingerByIp(),
                new SumIpCountByFingerByIp(),
                new AvgIpCountByFingerByIp(),
                new MaxIpCountByUaByIp(),
                new SumIpCountByUaByIp(),
                new AvgIpCountByUaByIp(),
                new DistinctFingerCountByIp3(),
                new DistinctIpCountByIp3(),
                new DistinctUaCountByIp3(),
                new DistinctCookieCountByIp3(),
                new DistinctTrackCountByIp3(),
                new SumIpCountBySessionByIp(),
                new AvgIpCountBySessionByIp()
        };
        SparkConf sparkConf = new SparkConf().setAppName("Feature Accumulator(" + app + ")");
        // One micro-batch every 10 seconds; this is also the slide interval of every window below.
        JavaStreamingContext jsc = new JavaStreamingContext(sparkConf, Durations.seconds(10));
        Map<String, String> props = new HashMap<>();
        props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, Config.getProperty("kafka.host"));
        props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringDeserializer");
        props.put(ConsumerConfig.GROUP_ID_CONFIG, "my_group");
        props.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");
        props.put(ConsumerConfig.AUTO_COMMIT_INTERVAL_MS_CONFIG, "10000");
        props.put("zookeeper.sync.time.ms", "250");
        try {
            Set<String> topics = new HashSet<>();
            topics.add(app);
            // Read the raw request stream from Kafka as (key, json-line) pairs.
            JavaPairInputDStream<String, String> stream =
                    KafkaUtils.createDirectStream(
                            jsc,
                            String.class,        // key type
                            String.class,        // value type
                            StringDecoder.class, // key decoder
                            StringDecoder.class, // value decoder
                            props,
                            topics);
            MyJsonDataReader reader = new MyJsonDataReader();
            JavaDStream<RequestInfo> reqs = stream.map(s -> reader.parseLine(s._2));
            JavaPairDStream<String, Serializable> reqsByIp = reqs.mapToPair(req -> {
                Map<String, Object> reqMap = new HashMap<>();
                // SimpleDateFormat is not thread-safe, so a fresh instance is created per record.
                reqMap.put("client_time",
                        new SimpleDateFormat("yyyyMMdd HH:mm:ss").format(req.getRequestTime()));
                return new Tuple2<String, Serializable>(req.getClientIp(), JSONValue.toJSONString(reqMap));
            });
            // Keep only the most recent record per IP (the lexicographically greatest JSON,
            // which orders correctly because client_time is a fixed-width sortable format)
            // over the 2-hour window.
            JavaPairDStream<String, Serializable> reqsByIp2 = reqsByIp.reduceByKeyAndWindow((t1, t2) -> {
                String s1 = (String) t1;
                String s2 = (String) t2;
                return s1.compareTo(s2) > 0 ? s1 : s2;
            }, Durations.minutes(120));
            // One accumulator per key type; each ends up as the union of all feature streams of
            // that type.
            // BUGFIX: the original referenced fpResult/fpJoinResult/uaResult/uaJoinResult without
            // declaring them (compile error), and its two-variable accumulation NPE'd whenever a
            // type contributed fewer than two features. A single accumulator per type is
            // equivalent for >= 2 features and safe for 0 or 1.
            JavaPairDStream<String, Serializable> ipResult = null;
            JavaPairDStream<String, Serializable> ip3Result = null;
            JavaPairDStream<String, Serializable> fpResult = null;
            JavaPairDStream<String, Serializable> uaResult = null;
            for (Feature feature : features) {
                if (!(feature instanceof DataByFeature)) {
                    continue;
                }
                DataByFeature f = (DataByFeature) feature;
                JavaPairDStream<String, Serializable> result1 =
                        reqs.mapToPair(req -> new Tuple2<>(f.getKey(req), f.getValue(req)));
                // Aggregate over the last 120 minutes, recomputed every 10 seconds.
                // BUGFIX: the original used reduceByKey(...).window(...), which reduces only
                // within each 10-second batch and then unions ~720 un-reduced partial values per
                // key; reduceByKeyAndWindow reduces across the whole window as intended.
                JavaPairDStream<String, Serializable> result2 = result1.reduceByKeyAndWindow(
                        (Function2<Serializable, Serializable, Serializable>) (a1, a2) -> f.reduce(a1, a2),
                        Durations.minutes(120));
                JavaPairDStream<String, Serializable> result3 = result2.mapValues(d -> f.getResult(d));
                // Two-level features (e.g. max/sum/avg of distinct-IP counts per finger/UA of an IP)
                // need a secondary aggregation keyed by the secondary key.
                if (feature instanceof DataBy2Feature) {
                    DataBy2Feature f2 = (DataBy2Feature) feature;
                    JavaPairDStream<String, Serializable> result1_2 =
                            reqs.mapToPair(req -> new Tuple2<>(f2.getKey2(req), f2.getValue2(req)));
                    // Same windowed aggregation for the secondary key (BUGFIX as above).
                    JavaPairDStream<String, Serializable> result2_2 = result1_2.reduceByKeyAndWindow(
                            (Function2<Serializable, Serializable, Serializable>) (a1, a2) -> f2.reduce(a1, a2),
                            Durations.minutes(120));
                    // Secondary results are tagged with a leading Feature.SEP so they can be told
                    // apart from primary rows after the union below.
                    JavaPairDStream<String, Serializable> result3_2 = result2_2
                            .mapValues(d -> f2.getResult2(d))
                            .mapToPair(d -> new Tuple2<String, Serializable>(Feature.SEP + d._1(), d._2()));
                    result3 = result3_2.union(result3)
                            .transform((Function<JavaPairRDD<String, Serializable>, JavaRDD<Tuple2<String, Serializable>>>) t -> {
                                // Collect the (small) secondary-key map to the driver, then join it
                                // back into each primary row by splitting the primary value on
                                // Feature.SEP.
                                // NOTE(review): String.split treats Feature.SEP as a regex — confirm
                                // SEP contains no regex metacharacters.
                                Map<String, Serializable> values = t
                                        .filter(d -> d._1().startsWith(Feature.SEP))
                                        .mapToPair(d -> new Tuple2<String, Serializable>(d._1().substring(1), d._2()))
                                        .collectAsMap();
                                JavaPairRDD<String, Serializable> primaries =
                                        t.filter(d -> !d._1().startsWith(Feature.SEP));
                                return primaries.map(r -> {
                                    List<String> list = new ArrayList<>();
                                    for (String v : ((String) r._2()).split(Feature.SEP)) {
                                        Serializable o = values.get(v);
                                        // ROBUSTNESS: skip secondary keys with no collected value
                                        // instead of throwing a NullPointerException.
                                        if (o != null) {
                                            list.add(o.toString());
                                        }
                                    }
                                    return new Tuple2<String, Serializable>(r._1(), f2.reduce3(list));
                                });
                            })
                            .filter(d -> !d._1().startsWith(Feature.SEP))
                            .mapToPair(t2 -> t2);
                }
                // Tag each value with the feature name so the sink can split them apart.
                result3 = result3.mapValues(d -> f.getName() + "=" + d);
                if (f.getType() == DataByFeature.IP) {
                    ipResult = (ipResult == null) ? result3 : ipResult.union(result3);
                } else if (f.getType() == DataByFeature.IP3) {
                    ip3Result = (ip3Result == null) ? result3 : ip3Result.union(result3);
                } else if (f.getType() == DataByFeature.FINGER) {
                    fpResult = (fpResult == null) ? result3 : fpResult.union(result3);
                } else if (f.getType() == DataByFeature.UA) {
                    uaResult = (uaResult == null) ? result3 : uaResult.union(result3);
                }
            }
            // Attach the latest raw request JSON (client_time) to each IP's feature stream.
            JavaPairDStream<String, Serializable> ipJoinResult2 = ipResult.union(reqsByIp2);
            // Concatenate all "name=value" entries per IP into one SEP-joined string.
            // BUGFIX: the original referenced an undefined 'result4'; this per-key concatenation
            // (mirroring the IP3 path below) is the missing definition.
            JavaPairDStream<String, Serializable> result4 = ipJoinResult2.reduceByKey(
                    (t1, t2) -> (Serializable) (t1 + Feature.SEP + t2));
            // Concatenate all "name=value" entries per /24 subnet key.
            // BUGFIX: the inputs are already window-aggregated, so the original's extra
            // .window(120 min) unioned ~720 copies of full-window aggregates; the per-batch
            // reduceByKey alone is correct here.
            JavaPairDStream<String, Serializable> result34 = ip3Result.reduceByKey(
                    (t1, t2) -> (Serializable) (t1 + Feature.SEP + t2));
            // Merge the IP3 (subnet) features into each IP's record: tag subnet rows with "ip3.",
            // union with the per-IP rows, then join on the driver.
            JavaPairDStream<String, Serializable> result3_2 = result34
                    .mapToPair(d -> new Tuple2<String, Serializable>("ip3." + d._1(), d._2()));
            JavaPairDStream<String, Serializable> result5 = result3_2.union(result4);
            JavaPairDStream<String, Serializable> result6 = result5
                    .transform((Function<JavaPairRDD<String, Serializable>, JavaRDD<Tuple2<String, Serializable>>>) t -> {
                        // Collect the subnet feature map to the driver and append the matching
                        // subnet's features to every IP row.
                        Map<String, Serializable> values = t
                                .filter(d -> d._1().startsWith("ip3."))
                                .mapToPair(d -> new Tuple2<String, Serializable>(d._1().substring(4), d._2()))
                                .collectAsMap();
                        JavaPairRDD<String, Serializable> t1 = t.filter(d -> !d._1().startsWith("ip3."));
                        return t1.map(d -> {
                            String v = (String) d._2();
                            String ip3 = DataByFeature.getIp3(d._1());
                            if (values.containsKey(ip3)) {
                                v += Feature.SEP + values.get(ip3);
                            }
                            return new Tuple2<String, Serializable>(d._1(), v);
                        });
                    })
                    .filter(d -> !d._1().startsWith("ip3."))
                    .mapToPair(d -> d);
            // Sink: one JSON message per IP to the "<app>_FEATURE" topic.
            result6.foreachRDD(new VoidFunction<JavaPairRDD<String, Serializable>>() {
                public void call(JavaPairRDD<String, Serializable> v1) throws Exception {
                    v1.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Serializable>>>() {
                        public void call(Iterator<Tuple2<String, Serializable>> it) throws Exception {
                            // One shared producer per executor JVM.
                            KafkaProducer kafkaProducer = KafkaProducer.getInstance();
                            while (it.hasNext()) {
                                Tuple2<String, Serializable> d = it.next();
                                Map<String, Object> map = new HashMap<>();
                                Map<String, Object> reqMap = new HashMap<>();
                                reqMap.put("client_ip", d._1());
                                Map<String, Object> dataProps = new HashMap<>();
                                for (String nv : d._2().toString().split(Feature.SEP)) {
                                    if (nv.startsWith("{")) {
                                        // The raw-request JSON entry (client_time) merged above.
                                        reqMap.putAll((Map) JSONValue.parse(nv));
                                    } else {
                                        // BUGFIX: split with limit 2 so feature values containing
                                        // '=' are not truncated, and skip malformed entries instead
                                        // of throwing ArrayIndexOutOfBoundsException.
                                        String[] parts = nv.trim().split("=", 2);
                                        if (parts.length == 2) {
                                            dataProps.put(parts[0], parts[1]);
                                        }
                                    }
                                }
                                map.put("req", reqMap);
                                map.put("data", dataProps);
                                String data = JSONValue.toJSONString(map);
                                kafkaProducer.send(new KeyedMessage<String, String>(app + "_FEATURE", "D", data));
                            }
                        }
                    });
                }
            });
            jsc.start();
            try {
                jsc.awaitTermination();
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                jsc.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}