前言
近些年来 Flink 大有取代 Spark Streaming 的趋势。window(窗口)是实时数据分析的核心函数,Flink 带 watermark(水印)的事件时间窗口函数在解决实时延迟、乱序等问题上相对于 Spark Streaming 有明显优势。
下面的示例介绍了 Flink 如何基于 event time 应用 watermark 从而实现我们的需求。
一、需求
flink接收kafka每5s传来的一条数据,实时不断的传入形成实时数据流,求下一条数据与上一条数据的差值。
数据:
time,1538359822000,data,T002.B026;1.000000;T001.A087;1.000000;T018.B026;1.00000;T002.B039;T002.B088;T002.B100
time,1538359827000,data,T002.B026;4.000000;T001.A087;4.000000;T018.B026;2.00000;T002.B039;T002.B088;T002.B100
time,1538359832000,data,T002.B026;6.000000;T001.A087;6.000000;T018.B026;5.00000;T002.B039;T002.B088;T002.B100
time,1538359837000,data,T002.B026;9.000000;T001.A087;9.000000;T018.B026;3.00000;T002.B039;T002.B088;T002.B100
time,1538359842000,data,T002.B026;11.000000;T001.A087;11.000000;T018.B026;9.00000;T002.B039;T002.B088;T002.B100
time,1538359847000,data,T002.B026;15.000000;T001.A087;15.000000;T018.B026;11.00000;T002.B039;T002.B088;T002.B100
time,1538359852000,data,T002.B026;20.000000;T001.A087;26.000000;T018.B026;20.00000;T002.B039;T002.B088;T002.B100
time,1538359857000,data,T002.B026;27.000000;T001.A087;27.000000;T018.B026;22.00000;T002.B039;T002.B088;T002.B100
time,1538359862000,data,T002.B026;29.000000;T001.A087;29.000000;T018.B026;26.00000;T002.B039;T002.B088;T002.B100
二、实现过程及详细步骤
1、数据解析成窗口函数所需要的tuple3格式(全部代码都在main函数中)。
// Port of the local socket source that feeds the raw lines.
int port = 9000;
// Obtain the Flink streaming execution environment.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Use event time; the default is processing time.
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
// Parallelism 1 so results arrive in a single, ordered stream
// (the default parallelism is the number of CPU cores).
env.setParallelism(1);
// Read newline-delimited records from the socket.
DataStream<String> text = env.socketTextStream("localhost", port, "\n");
// Parse each line into Tuple3<constantKey, eventTimestamp, metricMap>.
// Input format: time,<ts>,data,<tagA>;<valA>;<tagB>;<valB>;<tagC>;<valC>;<outA>;<outB>;<outC>
DataStream<Tuple3<String, String, HashMap<String, Object>>> data = text.map(new MapFunction<String, Tuple3<String, String, HashMap<String, Object>>>() {
    @Override
    public Tuple3<String, String, HashMap<String, Object>> map(String line) throws Exception {
        String[] parts = line.split(",");
        // parts[1] is the event timestamp in epoch milliseconds.
        String timestamp = parts[1];
        String[] fields = parts[3].split(";");
        // Map each source tag to "<value>_<targetTag>",
        // e.g. T002.B026 -> "1.000000_T002.B039".
        HashMap<String, Object> metrics = new HashMap<>();
        metrics.put(fields[0], fields[1] + "_" + fields[6]);
        metrics.put(fields[2], fields[3] + "_" + fields[7]);
        metrics.put(fields[4], fields[5] + "_" + fields[8]);
        // f0 is the constant key "1" so the later keyBy(0) sends every
        // record to the same group. (The original built an intermediate
        // HashMap and iterated its keySet() only to copy the same three
        // values into the tuple — that round-trip is removed here.)
        return new Tuple3<>("1", timestamp, metrics);
    }
});
2、利用 watermark 解决延迟、乱序等问题,这里设置的最大允许乱序时间是 10s。
// Assign event-time timestamps and emit periodic watermarks that lag the
// highest timestamp seen so far by the allowed lateness (10 s of out-of-orderness).
DataStream<Tuple3<String, String, HashMap<String, Object>>> waterMarkStream = data.assignTimestampsAndWatermarks(new AssignerWithPeriodicWatermarks<Tuple3<String, String, HashMap<String, Object>>>() {
    // Highest event timestamp observed so far.
    private long maxSeenTimestamp = 0L;
    // Maximum tolerated out-of-orderness: 10 seconds.
    private final long allowedLateness = 10000L;

    // Invoked periodically by Flink (every ~200 ms by default, configurable
    // via setAutoWatermarkInterval) to produce the current watermark.
    @Nullable
    @Override
    public Watermark getCurrentWatermark() {
        return new Watermark(maxSeenTimestamp - allowedLateness);
    }

    // Extract the event timestamp (epoch millis) carried in field f1.
    @Override
    public long extractTimestamp(Tuple3<String, String, HashMap<String, Object>> element, long previousElementTimestamp) {
        long eventTime = Long.parseLong(element.f1);
        if (eventTime > maxSeenTimestamp) {
            maxSeenTimestamp = eventTime;
        }
        return eventTime;
    }
});
3、经过水印处理后的数据,才能进入时间窗口函数,window函数大致分为四种:ReduceFunction,AggregateFunction,FoldFunction,ProcessWindowFunction,该示例中应用的窗口函数为ReduceFunction,即进入窗口的函数可以进行聚合操作。需要注意的是,该函数的输入和输出是保持一致的。
// Sliding event-time window (size 10 s, slide 5 s) over the keyed stream.
// Each window pairs two consecutive records; the ReduceFunction computes,
// per metric, the delta (later reading - earlier reading) and stores it
// under the target tag encoded after '_' in each value. The result keeps
// the earlier of the two timestamps.
DataStream<Tuple3<String, String, HashMap<String, Object>>> amountAllStream = waterMarkStream
    // Group by f0 — the constant key, so all records share one group.
    .keyBy(0)
    .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
    .reduce(new ReduceFunction<Tuple3<String, String, HashMap<String, Object>>>() {
        // Map format inside each tuple:
        // {T002.B026=1.000000_T002.B039, T001.A087=1.000000_T002.B088, T018.B026=1.000000_T002.B100}
        @Override
        public Tuple3<String, String, HashMap<String, Object>> reduce(
                Tuple3<String, String, HashMap<String, Object>> t1,
                Tuple3<String, String, HashMap<String, Object>> t2) throws Exception {
            // Parse each timestamp exactly once (the original re-parsed them
            // up to five times and duplicated the whole loop in two branches).
            long ts1 = Long.parseLong(t1.f1);
            long ts2 = Long.parseLong(t2.f1);
            // Order the two records by event time; on a tie t1 is treated as
            // the earlier one, matching the original branch order.
            Tuple3<String, String, HashMap<String, Object>> earlier = ts1 <= ts2 ? t1 : t2;
            Tuple3<String, String, HashMap<String, Object>> later = ts1 <= ts2 ? t2 : t1;
            HashMap<String, Object> earlierMap = earlier.f2;
            HashMap<String, Object> laterMap = later.f2;
            HashMap<String, Object> deltas = new HashMap<>();
            // delta = later reading - earlier reading, stored under the
            // target tag, e.g. {T002.B039=3.0}. Both records carry the same
            // key set and the same target-tag suffix for a given key.
            for (String key : earlierMap.keySet()) {
                String[] laterParts = laterMap.get(key).toString().split("_");
                String[] earlierParts = earlierMap.get(key).toString().split("_");
                double delta = Double.parseDouble(laterParts[0]) - Double.parseDouble(earlierParts[0]);
                deltas.put(laterParts[1], delta);
            }
            // Attribute the delta to the earlier timestamp.
            return new Tuple3<>(t1.f0, String.valueOf(Math.min(ts1, ts2)), deltas);
        }
    });
//得到经过异常处理的量值
DataStream<HashMap<String, Object>> amountExStream = amountAllStream.map(new MapFunction<Tuple3<String, String, HashMap<String, Object>>, HashMap<String, Object>>() {
@Override
public HashMap<String, Object> map(Tuple3<String, String, HashMap<String, Object>> tuple3) throws Exception {
//得到全部量值
HashMap<String, Object> map = tuple3.f2;
HashMap<String, Object> excMap = new HashMap<>();
HashMap<String, Object> excMap1 = new HashMap<>();
for (String k1 : map.keySet()) {
//如果数据中不包含"_" ,就进行判断
if (!map.get(k1).toString().contains("_")) {
//非空判断
if (map.get(k1) != null && !("").equals(map.get(k1))) {
double i = Double.parseDouble((map.get(k1)).toString());
//如果小于0,把量值置0
if (i < 0) {
//YC为0 即把异常量值置零
excMap1.put(k1, 0);
} else {
excMap1.put(k1, i);
}
}
}
}
//把数据重新放入一个map中,还原到基础map格式
//第一个是空的 判断如果为非空map
if (excMap1.size() > 0) {
excMap.put("data", excMap1);
excMap.put("time", tuple3.f1);
}
// {data={T002.B039=3.0, T002.B100=1.0, T002.B088=3.0}, time=1538359822000}
return excMap;
}
});
amountExStream.print();
env.execute();
最大允许乱序时间maxOutOfOrderness 设置为10s,窗口长度为10s,即最终延迟时间为20s,需等待第5条数据输入后第一条结果才会输出。当时间输入为1538359842000的时候,第一条数据的时间戳为1538359822000。
结果为:
{data={T002.B039=3.0, T002.B100=1.0, T002.B088=3.0}, time=1538359822000}
{data={T002.B039=2.0, T002.B100=3.0, T002.B088=2.0}, time=1538359827000}
{data={T002.B039=3.0, T002.B100=1.0, T002.B088=3.0}, time=1538359832000}
{data={T002.B039=2.0, T002.B100=3.0, T002.B088=2.0}, time=1538359837000}
三、注意
- 利用eventtime来界定窗口的大小,sources中就必须带有时间戳;
- 使用水印后面跟的必须是时间窗口;
- 该 maxOutOfOrderness 为 10s,可以尝试进行 10s 内的乱序数据插入,验证 watermark 的正确性。