Stream Window Join (Dual-Stream Join)

When two data streams both change continuously and are related to each other, a dual-stream join can correlate them. Could async I/O be used here? Async I/O mainly targets the case where one side is essentially fixed (such as a geo dictionary) while the other stream keeps changing. Could broadcast state? Broadcast state is only suitable when the broadcast data is small and changes infrequently.

To join two data streams, we must create the conditions for records from both streams to appear at the same time in the same place (the same subtask), rather than streaking past each other like two meteors. First, keyBy both streams on the join condition so that matching records land in the same partition. Second, assign windows so that records from both streams slow down and wait for each other, which solves the problem of appearing within the same time span. Alternatively, both streams can cache their records in state with a TTL for a period of time; a sketch of that approach follows.
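A minimal sketch of the TTL-cache approach, assuming the Tuple3<Long, String, String> record shape used in the examples below (the class name TtlCacheJoinFunction and the 10-second TTL are illustrative, not from the original): each side buffers its records in keyed ListState with a TTL and probes the other side's buffer as records arrive.

package cn._51doit.flink.day09;

import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.state.StateTtlConfig;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.util.Collector;

//sketch: cache both sides in keyed state with a TTL, probe the opposite side on arrival
public class TtlCacheJoinFunction extends CoProcessFunction<
        Tuple3<Long, String, String>, Tuple3<Long, String, String>,
        Tuple5<Long, String, String, Long, String>> {

    private transient ListState<Tuple3<Long, String, String>> leftBuffer;
    private transient ListState<Tuple3<Long, String, String>> rightBuffer;

    @Override
    public void open(Configuration parameters) {
        //expire cached records 10 seconds after they were written (illustrative TTL)
        StateTtlConfig ttl = StateTtlConfig.newBuilder(Time.seconds(10)).build();
        ListStateDescriptor<Tuple3<Long, String, String>> leftDesc =
                new ListStateDescriptor<>("left-buffer", Types.TUPLE(Types.LONG, Types.STRING, Types.STRING));
        leftDesc.enableTimeToLive(ttl);
        ListStateDescriptor<Tuple3<Long, String, String>> rightDesc =
                new ListStateDescriptor<>("right-buffer", Types.TUPLE(Types.LONG, Types.STRING, Types.STRING));
        rightDesc.enableTimeToLive(ttl);
        leftBuffer = getRuntimeContext().getListState(leftDesc);
        rightBuffer = getRuntimeContext().getListState(rightDesc);
    }

    @Override
    public void processElement1(Tuple3<Long, String, String> left, Context ctx,
                                Collector<Tuple5<Long, String, String, Long, String>> out) throws Exception {
        leftBuffer.add(left); //cache the left record for later right arrivals
        for (Tuple3<Long, String, String> right : rightBuffer.get()) {
            out.collect(Tuple5.of(left.f0, left.f1, left.f2, right.f0, right.f2));
        }
    }

    @Override
    public void processElement2(Tuple3<Long, String, String> right, Context ctx,
                                Collector<Tuple5<Long, String, String, Long, String>> out) throws Exception {
        rightBuffer.add(right); //cache the right record for later left arrivals
        for (Tuple3<Long, String, String> left : leftBuffer.get()) {
            out.collect(Tuple5.of(left.f0, left.f1, left.f2, right.f0, right.f2));
        }
    }
}

It would be wired up as keyedStream1.connect(keyedStream2).process(new TtlCacheJoinFunction()), with both streams keyed on the join key.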

Tumbling window join

Sliding window join

Session window join

IntervalJoin

join(TumblingEventTimeWindows)

package cn._51doit.flink.day09;

import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;

import java.time.Duration;

/**
 * Join two data streams.
 *
 * For two streams to be able to join, two conditions must be met:
 * 1. The records are spread across machines, so records whose join keys are equal must be shuffled over the network into the same partition on the same machine (keyBy on the join condition)
 * 2. Records in each stream must slow down and wait for the other stream (assign windows of the same type and the same length to both streams)
 *
 *
 */
public class EventTimeTumblingWindowJoin {

    public static void main(String[] args) throws Exception {


        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //1000,c1,300
        //4999,c1,300
        //5000,c2,200
        DataStreamSource<String> lines1 = env.socketTextStream("localhost", 8888);
        //1200,c1,图书
        //5001,c2,家具
        DataStreamSource<String> lines2 = env.socketTextStream("localhost", 9999);

        //expected result
        //1000,c1,300,1200,图书
        //join on event time with a 5-second window, using the new API for extracting event time and generating watermarks
        SingleOutputStreamOperator<String> lines1WithWaterMark = lines1.assignTimestampsAndWatermarks(WatermarkStrategy.<String>forBoundedOutOfOrderness(Duration.ofSeconds(0)).withTimestampAssigner(new SerializableTimestampAssigner<String>() {
            @Override
            public long extractTimestamp(String element, long recordTimestamp) {
                return Long.parseLong(element.split(",")[0]);
            }
        }));

        SingleOutputStreamOperator<Tuple3<Long, String, String>> tpStream1WithWaterMark = lines1WithWaterMark.map(new MapFunction<String, Tuple3<Long, String, String>>() {
            @Override
            public Tuple3<Long, String, String> map(String input) throws Exception {
                String[] fields = input.split(",");
                return Tuple3.of(Long.parseLong(fields[0]), fields[1], fields[2]);
            }
        });

        SingleOutputStreamOperator<String> lines2WithWaterMark = lines2.assignTimestampsAndWatermarks(WatermarkStrategy.<String>forBoundedOutOfOrderness(Duration.ofSeconds(0)).withTimestampAssigner(new SerializableTimestampAssigner<String>() {
            @Override
            public long extractTimestamp(String element, long recordTimestamp) {
                return Long.parseLong(element.split(",")[0]);
            }
        }));

        SingleOutputStreamOperator<Tuple3<Long, String, String>> tpStream2WithWaterMark = lines2WithWaterMark.map(new MapFunction<String, Tuple3<Long, String, String>>() {
            @Override
            public Tuple3<Long, String, String> map(String input) throws Exception {
                String[] fields = input.split(",");
                return Tuple3.of(Long.parseLong(fields[0]), fields[1], fields[2]);
            }
        });
        //https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/dev/datastream/operators/joining/
        //expected result
        //1000,c1,300,1200,图书
        DataStream<Tuple5<Long, String, String, Long, String>> res = tpStream1WithWaterMark.join(tpStream2WithWaterMark)
                .where(t1 -> t1.f1) //an explicit KeySelector would also work
                .equalTo(t2 -> t2.f1)
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                .apply(new JoinFunction<Tuple3<Long, String, String>, Tuple3<Long, String, String>, Tuple5<Long, String, String, Long, String>>() {
                    //when the window fires, records with the same key that fall into the same window are passed into the join method
                    @Override
                    public Tuple5<Long, String, String, Long, String> join(Tuple3<Long, String, String> first, Tuple3<Long, String, String> second) throws Exception {
                        return Tuple5.of(first.f0, first.f1, first.f2, second.f0, second.f2);
                    }
                }); //apply is a full-window function: joining starts only after the window fires and all of its data has arrived

        res.print();

        env.execute();
    }
}

Input:
Stream 1:
1000,c1,300
4999,c1,300
5000,c2,200

Stream 2:
1200,c1,图书
5001,c2,家具

Output:
1>(1000,c1,300,1200,图书)
1>(4999,c1,300,1200,图书)

Note: the window must fire on both streams before the joined results are emitted.
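The sliding window join and session window join listed above use exactly the same pipeline; only the assigner passed to window(...) changes. A sketch with illustrative sizes (both assigners live in org.apache.flink.streaming.api.windowing.assigners):

//sliding window join: 5-second windows sliding every 1 second
.window(SlidingEventTimeWindows.of(Time.seconds(5), Time.seconds(1)))

//session window join: the window closes after a 5-second event-time gap
.window(EventTimeSessionWindows.withGap(Time.seconds(5)))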

leftOuterJoin(TumblingEventTimeWindows)

The implementation replaces the join call with coGroup.

package cn._51doit.flink.day09;

import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.CoGroupFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

import java.time.Duration;

/**
 * Perform a leftOuterJoin on two data streams.
 *
 * Records from the left stream are emitted whether or not they find a match;
 * join is an inner join, which only emits records that match on both sides.
 *
 */
public class EventTimeTumblingWindowLeftOutJoin {

    public static void main(String[] args) throws Exception {


        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //1000,c1,300
        //4999,c8,300
        //4999,c1,300
        //5000,c2,200
        DataStreamSource<String> lines1 = env.socketTextStream("localhost", 8888);
        //1200,c1,图书
        //5001,c2,家具
        DataStreamSource<String> lines2 = env.socketTextStream("localhost", 9999);

        //expected result
        //1000,c1,300,1200,图书
        //leftOuterJoin on event time with a 5-second window, using the new API for extracting event time and generating watermarks
        SingleOutputStreamOperator<String> lines1WithWaterMark = lines1.assignTimestampsAndWatermarks(WatermarkStrategy.<String>forBoundedOutOfOrderness(Duration.ofSeconds(0)).withTimestampAssigner(new SerializableTimestampAssigner<String>() {
            @Override
            public long extractTimestamp(String element, long recordTimestamp) {
                return Long.parseLong(element.split(",")[0]);
            }
        }));

        SingleOutputStreamOperator<Tuple3<Long, String, String>> tpStream1WithWaterMark = lines1WithWaterMark.map(new MapFunction<String, Tuple3<Long, String, String>>() {
            @Override
            public Tuple3<Long, String, String> map(String input) throws Exception {
                String[] fields = input.split(",");
                return Tuple3.of(Long.parseLong(fields[0]), fields[1], fields[2]);
            }
        });

        SingleOutputStreamOperator<String> lines2WithWaterMark = lines2.assignTimestampsAndWatermarks(WatermarkStrategy.<String>forBoundedOutOfOrderness(Duration.ofSeconds(0)).withTimestampAssigner(new SerializableTimestampAssigner<String>() {
            @Override
            public long extractTimestamp(String element, long recordTimestamp) {
                return Long.parseLong(element.split(",")[0]);
            }
        }));

        SingleOutputStreamOperator<Tuple3<Long, String, String>> tpStream2WithWaterMark = lines2WithWaterMark.map(new MapFunction<String, Tuple3<Long, String, String>>() {
            @Override
            public Tuple3<Long, String, String> map(String input) throws Exception {
                String[] fields = input.split(",");
                return Tuple3.of(Long.parseLong(fields[0]), fields[1], fields[2]);
            }
        });
        //https://nightlies.apache.org/flink/flink-docs-release-1.14/docs/dev/datastream/operators/joining/
        //expected result
        //1000,c1,300,1200,图书
        //4999,c8,300,null,null
        DataStream<Tuple5<Long, String, String, Long, String>> res = tpStream1WithWaterMark.coGroup(tpStream2WithWaterMark)
                .where(t -> t.f1)
                .equalTo(t -> t.f1)
                .window(TumblingEventTimeWindows.of(Time.seconds(5)))
                .apply(new CoGroupFunction<Tuple3<Long, String, String>, Tuple3<Long, String, String>, Tuple5<Long, String, String, Long, String>>() {

                    /**
                     * When the window fires, coGroup is called once for each key.
                     * Three cases lead to a coGroup call:
                     * 1. Both streams contain records with the same key in the same window: neither Iterable is empty.
                     * 2. Only the first stream contains records for the key: the first Iterable is non-empty, the second is empty.
                     * 3. Only the second stream contains records for the key: the second Iterable is non-empty, the first is empty.
                     * @param first
                     * @param second
                     * @param out
                     * @throws Exception
                     */
                    @Override
                    public void coGroup(Iterable<Tuple3<Long, String, String>> first, Iterable<Tuple3<Long, String, String>> second, Collector<Tuple5<Long, String, String, Long, String>> out) throws Exception {
                        //implement the left outer join
                        //iterate over the left stream's records first
                        for (Tuple3<Long, String, String> left : first) {
                            boolean isJoined = false; //reset the flag for each left record
                            for (Tuple3<Long, String, String> right : second) {
                                isJoined = true;
                                //emit the joined record
                                out.collect(Tuple5.of(left.f0, left.f1, left.f2, right.f0, right.f2));
                            }
                            if (!isJoined) {
                                out.collect(Tuple5.of(left.f0, left.f1, left.f2, null, null));
                            }
                        }
                    }
                }); //likewise, iterating over second in the outer loop implements a right outer join (see the sketch after the sample output below)

        res.print();

        env.execute();
    }
}
Input:
Stream 1:
1000,c1,300
4999,c8,300
4999,c1,300
5000,c2,200

Stream 2:
1200,c1,图书
5001,c2,家具

Output:
2>(4999,c8,300,null,null)
1>(1000,c1,300,1200,图书)
1>(4999,c1,300,1200,图书)

Note: the window must fire on both streams before the joined results are emitted.
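As the comment in the code notes, iterating over second in the outer loop yields a right outer join. A sketch of that coGroup body, using the same types as the example above:

@Override
public void coGroup(Iterable<Tuple3<Long, String, String>> first, Iterable<Tuple3<Long, String, String>> second, Collector<Tuple5<Long, String, String, Long, String>> out) throws Exception {
    //iterate over the right stream's records first
    for (Tuple3<Long, String, String> right : second) {
        boolean isJoined = false;
        for (Tuple3<Long, String, String> left : first) {
            isJoined = true;
            out.collect(Tuple5.of(left.f0, left.f1, left.f2, right.f0, right.f2));
        }
        if (!isJoined) {
            //no matching record on the left side: pad the left fields with null
            out.collect(Tuple5.of(null, null, null, right.f0, right.f2));
        }
    }
}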

IntervalJoin

Joins two streams by a time range relative to each record instead of by windows; the implementation keys both streams, calls intervalJoin, and processes matched pairs with a ProcessJoinFunction.

package cn._51doit.flink.day09;

import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

import java.time.Duration;

/**
 * Join two data streams without dividing them into windows, by a time range: intervalJoin.
 *
 * Timestamps are compared relative to records of the first stream.
 *
 * Implementation steps:
 * 1. keyBy both streams on the same join condition (guarantees that records with equal keys reach the same partition on the same machine)
 * 2. cache both streams' records in KeyedState, then connect the two streams (so they can share state)
 *
 */
public class EventTimeIntervalJoin {

    public static void main(String[] args) throws Exception {


        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        //1000,c1,300
        //2999,c8,300
        //3000,c8,300
        //5000,c2,200
        DataStreamSource<String> lines1 = env.socketTextStream("localhost", 8888);
        //1200,c1,图书
        //1999,c1,图书
        //2000,c8,电话
        //5001,c2,家具
        DataStreamSource<String> lines2 = env.socketTextStream("localhost", 9999);

        //expected result
        //1000,c1,300,1200,图书
        //join on event time, using the new API for extracting event time and generating watermarks
        SingleOutputStreamOperator<String> lines1WithWaterMark = lines1.assignTimestampsAndWatermarks(WatermarkStrategy.<String>forBoundedOutOfOrderness(Duration.ofSeconds(0)).withTimestampAssigner(new SerializableTimestampAssigner<String>() {
            @Override
            public long extractTimestamp(String element, long recordTimestamp) {
                return Long.parseLong(element.split(",")[0]);
            }
        }));

        SingleOutputStreamOperator<Tuple3<Long, String, String>> tpStream1WithWaterMark = lines1WithWaterMark.map(new MapFunction<String, Tuple3<Long, String, String>>() {
            @Override
            public Tuple3<Long, String, String> map(String input) throws Exception {
                String[] fields = input.split(",");
                return Tuple3.of(Long.parseLong(fields[0]), fields[1], fields[2]);
            }
        });

        SingleOutputStreamOperator<String> lines2WithWaterMark = lines2.assignTimestampsAndWatermarks(WatermarkStrategy.<String>forBoundedOutOfOrderness(Duration.ofSeconds(0)).withTimestampAssigner(new SerializableTimestampAssigner<String>() {
            @Override
            public long extractTimestamp(String element, long recordTimestamp) {
                return Long.parseLong(element.split(",")[0]);
            }
        }));

        SingleOutputStreamOperator<Tuple3<Long, String, String>> tpStream2WithWaterMark = lines2WithWaterMark.map(new MapFunction<String, Tuple3<Long, String, String>>() {
            @Override
            public Tuple3<Long, String, String> map(String input) throws Exception {
                String[] fields = input.split(",");
                return Tuple3.of(Long.parseLong(fields[0]), fields[1], fields[2]);
            }
        });

        //keyBy the first stream
        KeyedStream<Tuple3<Long, String, String>, String> keyedStream1 = tpStream1WithWaterMark.keyBy(t -> t.f1);
        //then keyBy the second stream
        KeyedStream<Tuple3<Long, String, String>, String> keyedStream2 = tpStream2WithWaterMark.keyBy(t -> t.f1);

        SingleOutputStreamOperator<Tuple5<Long, String, String, Long, String>> res = keyedStream1.intervalJoin(keyedStream2)
                .between(Time.seconds(-1), Time.seconds(1)) //time range relative to each record of the first (left) stream: match right records whose timestamps lie in [left - 1s, left + 1s]
                .upperBoundExclusive() //exclude the upper bound: [lower, upper)
                .process(new ProcessJoinFunction<Tuple3<Long, String, String>, Tuple3<Long, String, String>, Tuple5<Long, String, String, Long, String>>() {

                    @Override
                    public void processElement(Tuple3<Long, String, String> left, Tuple3<Long, String, String> right, Context ctx, Collector<Tuple5<Long, String, String, Long, String>> out) throws Exception {

                        out.collect(Tuple5.of(left.f0, left.f1, left.f2, right.f0, right.f2));

                    }
                });

        res.print();

        env.execute();
    }
}

 

Input:
Stream 1:
1000,c1,300

Stream 2:
1200,c1,图书

Output:
1>(1000,c1,300,1200,图书)

----------------------
Input:
Stream 2:
1999,c1,图书

Output:
1>(1000,c1,300,1999,图书)

----------------------
Input:
Stream 2:
2000,c1,图书

Output:
(none)

-----------------------
Input:
Stream 2:
2000,c8,电话

Stream 1:
2999,c8,300
3000,c8,300

Output:
2>(2999,c8,300,2000,电话)
2>(3000,c8,300,2000,电话)