Flink Stream join && intervalJoin && coGroup的区别
Flink DataStream Api
提供了3种Stream join
的算子,分别是join
,intervalJoin
和coGroup
算子。
join
:是一个流join另一个流,需要设置窗口,2个流join需要的key字段。使用的是innerJoin
。对Processing Time
和Event Time
都支持。
intervalJoin
:是一个流join另一个流,不需要设置窗口,但是需要设置流join的时间范围(需要时间字段),仅支持Event Time的计算。
coGroup
:和join类似,不过CoGroupFunction
和JoinFunction
的参数不一样。coGroup
是需要自己组装数据。
下面具体看代码实现:
join
/**
* join 测试数据
* <p>
* User{userId='1001', name='caocao', age='20', sex='null', createTime=1561023040338, updateTime=1561023040338}
* Order{orderId='1001', userId='1001', price='10', timestamp=1561023042640}
* Order{orderId='1002', userId='1001', price='20', timestamp=1561023043649}
* Order{orderId='1003', userId='1002', price='30', timestamp=1561023044651}
* order join user> (1001,caocao,20,1001,1001,10,1561023042640)
* order join user> (1001,caocao,20,1001,1002,20,1561023043649)
*/
public class JoinOperator {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
sEnv.setParallelism(1);
Properties p = new Properties();
p.setProperty("bootstrap.servers", "localhost:9092");
DataStreamSource<String> order = sEnv.addSource(new FlinkKafkaConsumer010<String>("order", new SimpleStringSchema(), p));
DataStreamSource<String> user = sEnv.addSource(new FlinkKafkaConsumer010<String>("user", new SimpleStringSchema(), p));
SingleOutputStreamOperator<Order> operator = order.map(new MapFunction<String, Order>() {
@Override
public Order map(String value) throws Exception {
return new Gson().fromJson(value, Order.class);
}
});
operator.print();
SingleOutputStreamOperator<User> operator1 = user.map(new MapFunction<String, User>() {
@Override
public User map(String value) throws Exception {
return new Gson().fromJson(value, User.class);
}
});
operator1.print();
// 使用的是inner join 具体看测试数据
operator.join(operator1)
.where(new KeySelector<Order, String>() {
@Override
public String getKey(Order value) throws Exception {
return value.userId;
}
})
.equalTo(new KeySelector<User, String>() {
@Override
public String getKey(User value) throws Exception {
return value.userId;
}
}).window(TumblingProcessingTimeWindows.of(Time.minutes(1), Time.seconds(30)))
.trigger(ProcessingTimeTrigger.create())
.apply(new JoinFunction<Order, User, Tuple7<String, String, String, String, String, String, Long>>() {
@Override
public Tuple7<String, String, String, String, String, String, Long> join(Order order, User user) throws Exception {
return new Tuple7<>(user.userId, user.name, user.age, order.userId, order.orderId, order.price, order.timestamp);
}
})
.print("order join user");
sEnv.execute("JoinOperator");
}
}
join需要在window内操作。然后在JoinFunction算子,返回join后的内容。
intervalJoin
/**
* 测试数据
* <p>
* User{userId='1001', name='caocao', age='20', sex='null', createTime=1561024902404, updateTime=1561024902404}
* Order{orderId='1001', userId='1001', price='10', timestamp=1561024905185}
* (1001,caocao,20,1001,1001,10,1561024905185)
* Order{orderId='1002', userId='1001', price='20', timestamp=1561024906197}
* (1001,caocao,20,1001,1002,20,1561024906197)
* Order{orderId='1003', userId='1002', price='30', timestamp=1561024907198}
*/
public class IntervalJoinOperator {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
sEnv.setParallelism(1);
sEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
Properties p = new Properties();
p.setProperty("bootstrap.servers", "localhost:9092");
SingleOutputStreamOperator<Order> order = sEnv
.addSource(new FlinkKafkaConsumer010<String>("order", new SimpleStringSchema(), p))
.map(new MapFunction<String, Order>() {
@Override
public Order map(String value) throws Exception {
return new Gson().fromJson(value, Order.class);
}
}).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<Order>() {
@Override
public long extractAscendingTimestamp(Order element) {
return element.timestamp;
}
});
order.print();
SingleOutputStreamOperator<User> user = sEnv
.addSource(new FlinkKafkaConsumer010<String>("user", new SimpleStringSchema(), p))
.map(new MapFunction<String, User>() {
@Override
public User map(String value) throws Exception {
return new Gson().fromJson(value, User.class);
}
}).assignTimestampsAndWatermarks(new AscendingTimestampExtractor<User>() {
@Override
public long extractAscendingTimestamp(User element) {
return element.createTime;
}
});
user.print();
order
.keyBy("userId")
.intervalJoin(user.keyBy("userId"))
// between 只支持 event time
.between(Time.seconds(-60), Time.seconds(60))
.lowerBoundExclusive()
.upperBoundExclusive()
.process(new ProcessJoinFunction<Order, User, Tuple7<String, String, String, String, String, String, Long>>() {
@Override
public void processElement(Order order, User user, Context ctx, Collector<Tuple7<String, String, String, String, String, String, Long>> out) throws Exception {
out.collect(new Tuple7<>(user.userId, user.name, user.age, order.userId, order.orderId, order.price, order.timestamp));
}
})
.print();
sEnv.execute("IntervalJoinOperator");
}
}
between(Time.seconds(-60), Time.seconds(60)):默认边界是包含的,相当于 order.timestamp - 60s <= user.createTime <= order.timestamp + 60s
lowerBoundExclusive():排除下边界,即 order.timestamp - 60s < user.createTime
upperBoundExclusive():排除上边界,即 user.createTime < order.timestamp + 60s
coGroup
/**
 * {@code coGroup} demo: like {@code join}, but the apply function receives one
 * iterable per side for each key/window, so it also sees keys that matched on
 * only one side — the caller assembles the output itself.
 * <p>
 * Sample data:
 * User{userId='1001', name='caocao', age='20', sex='male', createTime=1561087784197, updateTime=1561087784197}
 * Order{orderId='1001', userId='1001', price='10', timestamp=1561087786816}
 * Order{orderId='1002', userId='1001', price='20', timestamp=1561087787831}
 * Order{orderId='1003', userId='1002', price='30', timestamp=1561087788832}
 * ---------------------------------
 * [Order{orderId='1001', userId='1001', price='10', timestamp=1561087786816}, Order{orderId='1002', userId='1001', price='20', timestamp=1561087787831}]
 * [User{userId='1001', name='caocao', age='20', sex='male', createTime=1561087784197, updateTime=1561087784197}]
 * ---------------------------------
 * [Order{orderId='1003', userId='1002', price='30', timestamp=1561087788832}]
 * []
 */
public class JoinAndCoGroupOperator {

    // Gson is thread-safe and used statelessly here; share one instance
    // instead of allocating a new parser for every incoming record.
    private static final Gson GSON = new Gson();

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment sEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        sEnv.setParallelism(1);

        Properties p = new Properties();
        p.setProperty("bootstrap.servers", "localhost:9092");

        DataStreamSource<String> order = sEnv.addSource(new FlinkKafkaConsumer010<String>("order", new SimpleStringSchema(), p));
        DataStreamSource<String> user = sEnv.addSource(new FlinkKafkaConsumer010<String>("user", new SimpleStringSchema(), p));

        // Deserialize the raw JSON payloads into POJOs.
        SingleOutputStreamOperator<Order> operator = order.map(new MapFunction<String, Order>() {
            @Override
            public Order map(String value) throws Exception {
                return GSON.fromJson(value, Order.class);
            }
        });
        operator.print();

        SingleOutputStreamOperator<User> operator1 = user.map(new MapFunction<String, User>() {
            @Override
            public User map(String value) throws Exception {
                return GSON.fromJson(value, User.class);
            }
        });
        operator1.print();

        // Unlike join, coGroup surfaces both matched and unmatched groups: each
        // side arrives as an iterable and the caller assembles the result.
        operator.coGroup(operator1)
                .where(new KeySelector<Order, String>() {
                    @Override
                    public String getKey(Order value) throws Exception {
                        return value.userId;
                    }
                })
                .equalTo(new KeySelector<User, String>() {
                    @Override
                    public String getKey(User value) throws Exception {
                        return value.userId;
                    }
                })
                .window(TumblingProcessingTimeWindows.of(Time.minutes(1), Time.seconds(30)))
                .trigger(ProcessingTimeTrigger.create())
                .apply(new CoGroupFunction<Order, User, Tuple7<String, String, String, String, String, String, Long>>() {
                    @Override
                    public void coGroup(Iterable<Order> first, Iterable<User> second, Collector<Tuple7<String, String, String, String, String, String, Long>> out) throws Exception {
                        // Demo only prints the two groups; nothing is collected.
                        System.out.println("---------------------------------");
                        System.out.println(first);
                        System.out.println(second);
                    }
                }).print("coGroup");

        // Fixed copy-paste job name (was "JoinOperator").
        sEnv.execute("JoinAndCoGroupOperator");
    }
}
coGroup既会输出匹配的结果,也会输出未匹配的结果,结果以迭代器的形式给出,需要自己组装数据。这是和join的区别。