很多架构开始往流批一体进行过渡,其中flink面临最大的挑战之一就是做好流join
引言
本篇文章借鉴:https://www.jianshu.com/p/3872d6d81cfd
一般来说,我们遇到的join场景有普通的join、left join、right join等等,结合这几种场景来简单讲一下。
join的前提操作是指定了watermarks,保证数据在一定时间内满足条件的join。
windowJoin
通过窗口,匹配在这一个时间窗口内的,实现方式较简单。
使用模版
// Window-join template: pairs elements of the two streams that share a key
// and fall into the same window; one output per matching (left, right) pair.
stream.join(otherStream)
.where(<KeySelector>) // key extractor for elements of `stream`
.equalTo(<KeySelector>) // key extractor for elements of `otherStream`
.window(<WindowAssigner>) // e.g. TumblingEventTimeWindows.of(...)
.apply(<JoinFunction>) // combines each matching pair into one result
/** Inner window-join example: builds one `Obj3` from a matching element of
 *  each stream (only called for pairs that share a key in the same window). */
class InnerWindowJoinFunction extends JoinFunction[Obj1, Obj2, Obj3] {
  override def join(first: Obj1, second: Obj2): Obj3 =
    Obj3(first.elem1, first.elem2, second.elem1)
}
coGroupJoin
通过窗口实现,以一条流为标准,匹配每个点在时间区间内的数据,匹配不到指定为0或者null,实现方式不算复杂。
使用模版
// CoGroup template: unlike join, the function receives ALL elements of both
// streams for a key+window at once, so outer-join semantics can be emulated.
stream.coGroup(otherStream)
.where(<KeySelector>) // key extractor for elements of `stream`
.equalTo(<KeySelector>) // key extractor for elements of `otherStream`
.window(<WindowAssigner>) // e.g. TumblingEventTimeWindows.of(...)
.apply(<CoGroupFunction>) // receives two Iterables (left group, right group)
// Left-outer join emulated with coGroup: every left element is emitted; when
// no right element shares its key in the window, a default value (0L) is used.
// NOTE(review): the fields accessed here (orderId/amount/itemId) do not match
// the `elem1`/`elem2` fields the other Obj1/Obj2 examples use — presumably
// this snippet was copied from the concrete version below; verify the
// intended pseudo-type fields.
class LeftWindowJoinFunction extends CoGroupFunction[Obj1,Obj2,Obj3]{
override def coGroup(first: java.lang.Iterable[Obj1],
second: java.lang.Iterable[Obj2],
out: Collector[Obj3]): Unit = {
/**
* Convert the Java Iterables into Scala collections.
*/
import scala.collection.JavaConverters._
val scalaT1 = first.asScala.toList
val scalaT2 = second.asScala.toList
for (left <- scalaT1) {
var flag = false // tracks whether this left element found any right-side match
for (right <- scalaT2) {
out.collect(Obj3(left.orderId,left.amount,right.itemId))
flag = true;
}
if (!flag){ // no right-side match for this key: emit the default itemId (0L)
out.collect(Obj3(left.orderId,left.amount,0L))
}
}
}
}
intervalJoin
区别上边两种的是没有窗口,以一条流为标准,匹配每个点在时间区间内的数据,实现方式较简单。
使用模版
// Interval-join template: no window; each left element matches right elements
// whose timestamps lie within [left.ts + lowerBound, left.ts + upperBound].
stream.intervalJoin(otherStream)
.between(<-time>, <+time>) // lower / upper relative time bounds
.process(<ProcessJoinFunction>) // called once per matching pair
/** Interval-join example: called once per (left, right) pair whose event
 *  timestamps fall within the bounds configured via `between(...)`. */
class IntervalJoinFunction extends ProcessJoinFunction[Obj1, Obj2, Obj3] {
  override def processElement(left: Obj1,
                              right: Obj2,
                              ctx: ProcessJoinFunction[Obj1, Obj2, Obj3]#Context,
                              out: Collector[Obj3]): Unit = {
    // Fix: the result type is `Obj3` — the original called lowercase `obj3`,
    // which does not resolve to the case-class constructor used elsewhere.
    out.collect(Obj3(left.elem1, left.elem2, right.elem2))
  }
}
完整代码
import org.apache.flink.api.common.functions.{CoGroupFunction, JoinFunction}
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction
import org.apache.flink.util.Collector
import org.apache.flink.api.common.functions.CoGroupFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import java.text.SimpleDateFormat
// Two order streams plus a result type, used to exercise the dual-stream joins.
// Amount-side events; timeStamp is event time in epoch milliseconds.
case class OrderLogEvent1(orderId:Long,amount:Double, timeStamp:Long)
// Item-side events; timeStamp is event time in epoch milliseconds.
case class OrderLogEvent2(orderId:Long,itemId:Long, timeStamp:Long)
// Joined output: amount from stream 1 + itemId from stream 2 (0L when unmatched).
case class OrderResultEvent(orderId:Long,amount:Double, itemId:Long)
/** Demonstrates the three join variants (window join, coGroup left join,
 *  interval join) on two small keyed order streams. */
object Join {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Amount-side stream. orderId 4 occurs twice and ids 3/6/7 exist on only
    // one side, so the three join variants produce visibly different output.
    val amountStream = env.fromCollection(List(
      OrderLogEvent1(1L, 22.1, getTime("2020-04-29 13:01")),
      OrderLogEvent1(2L, 22.2, getTime("2020-04-29 13:03")),
      OrderLogEvent1(4L, 22.3, getTime("2020-04-29 13:04")),
      OrderLogEvent1(4L, 22.4, getTime("2020-04-29 13:05")),
      OrderLogEvent1(5L, 22.5, getTime("2020-04-29 13:07")),
      OrderLogEvent1(6L, 22.6, getTime("2020-04-29 13:09"))
    ))
      .assignTimestampsAndWatermarks(
        new BoundedOutOfOrdernessTimestampExtractor[OrderLogEvent1](Time.seconds(5)) {
          override def extractTimestamp(element: OrderLogEvent1): Long = element.timeStamp
        })
      .keyBy(_.orderId)

    // Item-side stream, watermarked the same way (5s allowed out-of-orderness).
    val itemStream = env.fromCollection(List(
      OrderLogEvent2(1L, 121, getTime("2020-04-29 13:01")),
      OrderLogEvent2(2L, 122, getTime("2020-04-29 13:03")),
      OrderLogEvent2(3L, 123, getTime("2020-04-29 13:04")),
      OrderLogEvent2(4L, 124, getTime("2020-04-29 13:05")),
      OrderLogEvent2(5L, 125, getTime("2020-04-29 13:07")),
      OrderLogEvent2(7L, 126, getTime("2020-04-29 13:09"))
    ))
      .assignTimestampsAndWatermarks(
        new BoundedOutOfOrdernessTimestampExtractor[OrderLogEvent2](Time.seconds(5)) {
          override def extractTimestamp(element: OrderLogEvent2): Long = element.timeStamp
        })
      .keyBy(_.orderId)

    // 1) Inner join inside a 5-minute tumbling event-time window.
    amountStream
      .join(itemStream)
      .where(_.orderId)
      .equalTo(_.orderId)
      .window(TumblingEventTimeWindows.of(Time.minutes(5)))
      .apply(new InnerWindowJoinFunction)
      .print()

    // 2) Left outer join emulated with coGroup over the same window.
    amountStream
      .coGroup(itemStream)
      .where(_.orderId)
      .equalTo(_.orderId)
      .window(TumblingEventTimeWindows.of(Time.minutes(5)))
      .apply(new LeftWindowJoinFunction)
      .print()

    // 3) Interval join: match right elements within [-2min, +2min] of each left.
    amountStream
      .intervalJoin(itemStream)
      .between(Time.minutes(-2), Time.minutes(2))
      .process(new IntervalJoinFunction)
      .print()

    env.execute("point_check")
  }

  /** Parses a "yyyy-MM-dd HH:mm" string (JVM default time zone) into epoch millis. */
  def getTime(tm: String): Long = {
    val formatter = new SimpleDateFormat("yyyy-MM-dd HH:mm")
    formatter.parse(tm).getTime
  }
}
/** Inner window join: emits one OrderResultEvent per (left, right) pair
 *  that shares an orderId within the same window. */
class InnerWindowJoinFunction extends JoinFunction[OrderLogEvent1, OrderLogEvent2, OrderResultEvent] {
  override def join(first: OrderLogEvent1, second: OrderLogEvent2): OrderResultEvent =
    OrderResultEvent(first.orderId, first.amount, second.itemId)
}
/**
 * Left-outer window join via coGroup: every left-side order is emitted; when
 * no right-side record shares its key in the window, itemId defaults to 0L.
 */
class LeftWindowJoinFunction extends CoGroupFunction[OrderLogEvent1, OrderLogEvent2, OrderResultEvent] {

  override def coGroup(first: java.lang.Iterable[OrderLogEvent1],
                       second: java.lang.Iterable[OrderLogEvent2],
                       out: Collector[OrderResultEvent]): Unit = {
    // Bridge the Java iterables into Scala collections.
    import scala.collection.JavaConverters._
    val lefts = first.asScala.toList
    val rights = second.asScala.toList

    lefts.foreach { l =>
      if (rights.isEmpty) {
        // No right-side match for this key in the window: emit the default itemId.
        out.collect(OrderResultEvent(l.orderId, l.amount, 0L))
      } else {
        rights.foreach(r => out.collect(OrderResultEvent(l.orderId, l.amount, r.itemId)))
      }
    }
  }
}
/** Interval join: invoked once per (left, right) pair whose event times fall
 *  within the lower/upper bounds configured via `between(...)`. */
class IntervalJoinFunction extends ProcessJoinFunction[OrderLogEvent1, OrderLogEvent2, OrderResultEvent] {
  override def processElement(left: OrderLogEvent1,
                              right: OrderLogEvent2,
                              ctx: ProcessJoinFunction[OrderLogEvent1, OrderLogEvent2, OrderResultEvent]#Context,
                              out: Collector[OrderResultEvent]): Unit = {
    val joined = OrderResultEvent(left.orderId, left.amount, right.itemId)
    out.collect(joined)
  }
}