Flink的三种join

很多架构开始往流批一体进行过渡,其中flink面临最大的挑战之一就是做好流join

引言

本篇文章借鉴:https://www.jianshu.com/p/3872d6d81cfd

一般来说,我们遇到的join场景有普通的join、left join、right join等等,结合这几种场景来简单讲一下。

join的前提是先为两条流指定watermarks,保证在一定时间范围内满足条件的数据能够join上。

windowJoin

通过窗口,匹配在这一个时间窗口内的,实现方式较简单。
使用模版

stream.join(otherStream)
    .where(<KeySelector>)
    .equalTo(<KeySelector>)
    .window(<WindowAssigner>)
    .apply(<JoinFunction>)
/**
 * Join function for the window join template.
 *
 * Builds an `Obj3` from `elem1` and `elem2` of the first element and
 * `elem1` of the second element.
 */
class InnerWindowJoinFunction extends JoinFunction[Obj1, Obj2, Obj3] {
  override def join(first: Obj1, second: Obj2): Obj3 =
    Obj3(first.elem1, first.elem2, second.elem1)
}

coGroupJoin

通过窗口实现,以一条流为标准,匹配每个点在时间区间内的数据,匹配不到指定为0或者null,实现方式不算复杂。
使用模版

stream.coGroup(otherStream)
    .where(<KeySelector>)
    .equalTo(<KeySelector>)
    .window(<WindowAssigner>)
    .apply(<CoGroupFunction>)
/**
 * CoGroup function for the left-join template.
 *
 * For every element of `first` it emits one result per element of `second`;
 * when `second` is empty it emits a single result whose item id is the
 * default value `0L`.
 */
class LeftWindowJoinFunction extends CoGroupFunction[Obj1,Obj2,Obj3]{
  override def coGroup(first: java.lang.Iterable[Obj1],
                       second: java.lang.Iterable[Obj2],
                       out: Collector[Obj3]): Unit = {
    // Convert the Java Iterables into Scala Lists.
    import scala.collection.JavaConverters._
    val lefts = first.asScala.toList
    val rights = second.asScala.toList

    lefts.foreach { left =>
      if (rights.isEmpty) {
        // Nothing on the right side for this left element: emit the default itemId 0L.
        out.collect(Obj3(left.orderId, left.amount, 0L))
      } else {
        rights.foreach { right =>
          out.collect(Obj3(left.orderId, left.amount, right.itemId))
        }
      }
    }
  }
}

intervalJoin

区别上边两种的是没有窗口,以一条流为标准,匹配每个点在时间区间内的数据,实现方式较简单。
使用模版

stream.intervalJoin(otherStream)
    .between(<-time>, <+time>)
    .process(<ProcessJoinFunction>)
/**
 * Process function for the interval join template.
 *
 * Emits one `Obj3` per (left, right) pair it is handed, built from
 * `elem1` and `elem2` of the left element and `elem2` of the right element.
 */
class IntervalJoinFunction extends ProcessJoinFunction[Obj1,Obj2,Obj3]{
  override def processElement(left: Obj1,
                              right: Obj2,
                              ctx: ProcessJoinFunction[Obj1, Obj2, Obj3]#Context,
                              out: Collector[Obj3]): Unit = {
    // Fixed: the result constructor is `Obj3` (was misspelled as lowercase `obj3`).
    out.collect(Obj3(left.elem1, left.elem2, right.elem2))
  }
}

完整代码


import org.apache.flink.api.common.functions.{CoGroupFunction, JoinFunction}
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction
import org.apache.flink.util.Collector
import org.apache.flink.api.common.functions.CoGroupFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows

import java.text.SimpleDateFormat

// Two order streams, used to test two-stream joins.

/** Order event carrying an order id, an amount and a timestamp. */
case class OrderLogEvent1(orderId: Long, amount: Double, timeStamp: Long)

/** Order event carrying an order id, an item id and a timestamp. */
case class OrderLogEvent2(orderId: Long, itemId: Long, timeStamp: Long)

/** Join result: order id and amount paired with an item id. */
case class OrderResultEvent(orderId: Long, amount: Double, itemId: Long)


/**
 * Demo job that runs a window join, a coGroup-based left join and an
 * interval join over two small in-memory order streams and prints the results.
 */
object Join {

  /**
   * Builds the two keyed order streams, wires up the three joins and executes the job.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Function-scope import so this block brings its own dependency into scope.
    import org.apache.flink.streaming.api.TimeCharacteristic

    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // The joins below use event-time windows and intervalJoin. Flink releases
    // before 1.12 default to processing time, under which intervalJoin is
    // rejected, so event time is selected explicitly. (The call is deprecated,
    // and a no-op in effect, on releases where event time is already the default.)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    env.setParallelism(1)

    // Left stream: order amounts, timestamped from the event's timeStamp field
    // with 5 seconds of allowed out-of-orderness, keyed by orderId.
    val leftOrderStream = env.fromCollection(List(
      OrderLogEvent1(1L, 22.1, getTime("2020-04-29 13:01")),
      OrderLogEvent1(2L, 22.2, getTime("2020-04-29 13:03")),
      OrderLogEvent1(4L, 22.3, getTime("2020-04-29 13:04")),
      OrderLogEvent1(4L, 22.4, getTime("2020-04-29 13:05")),
      OrderLogEvent1(5L, 22.5, getTime("2020-04-29 13:07")),
      OrderLogEvent1(6L, 22.6, getTime("2020-04-29 13:09"))
    ))
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[OrderLogEvent1](Time.seconds(5)) {
        override def extractTimestamp(element: OrderLogEvent1): Long = element.timeStamp
      })
      .keyBy(_.orderId)

    // Right stream: order items, timestamped and keyed the same way.
    val rightOrderStream = env.fromCollection(List(
      OrderLogEvent2(1L, 121, getTime("2020-04-29 13:01")),
      OrderLogEvent2(2L, 122, getTime("2020-04-29 13:03")),
      OrderLogEvent2(3L, 123, getTime("2020-04-29 13:04")),
      OrderLogEvent2(4L, 124, getTime("2020-04-29 13:05")),
      OrderLogEvent2(5L, 125, getTime("2020-04-29 13:07")),
      OrderLogEvent2(7L, 126, getTime("2020-04-29 13:09"))
    ))
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[OrderLogEvent2](Time.seconds(5)) {
        override def extractTimestamp(element: OrderLogEvent2): Long = element.timeStamp
      })
      .keyBy(_.orderId)

    // 1) Window join on orderId.
    leftOrderStream
      .join(rightOrderStream)
      .where(_.orderId)
      .equalTo(_.orderId)
      .window(TumblingEventTimeWindows.of(Time.minutes(5))) // 5-minute tumbling event-time window
      .apply(new InnerWindowJoinFunction)
      .print()

    // 2) coGroup on orderId; LeftWindowJoinFunction also emits unmatched left elements.
    leftOrderStream
      .coGroup(rightOrderStream)
      .where(_.orderId)
      .equalTo(_.orderId)
      .window(TumblingEventTimeWindows.of(Time.minutes(5))) // 5-minute tumbling event-time window
      .apply(new LeftWindowJoinFunction)
      .print()

    // 3) Interval join with bounds of -2 and +2 minutes.
    leftOrderStream
      .intervalJoin(rightOrderStream)
      .between(Time.minutes(-2),Time.minutes(2))
      .process(new IntervalJoinFunction)
      .print()

    env.execute("point_check")
  }

  /**
   * Parses a `yyyy-MM-dd HH:mm` string into epoch milliseconds.
   *
   * The formatter is created without an explicit time zone, so the JVM
   * default zone applies.
   *
   * @param tm date-time text in `yyyy-MM-dd HH:mm` format
   * @return the parsed instant as returned by `java.util.Date#getTime`
   */
  def getTime(tm:String): Long =
    new SimpleDateFormat("yyyy-MM-dd HH:mm").parse(tm).getTime

}

/**
 * Join function used by the window join in `Join.main`.
 *
 * Combines the order id and amount of the first event with the item id of
 * the second event.
 */
class InnerWindowJoinFunction extends JoinFunction[OrderLogEvent1, OrderLogEvent2, OrderResultEvent] {
  override def join(first: OrderLogEvent1, second: OrderLogEvent2): OrderResultEvent =
    OrderResultEvent(first.orderId, first.amount, second.itemId)
}


/**
 * CoGroup function giving left-join results.
 *
 * For every event in `first` it emits one result per event in `second`;
 * when `second` is empty it emits a single result whose itemId is the
 * default value `0L`.
 */
class LeftWindowJoinFunction extends CoGroupFunction[OrderLogEvent1,OrderLogEvent2,OrderResultEvent]{
  override def coGroup(first: java.lang.Iterable[OrderLogEvent1],
                       second: java.lang.Iterable[OrderLogEvent2],
                       out: Collector[OrderResultEvent]): Unit = {
    // Convert the Java Iterables into Scala Lists.
    import scala.collection.JavaConverters._
    val lefts = first.asScala.toList
    val rights = second.asScala.toList

    lefts.foreach { left =>
      if (rights.isEmpty) {
        // Nothing on the right side for this left event: emit the default itemId 0L.
        out.collect(OrderResultEvent(left.orderId, left.amount, 0L))
      } else {
        rights.foreach { right =>
          out.collect(OrderResultEvent(left.orderId, left.amount, right.itemId))
        }
      }
    }
  }
}


/**
 * Process function used by the interval join in `Join.main`.
 *
 * Emits one result per (left, right) pair it is handed: the left event's
 * order id and amount together with the right event's item id.
 */
class IntervalJoinFunction extends ProcessJoinFunction[OrderLogEvent1,OrderLogEvent2,OrderResultEvent]{
  override def processElement(left: OrderLogEvent1,
                              right: OrderLogEvent2,
                              ctx: ProcessJoinFunction[OrderLogEvent1, OrderLogEvent2, OrderResultEvent]#Context,
                              out: Collector[OrderResultEvent]): Unit = {
    val joined = OrderResultEvent(left.orderId, left.amount, right.itemId)
    out.collect(joined)
  }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值