Flink之经典电商需求

最新推荐文章于 2024-06-05 08:30:00 发布

Hi Xiu Hui

最新推荐文章于 2024-06-05 08:30:00 发布

阅读量314

点赞数

文章标签： flink

本文链接：https://blog.csdn.net/YellowXiuHui/article/details/106893211

版权

文章目录

一、热门实时商品统计
- 1.Table API实现
- 2.Flink SQL 实现
二、实时统计用户访问量
- 1.优化前的，去重占用很大空间
- 2.优化后，使用布隆过滤器过滤
三、实时对账，双流Join

一、热门实时商品统计

1.Table API实现（注：以下代码实际使用的是 DataStream API，Table/SQL 的写法见下一小节）

import java.sql.Timestamp

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.ListStateDescriptor
import org.apache.flink.api.scala.typeutils.Types
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

/**
 * Real-time "hot items" ranking: counts page views per item over a sliding
 * event-time window (1 hour long, sliding every 5 minutes) and emits the
 * top-N items of every window as a formatted string.
 *
 * Lesson learned: when the window is long and the slide is short, performance
 * drops sharply because every record belongs to many windows and is copied
 * once per window. Tumbling windows do not have this problem.
 */
object TopHotItems {
  /** One line of the input CSV; `timestamp` is in milliseconds. */
  case class UserBehavior(userId: Long,
                          itemId: Long,
                          categoryId: Long,
                          behavior: String,
                          timestamp: Long)

  /** View count of a single item inside a single window. */
  case class ItemViewCount(itemId: Long,
                           windowStart: Long,
                           windowEnd: Long,
                           count: Long)

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env.readTextFile("D:\\software\\code\\FlinkCode\\src\\main\\resources\\UserBehavior.csv")
      .map(line => {
        val arr = line.split(",")
        // The CSV stores seconds; event time is expressed in milliseconds.
        UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
      })
      .filter(_.behavior.equals("pv"))
      // Timestamps/watermarks are assigned before the stream is keyed.
      // The input is a bounded file, so the final watermark closes every
      // window that is still open when the file ends.
      .assignAscendingTimestamps(_.timestamp)
      .keyBy(_.itemId) // partition by item id
      .timeWindow(Time.hours(1), Time.minutes(5)) // sliding window
      .aggregate(new CountAgg, new WindowResult) // incremental count + window metadata
      // Re-key by window end so that all counts of the same window meet in
      // one TopN instance; watermarks keep flowing downstream.
      .keyBy(_.windowEnd)
      .process(new TopN(3))

    stream.print()
    env.execute()
  }

  /** Incremental aggregator that simply counts the records of a window. */
  class CountAgg extends AggregateFunction[UserBehavior, Long, Long] {
    override def createAccumulator(): Long = 0L

    override def add(value: UserBehavior, accumulator: Long): Long = accumulator + 1

    override def getResult(accumulator: Long): Long = accumulator

    // The original author notes that merge is only needed for session
    // windows, not for tumbling/sliding ones; it is implemented correctly anyway.
    override def merge(a: Long, b: Long): Long = a + b
  }

  /**
   * Wraps the pre-aggregated count with the key and the window bounds.
   *
   * Input: the single Long produced by [[CountAgg]] for the window.
   * Output: an [[ItemViewCount]].
   * Key type is Long because the stream is keyed by `itemId`.
   */
  class WindowResult extends ProcessWindowFunction[Long, ItemViewCount, Long, TimeWindow] {
    override def process(key: Long, context: Context, elements: Iterable[Long], out: Collector[ItemViewCount]): Unit = {
      out.collect(ItemViewCount(key, context.window.getStart, context.window.getEnd, elements.head))
    }
  }

  /**
   * Collects every [[ItemViewCount]] of one window (the stream is keyed by
   * `windowEnd`) in list state and, once the watermark passes
   * `windowEnd + 100 ms`, sorts them by count and emits the top `topSize`.
   *
   * @param topSize number of items to report per window
   */
  class TopN(val topSize: Int) extends KeyedProcessFunction[Long, ItemViewCount, String] {

    // Keyed state: only visible for the current key (the window end).
    lazy val listState = getRuntimeContext.getListState(
      new ListStateDescriptor[ItemViewCount]("list-state", Types.of[ItemViewCount])
    )

    override def processElement(value: ItemViewCount, ctx: KeyedProcessFunction[Long, ItemViewCount, String]#Context, out: Collector[String]): Unit = {
      listState.add(value)
      // Every element of the window asks for the same timer timestamp;
      // the original author notes that this does not register duplicates.
      ctx.timerService().registerEventTimeTimer(value.windowEnd + 100)
    }

    override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, ItemViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
      import scala.collection.JavaConverters._

      // List state cannot be sorted in place, so copy it out first.
      // `get()` is wrapped in Option because it may yield null when the state is empty.
      val allItems: List[ItemViewCount] =
        Option(listState.get()).map(_.asScala.toList).getOrElse(Nil)
      listState.clear()

      // Nothing buffered means nothing to report; also avoids `head` on an empty list.
      allItems.headOption.foreach { first =>
        val result = new StringBuilder
        result
          .append("===========================\n")
          .append("窗口：" + new Timestamp(first.windowStart) + " ~~~ " + new Timestamp(first.windowEnd))
          .append("\n")

        // Descending by view count, then keep the first `topSize` entries.
        allItems.sortBy(-_.count).take(topSize).zipWithIndex.foreach { case (currItem, i) =>
          result
            .append("No.")
            .append(i + 1)
            .append(":")
            .append("  商品ID = ")
            .append(currItem.itemId)
            .append("  浏览量 = ")
            .append(currItem.count)
            .append("\n")
        }

        result.append("===========================\n\n\n")
        // No Thread.sleep here: sleeping inside a timer callback blocks the
        // operator's task thread and stalls all further processing.
        out.collect(result.toString)
      }
    }
  }
}

2.Flink SQL 实现

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.api.scala._
import org.apache.flink.types.Row

/**
 * 两种问题最难调：
 * 1.内存泄漏
 * 2.锁，多线程的竞争问题
 *
 *  1.引入GC，自动回收内存
 *  2.竞争问题，引入函数式编程，所有变量不可变，这变量就无法修改，
 *     你想用，只能拷贝，就用不着锁，分布式，加机器就ok
 *
 */
/**
 * Hot-items Top-N implemented with Flink SQL on the Blink planner.
 * Reads the user-behaviour CSV, keeps page views, registers the stream as
 * table `t` and ranks items per sliding window by view count.
 */
object HotItemsSQL {
  /** One line of the input CSV; `timestamp` is in milliseconds. */
  case class UserBehavior(userId: Long,
                          itemId: Long,
                          categoryId: Long,
                          behavior: String,
                          timestamp: Long)
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    // Author's note: the source parallelism must be 1. With a higher
    // parallelism the file would be split across subtasks and watermarks
    // would be generated per split, which breaks the ordering assumption.
    env.setParallelism(1)

    // Build a streaming table environment backed by the Blink planner.
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    val tableEnv = StreamTableEnvironment.create(env, settings)

    val stream = env
      .readTextFile("D:\\software\\code\\FlinkCode\\src\\main\\resources\\UserBehavior.csv")
      .map(line => {
        val arr = line.split(",")
        // The CSV stores seconds; event time is expressed in milliseconds.
        UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
      })
      .filter(_.behavior.equals("pv"))
      // Watermarks are assigned on the single un-keyed stream, before any
      // grouping or windowing is applied.
      .assignAscendingTimestamps(_.timestamp)

    // Register a temporary view named "t"; `.rowtime` marks the event-time
    // attribute and `as 'ts` renames it.
    tableEnv.createTemporaryView("t", stream, 'itemId, 'timestamp.rowtime as 'ts)

    // Author's note: Top-N queries are only supported by the Blink planner.
    // Innermost query  : per-item count over a 1 h window sliding every 5 min.
    // Middle query     : rank the items of each window by count.
    // Outermost query  : keep the first three ranks.
    val result = tableEnv
      .sqlQuery(
        """
          SELECT *
          |FROM (
          |    SELECT *,
          |           ROW_NUMBER() OVER (PARTITION BY windowEnd ORDER BY icount DESC) as row_num
          |    FROM (
          |          SELECT itemId, count(itemId) as icount,
          |                 HOP_END(ts, INTERVAL '5' MINUTE, INTERVAL '1' HOUR) as windowEnd
          |                 FROM t GROUP BY itemId, HOP(ts, INTERVAL '5' MINUTE, INTERVAL '1' HOUR)
          |    )
          |)
          |WHERE row_num <= 3
          |""".stripMargin)

    result.toRetractStream[Row]
      // The first tuple field is false for retraction messages; keep only
      // the add messages.
      .filter(_._1)
      .print()

    env.execute()
  }
}

二、实时统计用户访问量

1.优化前的，去重占用很大空间

import java.sql.Timestamp

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

/**
 * UV (unique visitors) per one-hour tumbling window: page-view events are
 * de-duplicated by `userId`.
 *
 * This is the un-optimised variant: every user id of a window is buffered and
 * then de-duplicated with an in-memory Set, which costs a lot of memory for
 * large windows. The article's next section replaces the Set with a Bloom filter.
 */
object UV {

  /** One line of the input CSV; `timestamp` is in milliseconds. */
  case class UserBehavior(userId: Long,
                          itemId: Long,
                          categoryId: Long,
                          behavior: String,
                          timestamp: Long)

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Parse the CSV, keep page views only and assign ascending timestamps.
    val pageViews = env
      .readTextFile("D:\\software\\code\\FlinkCode\\src\\main\\resources\\UserBehavior.csv")
      .map { line =>
        val fields = line.split(",")
        UserBehavior(fields(0).toLong, fields(1).toLong, fields(2).toLong, fields(3), fields(4).toLong * 1000L)
      }
      .filter(event => event.behavior.equals("pv"))
      .assignAscendingTimestamps(event => event.timestamp)

    // Every record gets the same constant key, so there is one logical stream.
    // The author notes this is equivalent to `.timeWindowAll(Time.hours(1))`.
    val stream = pageViews
      .map(event => ("key", event.userId))
      .keyBy(pair => pair._1)
      .timeWindow(Time.hours(1))
      .process(new UvProcessFunc)

    // Only a DataStream can be printed; a WindowedStream must first be
    // reduced to a DataStream by a window function such as the one above.
    stream.print()
    env.execute()
  }

  /**
   * Full-window function: invoked with all buffered elements of the window
   * and emits the number of distinct user ids.
   */
  class UvProcessFunc extends ProcessWindowFunction[(String, Long), String, String, TimeWindow] {
    override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[String]): Unit = {
      // Collapse the buffered user ids into a Set to drop duplicates.
      val distinctUsers: Set[Long] = elements.iterator.map(_._2).toSet
      val from = new Timestamp(context.window.getStart)
      val to = new Timestamp(context.window.getEnd)
      out.collect(s"窗口${from}---${to}的UV数是：${distinctUsers.size}")
    }
  }
}

2.优化后，使用布隆过滤器过滤

优化原理：对 pv 数据按 userId 去重。
海量数据去重的常用方案之一是布隆过滤器：用 bit 数组加哈希函数代替保存全部 userId，以极小的误判率换取大幅降低的内存占用。

import java.sql.Timestamp

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis


/**
 * UV (unique visitors) per one-hour window, de-duplicated with a simplified
 * Bloom filter (a single hash function) whose bit array lives in Redis.
 *
 * Redis layout used by this job:
 *  - bitmap stored under the key `<windowEnd>`: one bit per hashed user id
 *  - hash `UvCount`, field `<windowEnd>`: running UV count of that window
 */
object vByBloomFilter {

  /** One line of the input CSV; `timestamp` is in milliseconds. */
  case class UserBehavior(userId: Long,
                          itemId: Long,
                          categoryId: Long,
                          behavior: String,
                          timestamp: Long)

  // Length of the per-window bit array; must stay a power of two for `hash`.
  private val BitmapSize: Long = 1 << 29

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    val stream = env
      .readTextFile("D:\\software\\code\\FlinkCode\\src\\main\\resources\\UserBehavior.csv")
      .map(line => {
        val arr = line.split(",")
        // The CSV stores seconds; event time is expressed in milliseconds.
        UserBehavior(arr(0).toLong, arr(1).toLong, arr(2).toLong, arr(3), arr(4).toLong * 1000L)
      })
      .filter(_.behavior.equals("pv"))
      .assignAscendingTimestamps(_.timestamp)
      // Constant key: all records end up in one logical stream.
      .map(r => ("key", r.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .trigger(new UvTrigger)
      .process(new UvProcessFunc)

    stream.print()
    env.execute()
  }

  /**
   * Fires and purges the window for every single element so that the window
   * never buffers more than one record; the de-duplication state is kept in
   * Redis instead.
   */
  class UvTrigger extends Trigger[(String, Long), TimeWindow] {
    // Called once per incoming element: evaluate the window and drop its contents.
    override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
      TriggerResult.FIRE_AND_PURGE
    }

    override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
      TriggerResult.CONTINUE
    }

    override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
      if (ctx.getCurrentWatermark >= window.getEnd) {
        val windowEnd = window.getEnd.toString
        // Short-lived connection, closed even if the lookup throws.
        val jedis = new Jedis("hadoop103", 6379)
        try {
          println((new Timestamp(windowEnd.toLong), jedis.hget("UvCount", windowEnd)))
        } finally {
          jedis.close()
        }
        // Previously this value was discarded and CONTINUE was always
        // returned; the if/else makes it the actual result.
        TriggerResult.FIRE_AND_PURGE
      } else {
        TriggerResult.CONTINUE
      }
    }

    override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = {}
  }

  /**
   * Checks each user id against the window's bitmap in Redis and increments
   * the window's UV counter the first time its bit is seen unset.
   * Emits nothing downstream; the result lives in the `UvCount` hash.
   */
  class UvProcessFunc extends ProcessWindowFunction[(String, Long), String, String, TimeWindow] {
    // Created lazily on first use; transient so that the connection is never
    // part of the serialized function.
    @transient lazy val jedis = new Jedis("hadoop103", 6379)

    // Release the Redis connection when the function is shut down.
    override def close(): Unit = {
      jedis.close()
      super.close()
    }

    override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[String]): Unit = {
      // The window end is used both as the bitmap key and as the field name
      // inside the "UvCount" hash.
      val windowEnd = context.window.getEnd.toString

      // With UvTrigger there is exactly one element per invocation; iterating
      // keeps the function correct even if that ever changes or the
      // iterable is empty.
      elements.foreach { case (_, rawUserId) =>
        // Index of this user's bit inside the bitmap.
        val idx = hash(rawUserId.toString, BitmapSize)

        // An unset bit means this user has definitely not been counted yet.
        if (!jedis.getbit(windowEnd, idx)) {
          jedis.setbit(windowEnd, idx, true)
          // Atomic server-side increment; a missing field is treated as 0.
          jedis.hincrBy("UvCount", windowEnd, 1L)
        }
      }
    }
  }

  /**
   * Simple polynomial string hash mapped onto a bit-array index.
   * Only one hash function is implemented to keep the example easy to follow.
   *
   * @param value string to hash
   * @param size  length of the bit array; the index is computed with a bit
   *              mask, so `size` must be a power of two
   * @return an index in `[0, size)`
   */
  def hash(value: String, size: Long): Long = {
    val seed = 61
    val result = value.foldLeft(0L)((acc, ch) => acc * seed + ch)
    (size - 1) & result
  }

}

三、实时对账，双流Join

import org.apache.flink.api.common.state.ValueStateDescriptor
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.co.CoProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

/**
 * Real-time reconciliation of two keyed streams: order payment events and
 * third-party payment events are matched by `orderId`. A pair is reported on
 * the main output; an event whose counterpart has not arrived by the time its
 * timer fires is reported on a side output.
 */
object TwoStreamsJoin {
  /** Payment event from the order system. */
  case class OrderEvent(orderId: String,
                        eventType: String,
                        eventTime: Long)

  /** Payment event from a third-party provider (e.g. WeChat, Alipay). */
  case class PayEvent(orderId: String,
                      eventType: String,
                      eventTime: Long)

  // Side output for order events without a matching third-party payment.
  val unmatchedOrders = new OutputTag[String]("unmatched-orders")
  // Side output for third-party payments without a matching order event.
  val unmatchedPays   = new OutputTag[String]("unmatched-pays")

  // How long (event time, ms) an event waits for its counterpart.
  private val MatchTimeoutMs: Long = 5000L

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val orders = env
      .fromElements(
        OrderEvent("order_1", "pay", 2000L),
        OrderEvent("order_2", "pay", 5000L),
        OrderEvent("order_3", "pay", 6000L)
      )
      // Assign timestamps/watermarks before keying the stream.
      .assignAscendingTimestamps(_.eventTime)
      .keyBy(_.orderId)

    val pays = env
      .fromElements(
        PayEvent("order_1", "weixin", 7000L),
        PayEvent("order_2", "weixin", 8000L),
        PayEvent("order_4", "weixin", 9000L)
      )
      // Assign timestamps/watermarks before keying the stream.
      .assignAscendingTimestamps(_.eventTime)
      .keyBy(_.orderId)

    val processed = orders
      .connect(pays)
      .process(new MatchFunction)

    processed.print()
    // Reuse the tags that MatchFunction writes to instead of re-creating them,
    // so the ids cannot drift apart.
    processed.getSideOutput(unmatchedOrders).print()
    processed.getSideOutput(unmatchedPays).print()

    env.execute()

  }


  /**
   * Keeps at most one pending order event and one pending payment event per
   * `orderId` and pairs them as soon as both are present.
   */
  class MatchFunction extends CoProcessFunction[OrderEvent, PayEvent, String] {
    // Pending order event for the current key.
    lazy val orderState = getRuntimeContext.getState(
      new ValueStateDescriptor[OrderEvent]("order-state", classOf[OrderEvent])
    )

    // Pending third-party payment event for the current key.
    lazy val payState = getRuntimeContext.getState(
      new ValueStateDescriptor[PayEvent]("pay-state", classOf[PayEvent])
    )

    // Handles the order stream.
    override def processElement1(order: OrderEvent, ctx: CoProcessFunction[OrderEvent, PayEvent, String]#Context, out: Collector[String]): Unit = {
      Option(payState.value()) match {
        case Some(pay) =>
          // The payment arrived first: both share the same orderId (the key).
          payState.clear()
          // The timeout timer registered for the payment is no longer needed.
          ctx.timerService().deleteEventTimeTimer(pay.eventTime + MatchTimeoutMs)
          out.collect("订单ID为 " + order.orderId + " 的两条流对账成功！")
        case None =>
          orderState.update(order)
          ctx.timerService().registerEventTimeTimer(order.eventTime + MatchTimeoutMs)
      }
    }

    // Handles the third-party payment stream.
    override def processElement2(pay: PayEvent, ctx: CoProcessFunction[OrderEvent, PayEvent, String]#Context, out: Collector[String]): Unit = {
      Option(orderState.value()) match {
        case Some(order) =>
          orderState.clear()
          // The timeout timer registered for the order is no longer needed.
          ctx.timerService().deleteEventTimeTimer(order.eventTime + MatchTimeoutMs)
          out.collect("订单ID为 " + pay.orderId + " 的两条流对账成功！")
        case None =>
          payState.update(pay)
          ctx.timerService().registerEventTimeTimer(pay.eventTime + MatchTimeoutMs)
      }
    }

    /**
     * Fires when a pending event has waited `MatchTimeoutMs` without a match
     * and reports it on the corresponding side output.
     *
     * @param timestamp timestamp of the firing timer
     * @param ctx       used to write to side outputs via `ctx.output`
     * @param out       main-output collector (unused here)
     */
    override def onTimer(timestamp: Long, ctx: CoProcessFunction[OrderEvent, PayEvent, String]#OnTimerContext, out: Collector[String]): Unit = {
      Option(orderState.value()).foreach { order =>
        ctx.output(unmatchedOrders, "订单ID为 " + order.orderId + " 的两条流没有对账成功！")
        orderState.clear()
      }

      Option(payState.value()).foreach { pay =>
        ctx.output(unmatchedPays, "订单ID为 " + pay.orderId + " 的两条流没有对账成功！")
        payState.clear()
      }
    }
  }

}

Hi Xiu Hui

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Flink之经典电商需求

一、热门实时商品统计1.Table API实现import java.sql.Timestampimport org.apache.flink.api.common.functions.AggregateFunctionimport org.apache.flink.api.common.state.ListStateDescriptorimport org.apache.flink.api.scala.typeutils.Typesimport org.apache.flink.stream
复制链接

扫一扫