20-08-flink项目

最新推荐文章于 2021-07-25 21:02:37 发布

nzch

最新推荐文章于 2021-07-25 21:02:37 发布

阅读量149

点赞数

分类专栏：尚硅谷大数据

本文链接：https://blog.csdn.net/qq_28764557/article/details/114490709

版权

尚硅谷大数据专栏收录该内容

32 篇文章 0 订阅

订阅专栏

批处理和流处理的关系。

代码：

---08-08---

五大模块：

统计一小时，每5min更新结果。

排序我是基于窗口各排各的。

一条数据可能属于12个窗口的。60/5=12。

聚合是没有用count的。

新的聚合的方式。

注意是滑动窗口的整数倍：

开窗之后要用 WindowFunction：只有交给 WindowFunction 处理，才能拿到并包装 window 的信息。

结合 aggregate 预聚合函数，来一条数据就聚合一次，最后把聚合结果传给 WindowFunction（单独使用 WindowFunction 需要缓存窗口内的所有数据），这样就结合了两者的优点。

元组：https://blog.csdn.net/qq_36330643/article/details/76484840

代码：关于WindowFunction:https://blog.csdn.net/weixin_38255219/article/details/106714493

package com.atguigu.hotitems_analysis

import java.sql.Timestamp
import java.util.Properties

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

/**
  * Copyright (c) 2018-2028 尚硅谷 All Rights Reserved
  *
  * Project: UserBehaviorAnalysis
  * Package: com.atguigu.hotitems_analysis
  * Version: 1.0
  *
  * Created by wushengran on 2020/4/25 15:39
  */

/**
  * Input record: one user-behavior event parsed from a comma-separated line.
  *
  * @param userId     id of the acting user
  * @param itemId     id of the item acted upon
  * @param categoryId id of the item's category
  * @param behavior   behavior label; the HotItems job keeps only "pv" records
  * @param timestamp  event time; the HotItems job multiplies it by 1000 before
  *                   using it as the event-time timestamp
  */
case class UserBehavior(
  userId: Long,
  itemId: Long,
  categoryId: Int,
  behavior: String,
  timestamp: Long
)

/**
  * Result of the per-item window aggregation.
  *
  * @param itemId    id of the item that was counted
  * @param windowEnd end timestamp of the window the count belongs to
  * @param count     number of records counted for the item in that window
  */
case class ItemViewCount(
  itemId: Long,
  windowEnd: Long,
  count: Long
)

object HotItems {
  /** Input file used when no path is passed on the command line (the original hard-coded location). */
  private val DefaultInputPath: String =
    "D:\\Projects\\BigData\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv"

  /**
    * Entry point of the "hot items" job.
    *
    * Reads user-behavior records from a text file, counts "pv" records per item over a
    * one-hour window sliding every five minutes, and prints the top 5 items of each window.
    *
    * @param args optional; `args(0)` is the path of the input CSV file. When absent,
    *             [[DefaultInputPath]] is used, so existing invocations keep working.
    */
  def main(args: Array[String]): Unit = {
    // Create the streaming execution environment.
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // Use event time as the time semantics.
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read the data from a file; the path can be overridden by the first program argument.
    val inputPath: String = args.headOption.getOrElse(DefaultInputPath)
    val inputStream: DataStream[String] = env.readTextFile(inputPath)

    // Alternative source: read the data from Kafka.
//    val properties = new Properties()
//    properties.setProperty("bootstrap.servers", "hadoop104:9092")
//   // properties.setProperty("group.id", "consumer-group")
//    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
//    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
//    properties.setProperty("auto.offset.reset", "latest")
//    val inputStream: DataStream[String] = env.addSource( new FlinkKafkaConsumer[String]("hotitems", new SimpleStringSchema(), properties) )

    // Convert every line into a UserBehavior and assign event-time timestamps.
    val dataStream: DataStream[UserBehavior] = inputStream
      .map( data => {
        // Trim each field so surrounding whitespace does not break the numeric conversions.
        val fields = data.split(",").map(_.trim)
        UserBehavior( fields(0).toLong, fields(1).toLong, fields(2).toInt, fields(3), fields(4).toLong )
      } )
      // Timestamps are treated as ascending, so no out-of-orderness delay is configured.
      .assignAscendingTimestamps(_.timestamp * 1000L)

    // Keep only "pv" records, then count them per item in a sliding window.
    val aggStream: DataStream[ItemViewCount] = dataStream
      .filter(_.behavior == "pv")
      .keyBy("itemId")    // group by itemId before windowing
      .timeWindow(Time.hours(1), Time.minutes(5))    // one-hour window, evaluated every 5 minutes
      .aggregate( new CountAgg(), new ItemCountWindowResult() )

    // Group the per-item counts by window, then sort and emit the top N of each window.
    val resultStream: DataStream[String] = aggStream
      .keyBy("windowEnd")
      .process( new TopNHotItems(5) )

    resultStream.print()

    env.execute("hot items job")
  }
}

/**
  * Incremental pre-aggregation that adds one to a counter for every incoming record.
  *
  * Type parameters of the AggregateFunction: input = UserBehavior,
  * accumulator (intermediate state) = Long, output = Long.
  */
class CountAgg() extends AggregateFunction[UserBehavior, Long, Long]{
  /** A fresh counter starts at zero. */
  override def createAccumulator(): Long = 0L

  /** The record's content is ignored; every record increments the counter by one. */
  override def add(value: UserBehavior, accumulator: Long): Long = {
    val incremented = accumulator + 1
    incremented
  }

  /** Two partial counters are combined by summing them. */
  override def merge(a: Long, b: Long): Long = {
    val combined = a + b
    combined
  }

  /** The counter itself is the final result. */
  override def getResult(accumulator: Long): Long = accumulator
}

/**
  * Extension example: an aggregate function that averages the `timestamp` field.
  *
  * The accumulator is a `(sum, count)` pair; the result is `sum / count` as a Double.
  */
class AvgAgg() extends AggregateFunction[UserBehavior, (Long, Int), Double]{
  /** Empty state: sum 0, count 0. */
  override def createAccumulator(): (Long, Int) = (0L, 0)

  /** Adds the record's timestamp to the sum and bumps the count. */
  override def add(value: UserBehavior, accumulator: (Long, Int)): (Long, Int) = {
    val runningSum = accumulator._1
    val seen = accumulator._2
    (runningSum + value.timestamp, seen + 1)
  }

  /** Merges two partial states component-wise. */
  override def merge(a: (Long, Int), b: (Long, Int)): (Long, Int) = {
    val mergedSum = a._1 + b._1
    val mergedCount = a._2 + b._2
    (mergedSum, mergedCount)
  }

  /** Divides the sum by the count, using floating-point division. */
  override def getResult(accumulator: (Long, Int)): Double = {
    val total = accumulator._1
    val samples = accumulator._2.toDouble
    total / samples
  }
}

/**
  * Window function that wraps the pre-aggregated count together with window
  * information into an ItemViewCount.
  *
  * Type parameters of the WindowFunction: input = Long (output of the pre-aggregation),
  * output = ItemViewCount, key = Tuple (the key type produced by the field-name keyBy),
  * window = TimeWindow.
  */
class ItemCountWindowResult() extends WindowFunction[Long, ItemViewCount, Tuple, TimeWindow]{
  override def apply(key: Tuple, window: TimeWindow, input: Iterable[Long], out: Collector[ItemViewCount]): Unit =
    out.collect(
      ItemViewCount(
        // The key is read as a one-field tuple holding the item id.
        itemId = key.asInstanceOf[Tuple1[Long]].f0,
        windowEnd = window.getEnd,
        // Only the first element of the input is used as the count.
        count = input.iterator.next()
      )
    )
}


/**
  * Keyed process function that buffers every ItemViewCount of the current key and,
  * when an event-time timer fires, emits the `n` entries with the highest count as a
  * formatted, human-readable String.
  *
  * HotItems applies it to a stream keyed by `windowEnd`, so each key corresponds to one window.
  *
  * @param n number of top entries to emit per timer firing
  */
class TopNHotItems(n: Int) extends KeyedProcessFunction[Tuple, ItemViewCount, String]{
  // Offset (ms) between a window's end and the timer that emits its ranking.
  // Used both when registering the timer and when recovering the window end for display,
  // so the two places cannot drift apart.
  private val timerDelayMs: Long = 100L

  // List state holding all count results received for the current key.
  lazy val itemCountListState: ListState[ItemViewCount] = getRuntimeContext.getListState(new ListStateDescriptor[ItemViewCount]("itemcount-list", classOf[ItemViewCount]))

  /**
    * Stores the incoming element in state and registers an event-time timer at
    * `windowEnd + timerDelayMs`.
    */
  override def processElement(value: ItemViewCount, ctx: KeyedProcessFunction[Tuple, ItemViewCount, String]#Context, out: Collector[String]): Unit = {
    itemCountListState.add(value)
    ctx.timerService().registerEventTimeTimer(value.windowEnd + timerDelayMs)
  }

  /**
    * When the timer fires: reads the buffered counts, sorts them by count in descending
    * order, keeps the first `n`, clears the state and emits the formatted ranking.
    *
    * @param timestamp the timer's timestamp (window end + timerDelayMs)
    */
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Tuple, ItemViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
    // Explicit converters replace the deprecated implicit scala.collection.JavaConversions.
    import scala.collection.JavaConverters._

    // Materialize the state into an immutable list BEFORE clearing it.
    // NOTE(review): ListState.get() may return null for empty state on some Flink
    // versions/backends — guarded defensively; confirm against the Flink version in use.
    val topItems: List[ItemViewCount] =
      Option(itemCountListState.get())
        .map(_.asScala.toList)
        .getOrElse(Nil)
        .sortBy(_.count)(Ordering.Long.reverse)
        .take(n)

    // The buffered counts are no longer needed once the ranking is computed.
    itemCountListState.clear()

    // Format the ranking as a String for easy monitoring.
    val result: StringBuilder = new StringBuilder
    result.append("时间：").append( new Timestamp(timestamp - timerDelayMs) ).append("\n")
    topItems.zipWithIndex.foreach { case (itemCount, idx) =>
      result.append("Top").append(idx + 1).append(":")
        .append(" 商品ID=").append(itemCount.itemId)
        .append(" 访问量=").append(itemCount.count)
        .append("\n")
    }
    result.append("==============================\n\n")

    // Throttles the output so it can be followed by eye.
    // NOTE(review): this blocks the thread running onTimer; remove outside of demos.
    Thread.sleep(1000)
    out.collect(result.toString())
  }
}

windowFunction的优点是可以统计全部的数据，缺点是存储的代价是很大的。

aggregate的优点是可以聚合，缺点是不能获取窗口信息。

---08-09---

问题：略

---08-10---

项目中一般都是滚动窗口。

按照id和窗口做了聚合。

每一个窗口内部排序的。

看下包装好的信息：

进行统计和整理，按照windowEnd分组，然后组内到齐了输出topn。

定义一个11：00+10ms的定时器，到了就关窗，提取watermark，watermark是升序的。

TopN的思路。

按照事件时间注册定时器。

从时间戳中提取出 watermark；存在多个分区时，以各分区 watermark 中最小的为基准。

---08-11---


// 自定义 KeyedProcessFunction  key是windowEnd I是前面的输出 O是输出
class TopNHotItems(n: Int) extends KeyedProcessFunction[Tuple, ItemViewCount, String] {
  // List state that stores every count result received for the current key.
  lazy val itemCountListState: ListState[ItemViewCount] = {
    val descriptor = new ListStateDescriptor[ItemViewCount]("itemcount-list", classOf[ItemViewCount])
    getRuntimeContext.getListState(descriptor)
  }

  override def processElement(value: ItemViewCount, ctx: KeyedProcessFunction[Tuple, ItemViewCount, String]#Context, out: Collector[String]): Unit = {
    // Buffer each incoming element in the list state.
    itemCountListState.add(value)

    // Register an event-time timer that fires at windowEnd + 100.
    val timers = ctx.timerService()
    val fireAt = value.windowEnd + 100
    timers.registerEventTimeTimer(fireAt)
  }

注册定时器。


  /**
    * When the timer fires: reads the buffered counts from state, sorts them by count in
    * descending order, keeps the first `n`, clears the state and emits the formatted ranking.
    *
    * @param timestamp the timer's timestamp; 100 is subtracted from it for display
    */
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Tuple, ItemViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
    // Explicit converters replace the deprecated implicit scala.collection.JavaConversions.
    import scala.collection.JavaConverters._

    // Materialize the state into an immutable list BEFORE clearing it.
    // NOTE(review): ListState.get() may return null for empty state on some Flink
    // versions/backends — guarded defensively; confirm against the Flink version in use.
    val topItems: List[ItemViewCount] =
      Option(itemCountListState.get())
        .map(_.asScala.toList)
        .getOrElse(Nil)
        .sortBy(_.count)(Ordering.Long.reverse)
        .take(n)

    // The buffered counts are no longer needed once the ranking is computed.
    itemCountListState.clear()

    // Format the ranking as a String for easy monitoring.
    val result: StringBuilder = new StringBuilder
    result.append("时间：").append(new Timestamp(timestamp - 100)).append("\n")
    topItems.zipWithIndex.foreach { case (itemCount, idx) =>
      result.append("Top").append(idx + 1).append(":")
        .append(" 商品ID=").append(itemCount.itemId)
        .append(" 访问量=").append(itemCount.count)
        .append("\n")
    }
    result.append("==============================\n\n")

    // Throttles the output so it can be followed by eye.
    // NOTE(review): this blocks the thread running onTimer; remove outside of demos.
    Thread.sleep(1000)
    out.collect(result.toString())
  }

scala的<-:https://blog.csdn.net/weixin_45577149/article/details/103293091

---08-12---

---08-13---

---08-14---

--------------------------------------------------------