代码
未优化:
package com.zjc.flow_analysis
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import java.lang
// One record of the user-behavior CSV input.
// NOTE(review): timestamp appears to be epoch seconds — downstream multiplies
// by 1000L before assigning event-time timestamps; confirm against the data file.
case class UserBehavior(userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long)
// Aggregated page-view count for one window; windowEnd is the window's end timestamp (ms).
case class PvCount(windowEnd: Long, count: Long)
/**
 * Unoptimized PV-count job: every "pv" event is mapped onto the single
 * constant key "pv", so all records are routed to one subtask. This is the
 * hot-key bottleneck that PageView2 below addresses.
 */
object PageView {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Parallelism 1 keeps the demo output ordered.
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream = env.readTextFile("E:\\zjc\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")

    // Parse each CSV line; field 3 is already a String, no conversion needed.
    // Timestamps are scaled *1000L (seconds -> milliseconds) for event time.
    val dataStream = inputStream.map(line => {
      val fields = line.split(",")
      UserBehavior(fields(0).toLong, fields(1).toLong, fields(2).toInt, fields(3), fields(4).toLong)
    }).assignAscendingTimestamps(_.timestamp * 1000L)

    // Keep only page views, attach the constant key, then count them in
    // 60-minute tumbling event-time windows.
    // WARNING: the constant key concentrates all data on one subtask.
    val pvStream = dataStream.filter(_.behavior == "pv")
      .map(_ => ("pv", 1L))
      .keyBy(_._1)
      .timeWindow(Time.minutes(60))
      .aggregate(new PvCountAgg(), new PvCountResult())

    pvStream.print()
    env.execute("pv job")
  }
}
/**
 * Incremental counter used as the window pre-aggregate: the accumulator is a
 * plain Long running count.
 */
class PvCountAgg() extends AggregateFunction[(String, Long), Long, Long] {
  override def createAccumulator(): Long = 0L
  // Add the count carried by the element (in._2) instead of a hard-coded 1,
  // so the aggregate stays correct if upstream ever emits pre-counted tuples.
  // Behavior is unchanged today because upstream always emits 1L.
  override def add(in: (String, Long), acc: Long): Long = acc + in._2
  override def getResult(acc: Long): Long = acc
  // Required for merging windows (e.g. session windows).
  override def merge(acc: Long, acc1: Long): Long = acc + acc1
}
/**
 * Wraps the pre-aggregated window count together with the window's end
 * timestamp into a PvCount result record.
 */
class PvCountResult() extends WindowFunction[Long, PvCount, String, TimeWindow] {
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[PvCount]): Unit = {
    // With an incremental AggregateFunction upstream, `input` carries exactly
    // one element: the final count for this window.
    val windowCount = input.head
    out.collect(PvCount(window.getEnd, windowCount))
  }
}
不足: .map(data => ("pv", 1L)) 把所有数据都映射到同一个常量 key "pv" 上,按该 key 聚合时全部数据会被路由到同一个并行子任务,造成数据倾斜;数据量巨大时该子任务的窗口状态过大,容易 OOM。
优化:
package com.zjc.flow_analysis
import org.apache.flink.api.common.functions.{MapFunction, RichMapFunction}
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.util.Collector
import scala.util.Random
/**
 * Optimized PV-count job. Two stages:
 *   1. key by subtask index (MyMapper) so counting is spread over all
 *      parallel instances instead of one hot key;
 *   2. re-key the partial window counts by windowEnd and sum them into one
 *      total per window (AllPageCount).
 */
object PageView2 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Parallelism > 1 is the point of this version: partial counts are
    // produced on several subtasks in parallel.
    env.setParallelism(4)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val inputStream = env.readTextFile("E:\\zjc\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")

    // Parse each CSV line; field 3 is already a String, no conversion needed.
    val userBehaviorStream = inputStream.map(line => {
      val fields = line.split(",")
      UserBehavior(fields(0).toLong, fields(1).toLong, fields(2).toInt, fields(3), fields(4).toLong)
    }).assignAscendingTimestamps(_.timestamp * 1000L)

    // Stage 1: scatter by subtask index, count per 1-hour window per subtask.
    val aggStream = userBehaviorStream.filter(_.behavior == "pv")
      .map(new MyMapper())
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .aggregate(new PvCountAgg(), new PvCountResult())

    // Stage 2: gather the partial counts of the same window and total them.
    val resultStream = aggStream.keyBy(_.windowEnd)
      .process(new AllPageCount())

    resultStream.print()
    env.execute("PageView2的执行")
  }
}
// Replaces the single constant key with this instance's subtask index, so
// records are spread evenly across all parallel subtasks (no hot key).
class MyMapper() extends RichMapFunction[UserBehavior, (String, Long)] {
  // lazy: getRuntimeContext is only available once the task is running,
  // not at construction/serialization time.
  lazy val index: Long = getRuntimeContext.getIndexOfThisSubtask// index of the subtask this instance runs in
  override def map(t: UserBehavior): (String, Long) = {
    (index.toString,1L)
  }
}
// Merges the per-subtask partial counts (keyed by windowEnd) into a single
// total PV count per window, emitted when the event-time timer fires.
class AllPageCount() extends KeyedProcessFunction[Long, PvCount, PvCount] {
  // Running total for the current windowEnd key; ValueState[Long] reads as 0
  // before the first update.
  lazy val totalCountState:ValueState[Long] = getRuntimeContext.getState(new ValueStateDescriptor[Long]("total-count", classOf[Long]))
  override def processElement(i: PvCount, context: KeyedProcessFunction[Long, PvCount, PvCount]#Context, collector: Collector[PvCount]): Unit = {
    totalCountState.update(totalCountState.value() + i.count)
    // Timer at windowEnd + 1: fires once the watermark passes the window end,
    // i.e. after every partial count for this window has been received.
    // Re-registering the same timestamp per element is a no-op in Flink.
    context.timerService().registerEventTimeTimer(i.windowEnd + 1)
  }
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, PvCount, PvCount]#OnTimerContext, out: Collector[PvCount]): Unit = {
    // timestamp == windowEnd + 1, so subtract 1 to report the true window end.
    out.collect(PvCount(timestamp - 1, totalCountState.value()))
    totalCountState.clear()
  }
}
说明: 通过自定义 map(map(new MyMapper()))按子任务索引把数据打散到各并行实例,解决了单一热点 key 导致的 OOM 问题;不足之处是各并行子任务的输出顺序无法保证(输出乱序),因此还需按 windowEnd 再用 AllPageCount 汇总出每个窗口的总数。