20-10-flink项目

---01---

窗口长度为一小时、每秒滑动一次的话,一个数据就可能同时属于3600个窗口。

代码:

package com.atguigu.market_analysis

import java.sql.Timestamp
import java.util.UUID

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.util.Random

/**
  * Copyright (c) 2018-2028 尚硅谷 All Rights Reserved 
  *
  * Project: UserBehaviorAnalysis
  * Package: com.atguigu.market_analysis
  * Version: 1.0
  *
  * Created by wushengran on 2020/4/28 9:21
  */

// Input event sample class: one marketing user-behavior record
// (userId, behavior type, promotion channel, event timestamp in ms).
case class MarketUserBehavior( userId: String, behavior: String, channel: String, timestamp: Long )
// Output sample class: per-window count for one (channel, behavior) pair,
// with window start/end rendered as strings.
case class MarketCount( windowStart: String, windowEnd: String, channel: String, behavior: String, count: Long )

// Custom test data source that emits simulated marketing user-behavior events.
class SimulateMarketEventSource() extends RichParallelSourceFunction[MarketUserBehavior]{
  // Running flag. @volatile is required: Flink invokes cancel() from a different
  // thread than the one executing run(), and without volatile the write in
  // cancel() may never become visible to the emit loop, so the source would
  // keep running after cancellation.
  @volatile var running: Boolean = true
  // Candidate user behaviors to sample from
  val behaviorSet: Seq[String] = Seq("CLICK", "DOWNLOAD", "INSTALL", "UNINSTALL")
  // Candidate promotion channels to sample from
  val channelSet: Seq[String] = Seq("appstore", "huaweiStore", "weibo", "wechat")
  // Random generator (scala.util.Random singleton)
  val rand: Random = Random

  override def cancel(): Unit = running = false

  override def run(ctx: SourceFunction.SourceContext[MarketUserBehavior]): Unit = {
    // Upper bound on emitted records; lower this to cap test data volume.
    val maxCounts = Long.MaxValue
    var count = 0L
    // Emit one random event roughly every 50 ms until cancelled or capped.
    while( running && count < maxCounts ){
      val id = UUID.randomUUID().toString
      val behavior = behaviorSet(rand.nextInt(behaviorSet.size))
      val channel = channelSet(rand.nextInt(channelSet.size))
      // Processing-clock timestamp; downstream treats it as the event time.
      val ts = System.currentTimeMillis()
      ctx.collect(MarketUserBehavior(id, behavior, channel, ts))
      count += 1
      Thread.sleep(50L)
    }
  }
}

object AppMarketingByChannel {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Simulated source; its timestamps are monotonically increasing, so the
    // ascending-timestamp assigner is safe here.
    val marketEvents: DataStream[MarketUserBehavior] =
      env.addSource(new SimulateMarketEventSource())
        .assignAscendingTimestamps(_.timestamp)

    // Drop uninstall events, key by (channel, behavior), then count each
    // 1-hour window sliding every 5 seconds with a full-window function.
    val channelCounts: DataStream[MarketCount] = marketEvents
      .filter(event => event.behavior != "UNINSTALL")
      .keyBy(event => (event.channel, event.behavior))
      .timeWindow(Time.hours(1), Time.seconds(5))
      .process(new MarketCountByChannel())

    channelCounts.print()
    env.execute("market count by channel job")
  }
}

// Custom ProcessWindowFunction. Type parameters: input element, output record,
// key type (channel, behavior), and window type (TimeWindow).
// Emits one MarketCount per key per window, counting all buffered elements.
class MarketCountByChannel() extends ProcessWindowFunction[MarketUserBehavior, MarketCount, (String, String), TimeWindow]{
  override def process(key: (String, String), context: Context, elements: Iterable[MarketUserBehavior], out: Collector[MarketCount]): Unit = {
    val (channel, behavior) = key
    val window = context.window
    out.collect(MarketCount(
      new Timestamp(window.getStart).toString,
      new Timestamp(window.getEnd).toString,
      channel,
      behavior,
      elements.size
    ))
  }
}

---02---

aggregateFunction和reduceFunction都是增量的聚合函数。

aggregateFunction的输入和输出类型是可以不一样的。

前面做了预聚合,后面再传入的就是全窗口函数。

apply方法传入的是windowFunction。

还有一个特殊的process方法,传入的processWindowFunction。

增量聚合函数:

aggregateFunction reduceFunction

全窗口函数:

windowFunction processWindowFunction

---03---

 

---04---

 

---04---

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值