chapter06

Aggregate function (AggregateFunctionTest)

package com.liao.chapter06

import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time

object AggregateFunctionTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env.addSource(new ClickSource)
      .assignAscendingTimestamps(_.timestamp)

    // Count PV and UV, and emit the ratio PV/UV (average clicks per user)
    stream.keyBy(data => true)
      .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(2)))
      .aggregate(new PvUv)
      .print()

    env.execute()

  }

  // Custom aggregate function: a (Long, Set[String]) tuple holds the intermediate (PV, UV) state
  class PvUv extends AggregateFunction[Event, (Long, Set[String]), Double] {
    override def createAccumulator(): (Long, Set[String]) = (0L, Set[String]())

    // add() is called for every incoming record and folds it into the accumulator
    override def add(in: Event, acc: (Long, Set[String])): (Long, Set[String]) = (acc._1 + 1, acc._2 + in.user)

    // Return the final result: PV divided by UV
    override def getResult(acc: (Long, Set[String])): Double = acc._1.toDouble / acc._2.size

    // merge() is only needed for merging (session) windows: sum the counts and union the user sets
    override def merge(acc: (Long, Set[String]), acc1: (Long, Set[String])): (Long, Set[String]) =
      (acc._1 + acc1._1, acc._2 ++ acc1._2)
  }
}
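
The example above uses the legacy setStreamTimeCharacteristic / assignAscendingTimestamps calls. As a minimal sketch, assuming Flink 1.12 or later (where event time is the default and these calls are deprecated; the object name is illustrative), the same ascending-timestamp assignment can be expressed with the WatermarkStrategy API:

package com.liao.chapter06

import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.streaming.api.scala._

object AscendingWatermarkSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // Monotonously increasing timestamps: the watermark follows the max timestamp seen so far
    val stream = env.addSource(new ClickSource)
      .assignTimestampsAndWatermarks(
        WatermarkStrategy
          .forMonotonousTimestamps[Event]()
          .withTimestampAssigner(new SerializableTimestampAssigner[Event] {
            override def extractTimestamp(element: Event, recordTimestamp: Long): Long = element.timestamp
          })
      )

    stream.print()
    env.execute()
  }
}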

Full window function (FullWindowFunctionTest)

package com.liao.chapter06

import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

object FullWindowFunctionTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env.addSource(new ClickSource)
      .assignAscendingTimestamps(_.timestamp)

    // Test the full window function: count UV
    stream.keyBy(data => "key")
      .window(TumblingEventTimeWindows.of(Time.seconds(10)))
      .process(new UvCountByWindow)
      .print()

    env.execute()



  }

  // Custom ProcessWindowFunction implementation
  class UvCountByWindow extends ProcessWindowFunction[Event, String, String, TimeWindow] {
    override def process(key: String, context: Context, elements: Iterable[Event], out: Collector[String]): Unit = {
      // Use a Set to deduplicate users
      var userSet = Set[String]()

      // Add every element's user to the Set
      elements.foreach(userSet += _.user)
      val uv = userSet.size
      // Extract the window metadata and wrap the result into a String
      val windowEnd = context.window.getEnd
      val windowStart = context.window.getStart

      out.collect(s"Window $windowStart ~ $windowEnd UV: $uv")

    }
  }
}
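
UvCountByWindow buffers every event of the window before deduplicating. The same UV statistic can also be computed incrementally while still getting the window metadata, by pairing an AggregateFunction with a ProcessWindowFunction, the combination used again in UrlViewCountExample below. A minimal sketch (class and object names are illustrative):

package com.liao.chapter06

import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

object UvCountCombinedSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env.addSource(new ClickSource)
      .assignAscendingTimestamps(_.timestamp)

    stream.keyBy(data => "key")
      .window(TumblingEventTimeWindows.of(Time.seconds(10)))
      // UvAgg deduplicates incrementally; UvResult only attaches the window metadata
      .aggregate(new UvAgg, new UvResult)
      .print()

    env.execute()
  }

  // Incremental UV: the accumulator is the set of distinct users
  class UvAgg extends AggregateFunction[Event, Set[String], Long] {
    override def createAccumulator(): Set[String] = Set[String]()
    override def add(in: Event, acc: Set[String]): Set[String] = acc + in.user
    override def getResult(acc: Set[String]): Long = acc.size
    override def merge(acc: Set[String], acc1: Set[String]): Set[String] = acc ++ acc1
  }

  // Receives the single pre-aggregated UV value and wraps it with window info
  class UvResult extends ProcessWindowFunction[Long, String, String, TimeWindow] {
    override def process(key: String, context: Context, elements: Iterable[Long], out: Collector[String]): Unit = {
      val uv = elements.iterator.next()
      out.collect(s"Window ${context.window.getStart} ~ ${context.window.getEnd} UV: $uv")
    }
  }
}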

ProcessLateDataExample

package com.liao.chapter06

import com.liao.chapter05.Event
import com.liao.chapter06.UrlViewCountExample.{UrlViewCountAgg, UrlViewCountResult}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.{SlidingEventTimeWindows, TumblingEventTimeWindows}
import org.apache.flink.streaming.api.windowing.time.Time

object ProcessLateDataExample {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env.socketTextStream("hadoop002", 7777)
      .map(data => {
        val fields = data.split(",")
        Event(fields(0).trim, fields(1).trim, fields(2).trim.toLong)
      })
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Event](Time.seconds(5)) {
        override def extractTimestamp(t: Event): Long = t.timestamp
      })

    // Define an output tag for the side-output stream that will hold late data
    val outputTag = OutputTag[Event]("late-data")

    // Combine an incremental aggregate function with a full window function to enrich the statistics
    val result = stream.keyBy(_.url)
      .window(TumblingEventTimeWindows.of(Time.seconds(10)))
      // Extra time the window stays open and keeps updating after the watermark passes its end
      .allowedLateness(Time.minutes(1))
      // Data arriving after the allowed lateness goes to the side output
      .sideOutputLateData(outputTag)
      .aggregate(new UrlViewCountAgg, new UrlViewCountResult)

    result.print("result")

    stream.print("input")

    // Print the late data collected in the side-output stream
    result.getSideOutput(outputTag).print("late data")

    env.execute()
  }
}
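
This example layers three mechanisms for out-of-order and late data: the 5-second watermark delay, the 1-minute allowed lateness, and the side output. For a tumbling window covering [0 s, 10 s): the window first fires once the watermark passes 10 s, which here means an event with timestamp 15 s (window end plus the 5 s delay) has arrived; for the following minute the window stays open and re-emits an updated result whenever a late event belonging to it arrives; only events that arrive after the watermark passes 70 s end up in the late-data side output.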

UrlViewCountExample

package com.liao.chapter06

import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector


// Result type for the per-URL statistics
case class UrlViewCount(url: String, count: Long, windowStart: Long, windowEnd: Long)

object UrlViewCountExample {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env.addSource(new ClickSource)
      .assignAscendingTimestamps(_.timestamp)

    // Combine an incremental aggregate function with a full window function to enrich the statistics
    stream.keyBy(_.url)
      .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
      .aggregate(new UrlViewCountAgg, new UrlViewCountResult)
      .print()

    env.execute()
  }

  // Custom incremental aggregate function: add 1 for every incoming record
  class UrlViewCountAgg extends AggregateFunction[Event, Long, Long] {
    override def createAccumulator(): Long = 0L

    override def add(in: Event, acc: Long): Long = acc + 1

    override def getResult(acc: Long): Long = acc

    // Only needed for merging (session) windows: sum the two partial counts
    override def merge(acc: Long, acc1: Long): Long = acc + acc1
  }

  // Full window function that wraps the pre-aggregated count with the window metadata
  class UrlViewCountResult extends ProcessWindowFunction[Long, UrlViewCount, String, TimeWindow] {
    override def process(url: String, context: Context, elements: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
      // The iterator holds exactly one element: the pre-aggregated count
      val count = elements.iterator.next()
      val start = context.window.getStart
      val end = context.window.getEnd

      // Emit the assembled result
      out.collect(UrlViewCount(url, count, start, end))
    }


  }

}

WatermarkTest (watermark)

package com.liao.chapter06

import java.lang

import com.liao.chapter05.Event
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

object WatermarkTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    // Use event time
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
//    env.getConfig.setAutoWatermarkInterval(500L)

    val stream: DataStream[Event] = env.socketTextStream("hadoop002", 7777)
      .map(data => {
        val fields = data.split(",")
        Event(fields(0).trim, fields(1).trim, fields(2).toLong)
      })

    // 1. Ordered stream: ascending timestamps
//    stream.assignAscendingTimestamps(_.timestamp)

    // 2. Out-of-order stream: watermark with a 5-second bound
    stream.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Event](Time.seconds(5)) {
      override def extractTimestamp(t: Event): Long = t.timestamp
    })
      .keyBy(_.user)
      .window(TumblingEventTimeWindows.of(Time.seconds(10)))
      .process( new WatermarkWindowResult )
      .print()

      env.execute()
  }

  // Custom full window function
  class WatermarkWindowResult extends ProcessWindowFunction[Event, String, String, TimeWindow] {
    override def process(user: String, context: Context, elements: Iterable[Event], out: Collector[String]): Unit = {
      // Extract window metadata
      val start = context.window.getStart
      val end = context.window.getEnd
      val count = elements.size

      // Current watermark at the moment the window fires
      val currentWatermark = context.currentWatermark

      out.collect(s"Window $start ~ $end, user $user activity: $count, current watermark: $currentWatermark")

    }
  }
}
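
For Flink 1.12 and later, the same 5-second out-of-orderness bound is expressed with WatermarkStrategy.forBoundedOutOfOrderness, which replaces BoundedOutOfOrdernessTimestampExtractor. A minimal sketch assuming the same comma-separated socket input (the object name is illustrative):

package com.liao.chapter06

import java.time.Duration

import com.liao.chapter05.Event
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.streaming.api.scala._

object BoundedOutOfOrdernessSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val stream: DataStream[Event] = env.socketTextStream("hadoop002", 7777)
      .map(data => {
        val fields = data.split(",")
        Event(fields(0).trim, fields(1).trim, fields(2).toLong)
      })
      // The watermark lags the max seen timestamp by 5 seconds, tolerating that much disorder
      .assignTimestampsAndWatermarks(
        WatermarkStrategy
          .forBoundedOutOfOrderness[Event](Duration.ofSeconds(5))
          .withTimestampAssigner(new SerializableTimestampAssigner[Event] {
            override def extractTimestamp(element: Event, recordTimestamp: Long): Long = element.timestamp
          })
      )

    stream.print()
    env.execute()
  }
}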

WindowTest

package com.liao.chapter06


import com.liao.chapter05.Event
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.{EventTimeSessionWindows, SlidingEventTimeWindows, TumblingEventTimeWindows}
import org.apache.flink.streaming.api.windowing.time.Time

object WindowTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read the data source
//    val dataStream:DataStream[String]= env.readTextFile("./input/clicks.txt")

    val lineDataStream = env.socketTextStream("hadoop002", 7777)

    // First convert to the case-class type (a simple map)
    val stream = lineDataStream.map(data => {
      val arr = data.split(",")
      Event(arr(0), arr(1), arr(2).toLong)
    })
//      .assignAscendingTimestamps(_.timestamp)   // extract timestamps from ascending (ordered) data
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Event](Time.milliseconds(30)) {
        override def extractTimestamp(t: Event): Long = t.timestamp * 1000L
      })
    // Every 15 seconds, compute the minimum click timestamp per user within the window
    val resultStream = stream
      .map( data => (data.user,data.timestamp) )
      /*.keyBy(data => data._1)*/
      .keyBy(_._1)    // key by the first element of the tuple
//      .window( TumblingEventTimeWindows.of(Time.seconds(15)) )    // tumbling event-time window
//      .window( SlidingEventTimeWindows.of(Time.seconds(15), Time.seconds(3)) )   // sliding event-time window
//      .window( EventTimeSessionWindows.withGap(Time.seconds(10)) )  // session window
//      .countWindow(10)    // tumbling count window
      .timeWindow(Time.seconds(15))
      .allowedLateness(Time.minutes(1))
      .sideOutputLateData(new OutputTag[(String, Long)]("late"))
      .minBy(1)

    resultStream.print()

    env.execute("window test")


  }
}
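
Note that the OutputTag passed to sideOutputLateData above is created inline, so no reference is kept and the late data can never be read back. A minimal sketch of the same pipeline section keeping the tag in a variable so the side output can be printed (variable names are illustrative):

    val lateTag = new OutputTag[(String, Long)]("late")

    val resultStream = stream
      .map(data => (data.user, data.timestamp))
      .keyBy(_._1)
      .timeWindow(Time.seconds(15))
      .allowedLateness(Time.minutes(1))
      .sideOutputLateData(lateTag)
      .minBy(1)

    // The late events remain accessible through the same tag instance
    resultStream.getSideOutput(lateTag).print("late")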

WindowTest2

package com.liao.chapter06

import com.liao.chapter05.{ClickSource, Event}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.assigners.{EventTimeSessionWindows, SlidingEventTimeWindows, TumblingEventTimeWindows, TumblingProcessingTimeWindows}
import org.apache.flink.streaming.api.windowing.time.Time

object WindowTest2 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env.addSource(new ClickSource)
      //      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[Event](Time) {
      //        override def extractTimestamp(t: Event): Long = ???
      //      })
      .assignAscendingTimestamps(_.timestamp)

    stream.map(data => (data.user, 1))
      .keyBy(_._1)
      .window( TumblingEventTimeWindows.of(Time.seconds(5)) )  // event-time tumbling window
//      .window( TumblingProcessingTimeWindows.of(Time.days(1), Time.hours(-8)) ) // processing-time tumbling window (1-day windows shifted by -8 h for UTC+8)
//      .window( SlidingEventTimeWindows.of(Time.hours(1), Time.minutes(10)) ) // event-time sliding window
//      .window( EventTimeSessionWindows.withGap(Time.seconds(10)) )   // event-time session window
//      .countWindow(10)  // tumbling count window
//      .countWindow(10, 2)  // sliding count window
      .reduce((state, data) => (data._1, state._2 + data._2))
      .print()

    env.execute()


  }
}
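
The reduce here is an incremental aggregation: Flink keeps one rolling (user, count) pair per key and window and folds each new element in as it arrives. For example, three clicks by the same user inside one 5-second window evolve the state (user, 1) -> (user, 2) -> (user, 3), and only the final (user, 3) is emitted when the window fires.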
