Flink process 函数(底层 API;在窗口上使用 process 可以获取到窗口的时间)

1. process

package com.wt.flink.core

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

/**
 * Demo: applying the low-level `ProcessFunction` directly to a `DataStream`.
 *
 * Reads comma-separated lines from a socket and, for every line that has at
 * least five fields, emits `(fifthField, 1)` downstream.
 */
object Demo7Process {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    env.setParallelism(1)

    val linesDS: DataStream[String] = env.socketTextStream("master", 8888)

    // process handles one record at a time and may emit zero, one or many
    // records per input, so it can stand in for map, filter and flatMap.
    val procesDS: DataStream[(String, Int)] = linesDS
      .process(new ProcessFunction[String, (String, Int)] {

        // open/close may be overridden as well; one-time initialisation code
        // belongs in open. Here they only delegate to the parent.
        override def open(parameters: Configuration): Unit = super.open(parameters)

        override def close(): Unit = super.close()

        /**
         * Called once per record of the stream; may emit any number of records.
         *
         * @param value   one input line
         * @param context context object of the process function (unused here)
         * @param out     collector used to send records downstream
         */
        override def processElement(value: String,
                                    context: ProcessFunction[String, (String, Int)]#Context,
                                    out: Collector[(String, Int)]): Unit = {

          // A line with fewer than five comma-separated fields has no index 4;
          // skip it instead of failing the job with ArrayIndexOutOfBoundsException.
          value.split(",").lift(4).foreach { clazz =>
            // Send the record downstream.
            out.collect((clazz, 1))
          }
        }
      })
    procesDS.print()
    env.execute()
  }
}

KeyByProcess

package com.wt.flink.core

import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

import scala.collection.mutable

/**
 * Demo: using `KeyedProcessFunction` after `keyBy` to keep a running word count.
 *
 * Reads comma-separated words from a socket, keys the stream by word and emits
 * `(word, runningCount)` for every incoming word.
 */
object Demo8KeyByPrecess {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    val linesDS: DataStream[String] = env.socketTextStream("master", 8888)

    val wordsDS: DataStream[String] = linesDS.flatMap(_.split(","))

    val kvDS: DataStream[(String, Int)] = wordsDS.map((_, 1))

    // keyBy routes all records with the same key to the same task.
    val keyByDS: KeyedStream[(String, Int), String] = kvDS.keyBy(_._1)

    val countDS: DataStream[(String, Int)] = keyByDS
      .process(new KeyedProcessFunction[String, (String, Int), (String, Int)] {
        // A plain field such as `var count: Int = 0` would be one variable
        // shared by every key handled by this function instance, so the counts
        // are kept per word in a map instead.
        // NOTE(review): this map is an ordinary field of the function object,
        // not Flink managed state; presumably it is not checkpointed and is
        // lost on restart - confirm before reusing outside a demo.
        val countMap: mutable.Map[String, Int] = new mutable.HashMap[String, Int]()

        /**
         * Called once per record; may emit any number of records.
         *
         * @param value one `(word, 1)` record
         * @param ctx   context object; provides the key of the current record
         * @param out   collector used to send records downstream
         */
        override def processElement(value: (String, Int),
                                    ctx: KeyedProcessFunction[String, (String, Int), (String, Int)]#Context,
                                    out: Collector[(String, Int)]): Unit = {
          // Key of the record being processed.
          val key: String = ctx.getCurrentKey

          // Previous count for this word (0 when unseen) plus the current occurrence.
          val count: Int = countMap.getOrElse(key, 0) + 1

          // Store the updated count.
          countMap.put(key, count)

          // Emit an explicit tuple rather than relying on argument auto-tupling.
          out.collect((key, count))
        }
      })
    countDS.print()
    env.execute()
  }
}

WindowProcess

package com.wt.flink.core

import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.TumblingProcessingTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

/**
 * Demo: using `ProcessWindowFunction` after a window to count words per window.
 *
 * Reads comma-separated words from a socket, keys by word, groups them into
 * 5-second tumbling processing-time windows and emits
 * `(word, windowStart, windowEnd, count)` for each key and window.
 */
object Demo9WindowProcess {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    env.setParallelism(1)

    val linesDS: DataStream[String] = env.socketTextStream("master", 8888)

    val wordsDS: DataStream[String] = linesDS.flatMap(_.split(','))

    // Group by word.
    val keyByDS: KeyedStream[String, String] = wordsDS.keyBy(word => word)

    // Count the words of the most recent 5 seconds.
    val windowDS: WindowedStream[String, String, TimeWindow] = keyByDS
      .window(TumblingProcessingTimeWindows.of(Time.seconds(5)))

    // process after a window receives the data of one whole window per call.
    val countDS: DataStream[(String, Long, Long, Int)] = windowDS.process(
      new ProcessWindowFunction[String, (String, Long, Long, Int), String, TimeWindow] {
        /**
         * Invoked once per key and window.
         *
         * @param key      the key (word) of this window
         * @param context  context object; gives access to the window and thus
         *                 its start and end time
         * @param elements all records that fell into the window
         * @param out      collector used to send records downstream
         */
        override def process(key: String,
                             context: Context,
                             elements: Iterable[String],
                             out: Collector[(String, Long, Long, Int)]): Unit = {

          // Number of occurrences of the word in this window.
          val count: Int = elements.size
          // Start and end time of the window.
          val window: TimeWindow = context.window
          val start: Long = window.getStart
          val end: Long = window.getEnd

          // Emit an explicit tuple rather than relying on argument auto-tupling.
          out.collect((key, start, end, count))
        }
      }
    )
    countDS.print()

    env.execute()

  }
}

在窗口上使用 process(ProcessWindowFunction)可以获取到窗口的开始和结束时间。下面把它与"小汽车"一题结合起来,统计每个卡口在各个时间窗口内的车流量,并输出窗口的起止时间。

package com.wt.flink.core

import com.alibaba.fastjson.{JSON, JSONObject}
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.connector.kafka.source.KafkaSource
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import java.time.Duration

/**
 * Demo: per-checkpoint ("card") traffic flow over sliding event-time windows.
 *
 * Reads JSON records from the Kafka topic `cars`, extracts the `card` and
 * `time` fields, assigns event-time timestamps and watermarks, and emits
 * `(card, windowStart, windowEnd, flow)` for every card and window.
 */
object Demo10CarTime {

  /**
   * Extracts `(card, eventTimeMillis)` from one JSON line.
   *
   * Returns `None` when the line is not parseable JSON or when `card` or
   * `time` is missing or not convertible to a number, so that one bad record
   * cannot fail the job with an exception inside the operator.
   */
  private def parseCardAndTime(line: String): Option[(Long, Long)] =
    scala.util.Try {
      // Turn the string into a JSON object.
      val jsonObj: JSONObject = JSON.parseObject(line)
      for {
        // getLong returns a boxed value that is null for an absent field;
        // wrapping it in Option avoids a NullPointerException on unboxing.
        card <- Option(jsonObj.getLong("card"))
        time <- Option(jsonObj.getLong("time"))
        // The timestamp assigner below expects milliseconds, hence * 1000.
      } yield (card.longValue(), time.longValue() * 1000L)
    }.toOption.flatten

  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

    env.setParallelism(1)

    // Read the data.
    val source: KafkaSource[String] = KafkaSource
      .builder[String]
      .setBootstrapServers("master:9092,node1:9092,node2:9092") // Kafka broker list
      .setTopics("cars") // topic to read
      .setGroupId("asdasdasd") // consumer group id
      .setStartingOffsets(OffsetsInitializer.latest()) // start position: earliest = all data, latest = only new data
      .setValueOnlyDeserializer(new SimpleStringSchema()) // value deserializer
      .build

    // Use the Kafka source; watermarks are assigned further below.
    val carsDS: DataStream[String] = env.fromSource(source, WatermarkStrategy.noWatermarks(), "Kafka Source")

    // Parse the data; records that cannot be parsed are skipped.
    val cardAndTimeDS: DataStream[(Long, Long)] =
      carsDS.flatMap((line: String) => parseCardAndTime(line).toList)

    // Add watermarks and the event-time field.
    val assDS: DataStream[(Long, Long)] = cardAndTimeDS.assignTimestampsAndWatermarks(
      WatermarkStrategy
        // Watermark generation strategy: bounded out-of-orderness of 5 seconds.
        .forBoundedOutOfOrderness(Duration.ofSeconds(5))
        // Which field carries the event time.
        .withTimestampAssigner(new SerializableTimestampAssigner[(Long, Long)] {
          override def extractTimestamp(element: (Long, Long), recordTimestamp: Long): Long = {
            // The second tuple field is the event time in milliseconds.
            element._2
          }
        })
    )

    // One (card, 1) record per passing car.
    val kvDS: DataStream[(Long, Int)] = assDS.map(kv => (kv._1, 1))

    // Group by card.
    val keyBYDS: KeyedStream[(Long, Int), Long] = kvDS.keyBy(_._1)

    // Open the window: 15-minute windows sliding every 4 minutes.
    val windowDS: WindowedStream[(Long, Int), Long, TimeWindow] = keyBYDS
      .window(SlidingEventTimeWindows.of(Time.minutes(15), Time.minutes(4)))

    val cardFlowDS: DataStream[(Long, Long, Long, Int)] = windowDS
      .process(new ProcessWindowFunction[(Long, Int), (Long, Long, Long, Int), Long, TimeWindow] {
        /**
         * Invoked once per key and window.
         *
         * @param key      the card
         * @param context  context object; gives access to the window
         * @param elements all records that fell into the window
         * @param out      collector used to send records downstream
         */
        override def process(key: Long,
                             context: Context,
                             elements: Iterable[(Long, Int)],
                             out: Collector[(Long, Long, Long, Int)]): Unit = {

          // Traffic flow: number of records in the window.
          val flow: Int = elements.size

          // Start and end time of the window.
          val window: TimeWindow = context.window
          val start: Long = window.getStart
          val end: Long = window.getEnd

          // Send the result downstream.
          out.collect((key, start, end, flow))
        }
      })
    cardFlowDS.print()
    env.execute()
  }
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值