Real-Time Hourly Website PV Statistics

Code

Unoptimized version:

```
package com.zjc.flow_analysis

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

case class UserBehavior(userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long)
case class PvCount(windowEnd: Long, count: Long)

object PageView {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    val inputStream = env.readTextFile("E:\\zjc\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")

    val dataStream = inputStream.map(data => {
      val dataArray = data.split(",")
      UserBehavior(dataArray(0).toLong, dataArray(1).toLong, dataArray(2).toInt, dataArray(3), dataArray(4).toLong)
    }).assignAscendingTimestamps(_.timestamp * 1000L) // timestamps are in seconds; Flink expects milliseconds

    // key every record by the constant "pv", then open a 1-hour window and aggregate
    val pvStream = dataStream.filter(_.behavior == "pv")
      .map(data => ("pv", 1L))
      .keyBy(_._1)
      .timeWindow(Time.minutes(60))
      // .sum(1)   // alternatively
      .aggregate(new PvCountAgg(), new PvCountResult())
    pvStream.print()
    env.execute("pv job")
  }
}

// incremental aggregation: count records as they arrive, keeping only a Long as window state
class PvCountAgg() extends AggregateFunction[(String, Long), Long, Long] {
  override def createAccumulator(): Long = 0L

  override def add(in: (String, Long), acc: Long): Long = acc + 1

  override def getResult(acc: Long): Long = acc

  override def merge(acc: Long, acc1: Long): Long = acc + acc1
}

// window function: attach the window end time to the pre-aggregated count
class PvCountResult() extends WindowFunction[Long, PvCount, String, TimeWindow] {
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[PvCount]): Unit = {
    out.collect(PvCount(window.getEnd, input.head))
  }
}
```
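For context, each line of UserBehavior.csv is parsed as userId, itemId, categoryId, behavior, timestamp (in seconds), comma-separated. The file itself is not shown in the post; a purely illustrative line might look like:

```
543462,1715,1464116,pv,1511658000
```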

Drawback: `.map(data => ("pv", 1L))` assigns every record the same key "pv", so the entire stream is funneled into a single parallel task for the window aggregation; with a huge data volume this skew can easily cause an OOM.

Optimized version:

```
package com.zjc.flow_analysis

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.util.Collector

object PageView2 {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(4)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    val inputStream = env.readTextFile("E:\\zjc\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")
    val userBehaviorStream = inputStream.map(data => {
      val dataArray = data.split(",")
      UserBehavior(dataArray(0).toLong, dataArray(1).toLong, dataArray(2).toInt, dataArray(3), dataArray(4).toLong)
    }).assignAscendingTimestamps(_.timestamp * 1000L)

    // first stage: scatter the data across several keys, count per key per window
    val aggStream = userBehaviorStream.filter(_.behavior == "pv")
      .map(new MyMapper())
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .aggregate(new PvCountAgg(), new PvCountResult())

    // second stage: merge the partial counts that belong to the same window
    val resultStream = aggStream.keyBy(_.windowEnd)
//      .sum("count")
      .process(new AllPageCount())
    resultStream.print()
    env.execute("PageView2 job")
  }
}

// scatter: use the subtask index as the key, so records spread over the parallel instances
class MyMapper() extends RichMapFunction[UserBehavior, (String, Long)] {
  lazy val index: Long = getRuntimeContext.getIndexOfThisSubtask // index of this parallel subtask
  override def map(t: UserBehavior): (String, Long) = {
    (index.toString, 1L)
  }
}

// merge the partial counts that share the same windowEnd
class AllPageCount() extends KeyedProcessFunction[Long, PvCount, PvCount] {
  lazy val totalCountState: ValueState[Long] = getRuntimeContext.getState(new ValueStateDescriptor[Long]("total-count", classOf[Long]))

  override def processElement(i: PvCount, context: KeyedProcessFunction[Long, PvCount, PvCount]#Context, collector: Collector[PvCount]): Unit = {
    totalCountState.update(totalCountState.value() + i.count)
    // fire just after the window end, once all partial counts for this window have arrived
    context.timerService().registerEventTimeTimer(i.windowEnd + 1)
  }

  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, PvCount, PvCount]#OnTimerContext, out: Collector[PvCount]): Unit = {
    out.collect(PvCount(timestamp - 1, totalCountState.value()))
    totalCountState.clear()
  }
}
```

Note: the custom mapper (`map(new MyMapper())`) scatters the records across multiple keys instead of the single key "pv", and `AllPageCount` then merges the per-key window counts by `windowEnd`. This spreads the load over the parallel tasks and avoids the OOM. Remaining drawback: with parallelism 4, results for different windows may be printed out of order (each window's count itself is still correct). A random-key variant is sketched below.
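One variant worth sketching (the unused `scala.util.Random` import in the original code hints at it): keying by the subtask index yields only as many distinct keys as there are upstream subtasks, and after `keyBy` those few keys may hash unevenly onto the downstream tasks. Scattering with a random key is a common alternative. A minimal sketch, where the fan-out of 4 and the class name `RandomKeyMapper` are my own illustrative choices:

```
import org.apache.flink.api.common.functions.MapFunction

import scala.util.Random

// Sketch: scatter each record onto one of 4 random keys instead of the subtask index.
// The fan-out (4) is an arbitrary illustrative value; match it to the job's parallelism.
class RandomKeyMapper() extends MapFunction[UserBehavior, (String, Long)] {
  override def map(t: UserBehavior): (String, Long) =
    (Random.nextInt(4).toString, 1L)
}
```

It would drop in for `MyMapper` as `.map(new RandomKeyMapper())`; the second-stage merge by `windowEnd` in `AllPageCount` stays unchanged.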
