Spark-RDD 统计用户上网流量连续上网案例

最新推荐文章于 2021-02-18 14:36:02 发布

商俊超

最新推荐文章于 2021-02-18 14:36:02 发布

阅读量282

点赞数 1

分类专栏： Spark 文章标签： spark

本文链接：https://blog.csdn.net/m0_46538284/article/details/112177132

版权

Spark 专栏收录该内容

18 篇文章 0 订阅

订阅专栏

1,2020-02-18 14:20:30,2020-02-18 14:46:30,20
1,2020-02-18 14:47:20,2020-02-18 15:20:30,30
1,2020-02-18 15:37:23,2020-02-18 16:05:26,40
1,2020-02-18 16:06:27,2020-02-18 17:20:49,50
1,2020-02-18 17:21:50,2020-02-18 18:03:27,60
2,2020-02-18 14:18:24,2020-02-18 15:01:40,20
2,2020-02-18 15:20:49,2020-02-18 15:30:24,30
2,2020-02-18 16:01:23,2020-02-18 16:40:32,40
2,2020-02-18 16:44:56,2020-02-18 17:40:52,50
3,2020-02-18 14:39:58,2020-02-18 15:35:53,20
3,2020-02-18 15:36:39,2020-02-18 15:24:54,30

需求：统计每个用户的上网流量。数据每行的格式为“用户ID,开始时间,结束时间,流量”；如果同一用户相邻两次上网的间隔（上一次的结束时间到下一次的开始时间）小于10分钟，就把这两次上网合并（rollup）为一条记录，流量累加。

方法一

import java.text.SimpleDateFormat
import java.util.Date

import doit.day05_t.utils.SparkUtils
import org.apache.spark.rdd.RDD

/**
 * Method 1: tag every record with a per-user session number, then sum by (user, session).
 *
 * Each input line is `uid,startTime,endTime,flow`. Records of one user are sorted by
 * start time; a record opens a new session when the gap between the previous record's
 * end and its own start is 10 minutes or more, otherwise it joins the current session.
 * The records are then re-keyed by (uid, session number) and reduced to the total flow,
 * the earliest start and the latest end of each session.
 */
object FlowRollup {

  def main(args: Array[String]): Unit = {

    val sc = SparkUtils.createContext(true)

    try {
      val lines: RDD[String] = sc.textFile("src/main/scala/data/internet")

      // Gaps strictly shorter than this are merged. Comparing raw milliseconds avoids the
      // truncation of an integer "minutes" division (a 10m59s gap must NOT be merged).
      val mergeGapMillis = 10 * 60 * 1000L

      val res = lines.mapPartitions(it => {
        // One formatter per partition instead of one per record.
        val dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        it.map(e => {
          val fields = e.split(",")
          val uid = fields(0)
          val startTime = dateFormat.parse(fields(1)).getTime
          val endTime = dateFormat.parse(fields(2)).getTime
          val downFlow = fields(3).toLong
          (uid, (startTime, endTime, downFlow))
        })
      }).groupByKey().flatMapValues(it => {
        // (start timestamp, end timestamp, downstream flow), ordered by start time
        val sorted: List[(Long, Long, Long)] = it.toList.sortBy(_._1)

        // For every adjacent pair: 1 when the later record starts a new session, else 0.
        val breaks: List[Int] = sorted.zip(sorted.drop(1)).map {
          case ((_, previousEnd, _), (nextStart, _, _)) =>
            if (nextStart - previousEnd >= mergeGapMillis) 1 else 0
        }

        // Running sum of the breaks gives the session number of each record:
        // 0,0,0 is one session, 1,1 the next, 2,2,2 another one, and so on.
        val sessionIds: List[Int] = breaks.scanLeft(0)(_ + _)

        sorted.zip(sessionIds).map {
          case ((startTime, endTime, flow), sessionId) =>
            (startTime, endTime, flow, sessionId)
        }
      }).map {
        case (uid, (startTime, endTime, flow, sessionId)) =>
          ((uid, sessionId), (flow, startTime, endTime))
      }.reduceByKey((a, b) => {
        // total flow, earliest start, latest end
        (a._1 + b._1, Math.min(a._2, b._2), Math.max(a._3, b._3))
      }).mapPartitions(it => {
        val dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        it.map {
          case ((uid, _), (flow, startTime, endTime)) =>
            (uid, dateFormat.format(new Date(startTime)), dateFormat.format(new Date(endTime)), flow)
        }
      }).collect()

      println(res.toBuffer)

      //    Thread.sleep(10000000)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }

}

方法二

import java.text.SimpleDateFormat
import java.util.Date
import java.util.logging.SimpleFormatter

import doit.utils.SparkUtils
import org.apache.spark.rdd.RDD

/**
 * Method 2: emit a running (sessionStart, sessionEnd, accumulatedFlow) snapshot for every
 * record, then keep the largest snapshot of each session.
 *
 * Each input line is `uid,startTime,endTime,flow`. Records of one user are sorted by
 * start time. A record whose start is less than 10 minutes after the current session's
 * end extends that session; otherwise it opens a new one. All snapshots of a session
 * share the same session start, so keying by (uid, sessionStart) and taking the maximum
 * end and maximum accumulated flow yields one row per session.
 */
object RollUpInter {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)

    try {
      val lines: RDD[String] = sc.textFile("src/main/scala/data/internet")

      // Gaps strictly shorter than this many milliseconds are merged.
      val mergeGapMillis = 10 * 60 * 1000L

      val parsed = lines.mapPartitions(it => {
        // One formatter per partition instead of one per record.
        val format = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        it.map(line => {
          val fields = line.split(",")
          val uid = fields(0)
          val startTime: Long = format.parse(fields(1)).getTime
          val endTime: Long = format.parse(fields(2)).getTime
          val flow = fields(3).toInt
          (uid, (startTime, endTime, flow))
        })
      })

      val snapshots = parsed.groupByKey().flatMapValues(it => {
        it.toList.sortBy(_._1) match {
          case Nil => Nil
          case first :: rest =>
            // The first record is the initial session; every later record either
            // extends the running session or replaces it with a fresh one.
            rest.scanLeft(first) {
              case ((sessionStart, sessionEnd, total), (startTime, endTime, flow)) =>
                if (startTime - sessionEnd < mergeGapMillis) (sessionStart, endTime, total + flow)
                else (startTime, endTime, flow)
            }
        }
      })

      val rdd1 = snapshots.map {
        case (uid, (sessionStart, sessionEnd, total)) => ((uid, sessionStart), (sessionEnd, total))
      }.reduceByKey((a, b) => {
        // The last snapshot of a session carries its final end time and total flow.
        (Math.max(a._1, b._1), Math.max(a._2, b._2))
      }).mapPartitions(it => {
        val dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        it.map {
          case ((uid, sessionStart), (sessionEnd, total)) =>
            (uid, (dateFormat.format(new Date(sessionStart)), dateFormat.format(new Date(sessionEnd)), total))
        }
      })

      // Bring the rows to the driver before printing: rdd.foreach(println) writes to the
      // executors' stdout, which is not the driver console when running on a cluster.
      rdd1.collect().foreach(println)
      //Thread.sleep(10000000)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }

}

方法三（一次Shuffle）

import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.rdd.RDD
import utils.SparkUtils
import scala.collection.mutable.ArrayBuffer
/**
 * Method 3 (single shuffle): merge each user's sessions inside the group itself.
 *
 * Requirement: aggregate every user's traffic; two consecutive sessions of the same user
 * are rolled up into one when the gap between the end of the first and the start of the
 * second is less than 10 minutes. Each input line is `uid,startTime,endTime,flow`.
 */
object FlowTest {
  def main(args: Array[String]): Unit = {

    val sc = SparkUtils.createContext()

    try {
      val lines = sc.textFile("data/anli3_flow")

      // Gaps strictly shorter than this many milliseconds (10 minutes) are merged.
      val mergeGapMillis = 600000L

      // Parse every line into (uid, start, end, flow).
      val idTimeAndFlow: RDD[(String, Date, Date, Int)] = lines.mapPartitions(e => {
        // One formatter per partition instead of one per record.
        val timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        e.map(ee => {
          val arr = ee.split(",")
          val uid = arr(0)
          val startTime = timeFormat.parse(arr(1))
          val endTime = timeFormat.parse(arr(2))
          val flow = arr(3).toInt
          (uid, startTime, endTime, flow)
        })
      })

      val grouped: RDD[Iterable[(String, Date, Date, Int)]] = idTimeAndFlow.groupBy(_._1).values

      val result: RDD[(String, String, String, Int)] = grouped.flatMap(e => {
        val timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")

        // Element order inside a group is not guaranteed after the shuffle, so the
        // records must be put in start-time order before adjacent gaps are compared.
        val sorted = e.toList.sortBy(_._2.getTime)

        // Fold into a list of sessions, most recent first: a record either extends the
        // head session (gap under the threshold) or is pushed as a new session.
        val sessions = sorted.foldLeft(List.empty[(String, Date, Date, Int)]) {
          case ((uid, sessionStart, sessionEnd, total) :: done, (_, nextStart, nextEnd, flow))
              if nextStart.getTime - sessionEnd.getTime < mergeGapMillis =>
            (uid, sessionStart, nextEnd, total + flow) :: done
          case (done, record) =>
            record :: done
        }

        // Restore chronological order and render the timestamps as text.
        sessions.reverse.map {
          case (uid, sessionStart, sessionEnd, total) =>
            (uid, timeFormat.format(sessionStart), timeFormat.format(sessionEnd), total)
        }
      })

      println(result.collect().toBuffer)

      // Uncomment to keep the application alive (e.g. to inspect the Spark web UI).
      //Thread.sleep(10000000)
    } finally {
      // Release the context even when the job fails.
      sc.stop()
    }
  }
}

商俊超

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Spark-RDD 统计用户上网流量连续上网案例

1,2020-02-18 14:20:30,2020-02-18 14:46:30,20 1,2020-02-18 14:47:20,2020-02-18 15:20:30,30 1,2020-02-18 15:37:23,2020-02-18 16:05:26,40 1,2020-02-18 16:06:27,2020-02-18 17:20:49,50 1,2020-02-18 17:21:50,2020-02-18 18:03:27,60 2,2020-02-18 14:18:24,2020-02-…
复制链接

扫一扫