Utility class
```scala
import org.apache.spark.{SparkConf, SparkContext}

object SparkUtils {

  /** Creates a SparkContext; pass isLocal = true to run on local[*]. */
  def createContext(isLocal: Boolean = false): SparkContext = {
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    if (isLocal) conf.setMaster("local[*]")
    new SparkContext(conf)
  }
}
```
Implementation 1

- Suited to a large number of users where each user (group) has relatively little data: if a group's data gets very large, `toList` or `toSet` may overflow memory. The date-minus-row-number trick the job relies on is sketched below.
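A minimal sketch of that trick (pure Scala, hypothetical dates, not part of the original job): subtracting a date's row number within the sorted list from the date itself yields the same anchor date for every day of a consecutive run, so grouping by (uid, anchor) isolates each streak.

```scala
import java.time.LocalDate

object DateDiffTrickDemo {
  def main(args: Array[String]): Unit = {
    val dates = List("2018-03-01", "2018-03-02", "2018-03-03", "2018-03-06")
    dates.zipWithIndex.foreach { case (dt, rowNum) =>
      // Consecutive dates collapse onto the same anchor; a gap starts a new one.
      val anchor = LocalDate.parse(dt).minusDays(rowNum)
      println(s"$dt -> $anchor")
    }
    // 2018-03-01 -> 2018-03-01
    // 2018-03-02 -> 2018-03-01  (same anchor: still the same streak)
    // 2018-03-03 -> 2018-03-01
    // 2018-03-06 -> 2018-03-03  (new anchor: the streak was broken)
  }
}
```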
```scala
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.rdd.RDD

object ContinuedLoginUser {

  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines = sc.textFile("data/data1.txt")
    // Parse "uid,date" lines, deduplicate, and collect each user's dates.
    val grouped: RDD[(String, Iterable[String])] = lines.map(e => {
      val fields = e.split(",")
      val uid = fields(0)
      val date = fields(1)
      (uid, date)
    }).distinct()
      .groupByKey()
    // Sort each user's dates and subtract the row number from each date:
    // consecutive dates collapse onto the same anchor date.
    val dateDifRDD: RDD[(String, (String, String))] = grouped.flatMapValues(it => {
      val sorted = it.toList.sorted
      var num = 0
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val calendar = Calendar.getInstance()
      sorted.map(dt => {
        calendar.setTime(dateFormat.parse(dt))
        calendar.add(Calendar.DATE, -num)
        val dateDif: String = dateFormat.format(calendar.getTime)
        num += 1
        (dt, dateDif)
      })
    })
    // Count each (uid, anchor) streak, track its earliest and latest date
    // (min of the mins, max of the maxes), and keep runs of at least 3 days.
    val res: RDD[(String, String, String, Int)] = dateDifRDD.map {
      case (uid, (dt, dateDif)) => ((uid, dateDif), (1, dt, dt))
    }.reduceByKey((v1, v2) => {
      (v1._1 + v2._1, Ordering[String].min(v1._2, v2._2), Ordering[String].max(v1._3, v2._3))
    }).filter(e => e._2._1 >= 3)
      .map(t => (t._1._1, t._2._2, t._2._3, t._2._1))
    println(res.collect().toBuffer)
    sc.stop()
  }
}
```
Implementation 2

- Partition and sort, with one partition per user.
- Suited to a small number of users where each user (partition) has a lot of data.
- Not suited to data with many users: one partition per user means one task per user, and if each user has little data this spawns a great many tasks and wastes resources. A sketch of the repartition-and-sort step follows this list.
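A minimal, self-contained sketch (hypothetical uids and dates, not from the original post) of what `repartitionAndSortWithinPartitions` provides here: records are routed by the partitioner and arrive sorted by key inside each partition, with no global sort. Keys are (date, uid) so the sort within a single-user partition is by date; `TwoUserPartitioner` is a made-up stand-in for the `UidPartitioner` shown further down.

```scala
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

// Hypothetical two-user partitioner: routes (date, uid) keys by uid.
class TwoUserPartitioner extends Partitioner {
  override def numPartitions: Int = 2
  override def getPartition(key: Any): Int =
    if (key.asInstanceOf[(String, String)]._2 == "u1") 0 else 1
}

object SortWithinPartitionsDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("SortWithinPartitionsDemo").setMaster("local[2]"))
    val data = sc.parallelize(Seq(
      ("2018-03-03", "u1"), ("2018-03-01", "u2"),
      ("2018-03-01", "u1"), ("2018-03-02", "u2"))).map(t => (t, null))
    // After the shuffle, each partition holds one user's records in date order.
    data.repartitionAndSortWithinPartitions(new TwoUserPartitioner)
      .mapPartitionsWithIndex((idx, it) => it.map(e => s"partition $idx: ${e._1}"))
      .collect()
      .foreach(println)
    sc.stop()
  }
}
```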
```scala
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.rdd.RDD

object ContinuedLoginUser02 {

  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines = sc.textFile("data/data1.txt")
    val uidAndDate = lines.map(e => {
      val fields = e.split(",")
      val uid = fields(0)
      val date = fields(1)
      (uid, date)
    }).distinct()
    // Collect the distinct uids to the driver to build a one-partition-per-user partitioner.
    val uids: Array[String] = uidAndDate.keys.distinct().collect()
    val uidPartitioner = new UidPartitioner(uids)
    // Swap the key to (date, uid): repartitionAndSortWithinPartitions sorts by
    // key, so each single-user partition arrives in date order.
    val dtUidAndNull: RDD[((String, String), Null)] = uidAndDate.map(e => (e.swap, null))
    val partitionedAndSorted: RDD[((String, String), Null)] =
      dtUidAndNull.repartitionAndSortWithinPartitions(uidPartitioner)
    // Subtract the row number from each date; consecutive dates share an anchor.
    val dateDifRDD: RDD[((String, String), (Int, String, String))] = partitionedAndSorted.mapPartitions(it => {
      var rowNum = 0
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val calendar = Calendar.getInstance()
      it.map(e => {
        val dt = e._1._1
        val uid = e._1._2
        calendar.setTime(dateFormat.parse(dt))
        calendar.add(Calendar.DATE, -rowNum)
        val dateDif: String = dateFormat.format(calendar.getTime)
        rowNum += 1
        ((uid, dateDif), (1, dt, dt))
      })
    })
    // Count each streak, track its min and max date, and keep runs of >= 3 days.
    val res: RDD[(String, String, String, Int)] = dateDifRDD.reduceByKey((a, b) => {
      (a._1 + b._1, Ordering[String].min(a._2, b._2), Ordering[String].max(a._3, b._3))
    }).filter(e => e._2._1 >= 3).map(t => (t._1._1, t._2._2, t._2._3, t._2._1))
    println(res.collect().toBuffer)
    sc.stop()
  }
}
```
```scala
import scala.collection.mutable

import org.apache.spark.Partitioner

// One partition per uid. A class rather than a singleton object, so the
// uid -> index map is serialized with the partitioner instance and shipped to
// the executors (a singleton's state populated on the driver would not be).
class UidPartitioner(uids: Array[String]) extends Partitioner {

  private val uidToIndex = new mutable.HashMap[String, Int]()
  for ((uid, index) <- uids.zipWithIndex) {
    uidToIndex(uid) = index
  }

  override def numPartitions: Int = uidToIndex.size

  override def getPartition(key: Any): Int = {
    // Keys are (date, uid); route by the uid.
    val uid = key.asInstanceOf[(String, String)]._2
    uidToIndex(uid)
  }
}
```
Implementation 3 (best)

- Suited to many users, each with lots of data.
- A partition holds one or more users, so not many tasks are spawned, and data is sorted within each partition. The per-user counter reset this requires is sketched below.
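A pure-Scala sketch (hypothetical data, not part of the original job) of the counter reset that makes the trick work when several users share one sorted partition: the row number restarts at every uid boundary, so each user's dates get independent anchors.

```scala
object ResetCounterDemo {
  def main(args: Array[String]): Unit = {
    // Already sorted by (uid, date), as within a partition after the shuffle below.
    val sorted = List(("u1", "2018-03-01"), ("u1", "2018-03-02"),
                      ("u2", "2018-03-01"), ("u2", "2018-03-03"))
    var rowNum = 0
    var temp: String = null
    sorted.foreach { case (uid, dt) =>
      // New uid: restart the counter; same uid: keep counting.
      if (temp != null && temp != uid) rowNum = 1 else rowNum += 1
      temp = uid
      println(s"$uid $dt row_num=$rowNum")
    }
    // u1 2018-03-01 row_num=1
    // u1 2018-03-02 row_num=2
    // u2 2018-03-01 row_num=1
    // u2 2018-03-03 row_num=2
  }
}
```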
```scala
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.rdd.RDD

object ContinuedLoginUser03 {

  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines = sc.textFile("data/data1.txt")
    val uidAndDate = lines.map(e => {
      val fields = e.split(",")
      val uid = fields(0)
      val date = fields(1)
      (uid, date)
    }).distinct()
    // Key by (uid, date): partition on uid alone, but sort by the whole key,
    // so each partition holds whole users, each in date order.
    val uidDateAndNull: RDD[((String, String), Null)] = uidAndDate.map(e => (e, null))
    val partitionedAndSorted: RDD[((String, String), Null)] =
      uidDateAndNull.repartitionAndSortWithinPartitions(new MyHashPartitioner(2))
    val dateDifRDD: RDD[((String, String), (Int, String, String))] = partitionedAndSorted.mapPartitions(it => {
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val calendar = Calendar.getInstance()
      var rowNum = 0
      var temp: String = null
      it.map(e => {
        val uid = e._1._1
        val dt = e._1._2
        // Restart the row counter at each uid boundary within the partition.
        if (temp != null && !temp.equals(uid)) {
          rowNum = 1
        } else {
          rowNum += 1
        }
        temp = uid
        calendar.setTime(dateFormat.parse(dt))
        calendar.add(Calendar.DATE, -rowNum)
        val dateDif = dateFormat.format(calendar.getTime)
        ((uid, dateDif), (1, dt, dt))
      })
    })
    // Count each streak, track its min and max date, and keep runs of >= 3 days.
    val res: RDD[(String, String, String, Int)] = dateDifRDD.reduceByKey((a, b) => {
      (a._1 + b._1, Ordering[String].min(a._2, b._2), Ordering[String].max(a._3, b._3))
    }).filter(e => e._2._1 >= 3).map(t => (t._1._1, t._2._2, t._2._3, t._2._1))
    println(res.collect().toBuffer)
    sc.stop()
  }
}
```
```scala
import org.apache.spark.Partitioner

// Hash-partitions (uid, date) keys by the uid alone, so all of a user's
// records land in one partition while a partition can host several users.
class MyHashPartitioner(partitions: Int) extends Partitioner {

  override def numPartitions: Int = partitions

  override def getPartition(key: Any): Int = {
    if (key == null) {
      0
    } else {
      val uid = key.asInstanceOf[(String, String)]._1
      nonNegativeMod(uid.hashCode, numPartitions)
    }
  }

  // Same as Spark's Utils.nonNegativeMod: maps any Int into [0, mod).
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }
}
```
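Why not a plain `%`: in Scala, as in Java, the remainder operator keeps the sign of the dividend, so a negative `hashCode` would yield a negative partition id, which Spark rejects. A tiny standalone check of the helper:

```scala
object NonNegativeModDemo {
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  def main(args: Array[String]): Unit = {
    println(-7 % 3)                // -1: the remainder keeps the dividend's sign
    println(nonNegativeMod(-7, 3)) //  2: always a valid partition id in [0, mod)
  }
}
```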