Spark:计算连续登录3天及以上的用户

数据

guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-01
guid01,2018-03-02
guid01,2018-03-05
guid01,2018-03-04
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06
guid03,2018-03-06
guid03,2018-03-07
guid03,2018-03-09
guid03,2018-03-10
guid03,2018-03-11
guid04,2018-03-05
guid04,2018-03-06
guid04,2018-03-07
guid04,2018-03-09
import java.text.SimpleDateFormat
import java.util.{Calendar, Date}
import Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
/**
 * Finds users who logged in on at least [[GuidDemo.MinConsecutiveDays]]
 * consecutive calendar days.
 *
 * Input is a text file with one `uid,yyyy-MM-dd` record per line. For every
 * qualifying streak the job prints `(uid, days, firstDay, lastDay)`.
 */
object GuidDemo {

  /** Minimum streak length (in days) for a streak to be reported. */
  private val MinConsecutiveDays = 3

  /** Pattern of the date column; zero-padded, so string order equals date order. */
  private val DatePattern = "yyyy-MM-dd"

  def main(args: Array[String]): Unit = {
    val sc: SparkContext = SparkUtils.getSparkContext()
    try {
      val file: RDD[String] = sc.textFile("data/data1.txt")
      val res: RDD[(String, Int, String, String)] = consecutiveLogins(file)
      println(res.collect().toBuffer)
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
  }

  /**
   * Builds the `(uid, days, firstDay, lastDay)` records for every streak of at
   * least [[MinConsecutiveDays]] days.
   *
   * @param lines raw `uid,date` lines
   * @return one tuple per qualifying streak
   */
  private def consecutiveLogins(lines: RDD[String]): RDD[(String, Int, String, String)] =
    parseLogins(lines)
      .groupByKey()
      .flatMap { case (uid, dates) =>
        // All dates of a user are already co-located here, so streaks are
        // aggregated locally instead of through a second shuffle.
        loginStreaks(dates).collect {
          case (days, firstDay, lastDay) if days >= MinConsecutiveDays =>
            (uid, days, firstDay, lastDay)
        }
      }

  /**
   * Splits each line into a trimmed `(uid, date)` pair and removes duplicates.
   *
   * Lines that do not contain both a non-empty uid and a non-empty date (for
   * example blank lines) are skipped rather than failing the job.
   */
  private def parseLogins(lines: RDD[String]): RDD[(String, String)] =
    lines.flatMap { line =>
      line.split(",").map(_.trim) match {
        case Array(uid, dt, _*) if uid.nonEmpty && dt.nonEmpty => List((uid, dt))
        case _                                                  => Nil
      }
    }.distinct()

  /**
   * Groups the login dates of one user into runs of consecutive days.
   *
   * The dates are sorted, and the position of each date in that order is
   * subtracted (as a number of days) from the date itself. Dates belonging to
   * the same consecutive run all map to the same "anchor" date, so grouping by
   * the anchor yields the runs.
   *
   * @param dates login dates of a single user in `yyyy-MM-dd` form
   * @return `(days, firstDay, lastDay)` per run, ordered by `firstDay`
   * @throws java.text.ParseException if a date does not match `yyyy-MM-dd`
   */
  private def loginStreaks(dates: Iterable[String]): List[(Int, String, String)] = {
    // Created per call: SimpleDateFormat and Calendar are not thread-safe.
    val df: SimpleDateFormat = new SimpleDateFormat(DatePattern)
    val cal: Calendar = Calendar.getInstance()

    // Sort the dates; duplicates must be gone or the index arithmetic breaks.
    val sortedDays: List[String] = dates.toList.distinct.sorted

    val anchored: List[(String, String)] = sortedDays.zipWithIndex.map { case (day, idx) =>
      cal.setTime(df.parse(day))
      cal.add(Calendar.DATE, -idx)
      (df.format(cal.getTime), day)
    }

    anchored
      .groupBy { case (anchor, _) => anchor }
      .values
      .map { run =>
        // groupBy never produces an empty group, so min/max are safe here.
        val days: List[String] = run.map { case (_, day) => day }
        (days.size, days.min, days.max)
      }
      .toList
      .sortBy { case (_, firstDay, _) => firstDay }
  }
}

结果:

ArrayBuffer((guid02,3,2018-03-01,2018-03-03), (guid03,3,2018-03-09,2018-03-11), (guid01,4,2018-03-04,2018-03-07), (guid04,3,2018-03-05,2018-03-07), (guid01,3,2018-02-28,2018-03-02))
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值