Goal: for each guid, find every run of at least three consecutive active dates and output the length of the run together with its first and last date.

Data
guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-01
guid01,2018-03-02
guid01,2018-03-05
guid01,2018-03-04
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06
guid03,2018-03-06
guid03,2018-03-07
guid03,2018-03-09
guid03,2018-03-10
guid03,2018-03-11
guid04,2018-03-05
guid04,2018-03-06
guid04,2018-03-07
guid04,2018-03-09
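The code below relies on a classic trick: sort each user's dates, then subtract each date's position in the sorted list from the date itself. Within a run of consecutive days the result stays constant, so it can serve as a grouping key. A minimal standalone sketch of just that idea (the object name AnchorDemo is only for illustration; the sample dates are guid01's):

import java.time.LocalDate

object AnchorDemo {
  def main(args: Array[String]): Unit = {
    // Sorted, de-duplicated ISO dates for one user (lexicographic order
    // equals chronological order for the yyyy-MM-dd format).
    val dates = List("2018-02-28", "2018-03-01", "2018-03-02", "2018-03-04", "2018-03-05")
    // Subtract each date's index from the date; consecutive days share one anchor.
    dates.zipWithIndex.foreach { case (d, i) =>
      println((d, LocalDate.parse(d).minusDays(i).toString))
    }
    // (2018-02-28,2018-02-28)
    // (2018-03-01,2018-02-28)
    // (2018-03-02,2018-02-28)
    // (2018-03-04,2018-03-01)
    // (2018-03-05,2018-03-01)
  }
}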
import java.text.SimpleDateFormat
import java.util.{Calendar, Date}

import Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object GuidDemo {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = SparkUtils.getSparkContext()
    val file: RDD[String] = sc.textFile("data/data1.txt")

    // Parse each line into (uid, date) and drop duplicate records.
    val distinctRdd: RDD[(String, String)] = file.map(e => {
      val fields: Array[String] = e.split(",")
      val uid: String = fields(0)
      val dt: String = fields(1)
      (uid, dt)
    }).distinct()
    // distinctRdd.foreach(println)

    // Group all dates of the same user together.
    val groupdd: RDD[(String, Iterable[String])] = distinctRdd.groupByKey()
    // groupdd.foreach(println)

    val res: RDD[(String, Int, String, String)] = groupdd.flatMapValues(it => {
      // Sort the dates in the iterator.
      val sortedDates: List[String] = it.toList.sorted
      var num = 0
      val df: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val cal: Calendar = Calendar.getInstance()
      // Subtract each date's position in the sorted list from the date itself;
      // consecutive dates produce the same difference value.
      sortedDates.map(st => {
        val dfp: Date = df.parse(st)
        cal.setTime(dfp)
        cal.add(Calendar.DATE, -num)
        val ct: Date = cal.getTime
        val datediff: String = df.format(ct)
        num += 1
        (st, datediff)
      })
    }).map {
      // Key by (uid, difference value) so that each consecutive run forms one group.
      case (uid, (st, datediff)) => ((uid, datediff), (1, st, st))
    }.reduceByKey((v1, v2) => {
      // Count the days in the run and keep its earliest and latest date.
      (v1._1 + v2._1, Ordering[String].min(v1._2, v2._2), Ordering[String].max(v1._3, v2._3))
    }).filter(_._2._1 >= 3) // keep runs of at least 3 consecutive days
      .map(t => (t._1._1, t._2._1, t._2._2, t._2._3)) // (uid, run length, start date, end date)

    println(res.collect().toBuffer)
    sc.stop()
  }
}
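The program depends on a project-local helper, Utils.SparkUtils, which the post does not show. A minimal sketch of what such a helper could look like (the default appName and the local master are assumptions, not taken from the original project):

package Utils

import org.apache.spark.{SparkConf, SparkContext}

object SparkUtils {
  // Hypothetical implementation: builds (or reuses) a local SparkContext.
  def getSparkContext(appName: String = "GuidDemo", master: String = "local[*]"): SparkContext = {
    val conf = new SparkConf().setAppName(appName).setMaster(master)
    SparkContext.getOrCreate(conf)
  }
}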
Result:
ArrayBuffer((guid02,3,2018-03-01,2018-03-03), (guid03,3,2018-03-09,2018-03-11), (guid01,4,2018-03-04,2018-03-07), (guid04,3,2018-03-05,2018-03-07), (guid01,3,2018-02-28,2018-03-02))
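For comparison, the same result can also be produced with DataFrames and a window function instead of groupByKey. This is only a sketch, assuming Spark SQL is on the classpath; the object name GuidSqlDemo and the column names uid/dt are chosen here for illustration:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

object GuidSqlDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("GuidSqlDemo").master("local[*]").getOrCreate()
    import spark.implicits._

    // Read the file as (uid, dt) and drop duplicate records.
    val df = spark.read.csv("data/data1.txt").toDF("uid", "dt").distinct()

    // Rank each user's dates, then subtract the rank (in days) from the date;
    // every run of consecutive days gets the same anchor value.
    val w = Window.partitionBy("uid").orderBy("dt")
    val res = df
      .withColumn("rn", row_number().over(w))
      .withColumn("anchor", expr("date_sub(to_date(dt), rn)"))
      .groupBy("uid", "anchor")
      .agg(count("dt").as("days"), min("dt").as("start_dt"), max("dt").as("end_dt"))
      .where($"days" >= 3)
      .select("uid", "days", "start_dt", "end_dt")

    res.show(false)
    spark.stop()
  }
}

The window-function version avoids materializing each user's full date list in memory the way groupByKey does, and leaves the aggregation plan to Catalyst to optimize.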