Utility class
```scala
import org.apache.spark.{SparkConf, SparkContext}

object SparkUtils {

  /** Creates a SparkContext; pass isLocal = true to run on local[*]. */
  def createContext(isLocal: Boolean = false): SparkContext = {
    val conf = new SparkConf().setAppName(this.getClass.getSimpleName)
    if (isLocal) conf.setMaster("local[*]")
    new SparkContext(conf)
  }
}
```
Implementation 1

- Suited to a large number of users where each user (group) has relatively little data: if a group's data gets very large, `toList` or `toSet` may overflow memory. The date-minus-row-number trick the job relies on is sketched below.
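A minimal sketch of that trick (pure Scala, hypothetical dates, not part of the original job): subtracting a date's row number within the sorted list from the date itself yields the same anchor date for every day of a consecutive run, so grouping by (uid, anchor) isolates each streak.

```scala
import java.time.LocalDate

object DateDiffTrickDemo {
  def main(args: Array[String]): Unit = {
    val dates = List("2018-03-01", "2018-03-02", "2018-03-03", "2018-03-06")
    dates.zipWithIndex.foreach { case (dt, rowNum) =>
      // Consecutive dates collapse onto the same anchor; a gap starts a new one.
      val anchor = LocalDate.parse(dt).minusDays(rowNum)
      println(s"$dt -> $anchor")
    }
    // 2018-03-01 -> 2018-03-01
    // 2018-03-02 -> 2018-03-01  (same anchor: still the same streak)
    // 2018-03-03 -> 2018-03-01
    // 2018-03-06 -> 2018-03-03  (new anchor: the streak was broken)
  }
}
```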
```scala
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.rdd.RDD

object ContinuedLoginUser {

  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines = sc.textFile("data/data1.txt")
    // Parse "uid,date" lines, deduplicate, and collect each user's dates.
    val grouped: RDD[(String, Iterable[String])] = lines.map(e => {
      val fields = e.split(",")
      val uid = fields(0)
      val date = fields(1)
      (uid, date)
    }).distinct()
      .groupByKey()
    // Sort each user's dates and subtract the row number from each date:
    // consecutive dates collapse onto the same anchor date.
    val dateDifRDD: RDD[(String, (String, String))] = grouped.flatMapValues(it => {
      val sorted = it.toList.sorted
      var num = 0
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val calendar = Calendar.getInstance()
      sorted.map(dt => {
        calendar.setTime(dateFormat.parse(dt))
        calendar.add(Calendar.DATE, -num)
        val dateDif: String = dateFormat.format(calendar.getTime)
        num += 1
        (dt, dateDif)
      })
    })
    // Count each (uid, anchor) streak, track its earliest and latest date
    // (min of the mins, max of the maxes), and keep runs of at least 3 days.
    val res: RDD[(String, String, String, Int)] = dateDifRDD.map {
      case (uid, (dt, dateDif)) => ((uid, dateDif), (1, dt, dt))
    }.reduceByKey((v1, v2) => {
      (v1._1 + v2._1, Ordering[String].min(v1._2, v2._2), Ordering[String].max(v1._3, v2._3))
    }).filter(e => e._2._1 >= 3)
      .map(t => (t._1._1, t._2._2, t._2._3, t._2._1))
    println(res.collect().toBuffer)
    sc.stop()
  }
}
```
Implementation 2

- Partition and sort, with one partition per user.
- Suited to a small number of users where each user (partition) has a lot of data.
- Not suited to data with many users: one partition per user means one task per user, and if each user has little data this spawns a great many tasks and wastes resources. A sketch of the repartition-and-sort step follows this list.
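A minimal, self-contained sketch (hypothetical uids and dates, not from the original post) of what `repartitionAndSortWithinPartitions` provides here: records are routed by the partitioner and arrive sorted by key inside each partition, with no global sort. Keys are (date, uid) so the sort within a single-user partition is by date; `TwoUserPartitioner` is a made-up stand-in for the `UidPartitioner` shown further down.

```scala
import org.apache.spark.{Partitioner, SparkConf, SparkContext}

// Hypothetical two-user partitioner: routes (date, uid) keys by uid.
class TwoUserPartitioner extends Partitioner {
  override def numPartitions: Int = 2
  override def getPartition(key: Any): Int =
    if (key.asInstanceOf[(String, String)]._2 == "u1") 0 else 1
}

object SortWithinPartitionsDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("SortWithinPartitionsDemo").setMaster("local[2]"))
    val data = sc.parallelize(Seq(
      ("2018-03-03", "u1"), ("2018-03-01", "u2"),
      ("2018-03-01", "u1"), ("2018-03-02", "u2"))).map(t => (t, null))
    // After the shuffle, each partition holds one user's records in date order.
    data.repartitionAndSortWithinPartitions(new TwoUserPartitioner)
      .mapPartitionsWithIndex((idx, it) => it.map(e => s"partition $idx: ${e._1}"))
      .collect()
      .foreach(println)
    sc.stop()
  }
}
```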
```scala
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.rdd.RDD

object ContinuedLoginUser02 {

  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines = sc.textFile("data/data1.txt")
    val uidAndDate = lines.map(e => {
      val fields = e.split(",")
      val uid = fields(0)
      val date = fields(1)
      (uid, date)
    }).distinct()
    // Collect the distinct uids to the driver to build a one-partition-per-user partitioner.
    val uids: Array[String] = uidAndDate.keys.distinct().collect()
    val uidPartitioner = new UidPartitioner(uids)
    // Swap the key to (date, uid): repartitionAndSortWithinPartitions sorts by
    // key, so each single-user partition arrives in date order.
    val dtUidAndNull: RDD[((String, String), Null)] = uidAndDate.map(e => (e.swap, null))
    val partitionedAndSorted: RDD[((String, String), Null)] =
      dtUidAndNull.repartitionAndSortWithinPartitions(uidPartitioner)
    // Subtract the row number from each date; consecutive dates share an anchor.
    val dateDifRDD: RDD[((String, String), (Int, String, String))] = partitionedAndSorted.mapPartitions(it => {
      var rowNum = 0
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val calendar = Calendar.getInstance()
      it.map(e => {
        val dt = e._1._1
        val uid = e._1._2
        calendar.setTime(dateFormat.parse(dt))
        calendar.add(Calendar.DATE, -rowNum)
        val dateDif: String = dateFormat.format(calendar.getTime)
        rowNum += 1
        ((uid, dateDif), (1, dt, dt))
      })
    })
    // Count each streak, track its min and max date, and keep runs of >= 3 days.
    val res: RDD[(String, String, String, Int)] = dateDifRDD.reduceByKey((a, b) => {
      (a._1 + b._1, Ordering[String].min(a._2, b._2), Ordering[String].max(a._3, b._3))
    }).filter(e => e._2._1 >= 3).map(t => (t._1._1, t._2._2, t._2._3, t._2._1))
    println(res.collect().toBuffer)
    sc.stop()
  }
}
```
```scala
import scala.collection.mutable

import org.apache.spark.Partitioner

// One partition per uid. A class rather than a singleton object, so the
// uid -> index map is serialized with the partitioner instance and shipped to
// the executors (a singleton's state populated on the driver would not be).
class UidPartitioner(uids: Array[String]) extends Partitioner {

  private val uidToIndex = new mutable.HashMap[String, Int]()
  for ((uid, index) <- uids.zipWithIndex) {
    uidToIndex(uid) = index
  }

  override def numPartitions: Int = uidToIndex.size

  override def getPartition(key: Any): Int = {
    // Keys are (date, uid); route by the uid.
    val uid = key.asInstanceOf[(String, String)]._2
    uidToIndex(uid)
  }
}
```
Implementation 3 (best)

- Suited to many users, each with lots of data.
- A partition holds one or more users, so not many tasks are spawned, and data is sorted within each partition. The per-user counter reset this requires is sketched below.
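A pure-Scala sketch (hypothetical data, not part of the original job) of the counter reset that makes the trick work when several users share one sorted partition: the row number restarts at every uid boundary, so each user's dates get independent anchors.

```scala
object ResetCounterDemo {
  def main(args: Array[String]): Unit = {
    // Already sorted by (uid, date), as within a partition after the shuffle below.
    val sorted = List(("u1", "2018-03-01"), ("u1", "2018-03-02"),
                      ("u2", "2018-03-01"), ("u2", "2018-03-03"))
    var rowNum = 0
    var temp: String = null
    sorted.foreach { case (uid, dt) =>
      // New uid: restart the counter; same uid: keep counting.
      if (temp != null && temp != uid) rowNum = 1 else rowNum += 1
      temp = uid
      println(s"$uid $dt row_num=$rowNum")
    }
    // u1 2018-03-01 row_num=1
    // u1 2018-03-02 row_num=2
    // u2 2018-03-01 row_num=1
    // u2 2018-03-03 row_num=2
  }
}
```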
```scala
import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.rdd.RDD

object ContinuedLoginUser03 {

  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines = sc.textFile("data/data1.txt")
    val uidAndDate = lines.map(e => {
      val fields = e.split(",")
      val uid = fields(0)
      val date = fields(1)
      (uid, date)
    }).distinct()
    // Key by (uid, date): partition on uid alone, but sort by the whole key,
    // so each partition holds whole users, each in date order.
    val uidDateAndNull: RDD[((String, String), Null)] = uidAndDate.map(e => (e, null))
    val partitionedAndSorted: RDD[((String, String), Null)] =
      uidDateAndNull.repartitionAndSortWithinPartitions(new MyHashPartitioner(2))
    val dateDifRDD: RDD[((String, String), (Int, String, String))] = partitionedAndSorted.mapPartitions(it => {
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val calendar = Calendar.getInstance()
      var rowNum = 0
      var temp: String = null
      it.map(e => {
        val uid = e._1._1
        val dt = e._1._2
        // Restart the row counter at each uid boundary within the partition.
        if (temp != null && !temp.equals(uid)) {
          rowNum = 1
        } else {
          rowNum += 1
        }
        temp = uid
        calendar.setTime(dateFormat.parse(dt))
        calendar.add(Calendar.DATE, -rowNum)
        val dateDif = dateFormat.format(calendar.getTime)
        ((uid, dateDif), (1, dt, dt))
      })
    })
    // Count each streak, track its min and max date, and keep runs of >= 3 days.
    val res: RDD[(String, String, String, Int)] = dateDifRDD.reduceByKey((a, b) => {
      (a._1 + b._1, Ordering[String].min(a._2, b._2), Ordering[String].max(a._3, b._3))
    }).filter(e => e._2._1 >= 3).map(t => (t._1._1, t._2._2, t._2._3, t._2._1))
    println(res.collect().toBuffer)
    sc.stop()
  }
}
```
```scala
import org.apache.spark.Partitioner

// Hash-partitions (uid, date) keys by the uid alone, so all of a user's
// records land in one partition while a partition can host several users.
class MyHashPartitioner(partitions: Int) extends Partitioner {

  override def numPartitions: Int = partitions

  override def getPartition(key: Any): Int = {
    if (key == null) {
      0
    } else {
      val uid = key.asInstanceOf[(String, String)]._1
      nonNegativeMod(uid.hashCode, numPartitions)
    }
  }

  // Same as Spark's Utils.nonNegativeMod: maps any Int into [0, mod).
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }
}
```
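Why not a plain `%`: in Scala, as in Java, the remainder operator keeps the sign of the dividend, so a negative `hashCode` would yield a negative partition id, which Spark rejects. A tiny standalone check of the helper:

```scala
object NonNegativeModDemo {
  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  def main(args: Array[String]): Unit = {
    println(-7 % 3)                // -1: the remainder keeps the dividend's sign
    println(nonNegativeMod(-7, 3)) //  2: always a valid partition id in [0, mod)
  }
}
```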