1,2020-02-18 14:20:30,2020-02-18 14:46:30,20
1,2020-02-18 14:47:20,2020-02-18 15:20:30,30
1,2020-02-18 15:37:23,2020-02-18 16:05:26,40
1,2020-02-18 16:06:27,2020-02-18 17:20:49,50
1,2020-02-18 17:21:50,2020-02-18 18:03:27,60
2,2020-02-18 14:18:24,2020-02-18 15:01:40,20
2,2020-02-18 15:20:49,2020-02-18 15:30:24,30
2,2020-02-18 16:01:23,2020-02-18 16:40:32,40
2,2020-02-18 16:44:56,2020-02-18 17:40:52,50
3,2020-02-18 14:39:58,2020-02-18 15:35:53,20
3,2020-02-18 15:36:39,2020-02-18 15:24:54,30
-
需求:统计用户上网流量。如果同一用户相邻两次上网的间隔(上一次的结束时间到下一次的开始时间)小于10分钟,就把这两条记录rollup(合并)到一起:取最早的开始时间、最晚的结束时间,并累加流量。
方法一
import java.text.SimpleDateFormat
import java.util.Date
import doit.day05_t.utils.SparkUtils
import org.apache.spark.rdd.RDD
/**
 * Approach 1: tag every record with a per-user group number, then aggregate per group.
 *
 * Input lines have the form `uid,startTime,endTime,flow` with timestamps formatted as
 * `yyyy-MM-dd HH:mm:ss`. For each user the records are sorted by start time; a record
 * opens a new group when the gap between its start and the previous record's end is
 * ten minutes or more. Each group is reduced to (earliest start, latest end, total flow).
 */
object FlowRollup {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines: RDD[String] = sc.textFile("src/main/scala/data/internet")

    // Records are merged only when the gap is strictly below this many milliseconds.
    // Comparing in milliseconds avoids the truncation of an integer "minutes" division,
    // which would treat a gap of e.g. 10 min 30 s as "not more than 10 minutes".
    val mergeGapMillis = 10L * 60 * 1000

    val res = lines.mapPartitions { it =>
      // One formatter per partition instead of one per record.
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      it.map { line =>
        val fields = line.split(",")
        val uid = fields(0)
        val startTime = dateFormat.parse(fields(1)).getTime
        val endTime = dateFormat.parse(fields(2)).getTime
        val downFlow = fields(3).toLong
        (uid, (startTime, endTime, downFlow))
      }
    }.groupByKey().flatMapValues { sessions =>
      // Each element is (start timestamp, end timestamp, downstream flow).
      val sorted: List[(Long, Long, Long)] = sessions.toList.sortBy(_._1)
      // 1 where a record is separated from its predecessor by at least the threshold, else 0.
      val breaks = sorted.zip(sorted.drop(1)).map {
        case ((_, prevEnd, _), (nextStart, _, _)) =>
          if (nextStart - prevEnd >= mergeGapMillis) 1 else 0
      }
      // Running sum of the breaks: 0,0,0 is one group, 1,1 the next, 2,2,2 the one after.
      val groupIds = breaks.scanLeft(0)(_ + _)
      sorted.zip(groupIds).map {
        case ((startTime, endTime, flow), groupId) => (startTime, endTime, flow, groupId)
      }
    }.map {
      case (uid, (startTime, endTime, flow, groupId)) =>
        ((uid, groupId), (flow, startTime, endTime))
    }.reduceByKey { (a, b) =>
      // Total flow, earliest start and latest end of the group.
      (a._1 + b._1, Math.min(a._2, b._2), Math.max(a._3, b._3))
    }.mapPartitions { it =>
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      it.map {
        case ((uid, _), (flow, startTime, endTime)) =>
          (uid, dateFormat.format(new Date(startTime)), dateFormat.format(new Date(endTime)), flow)
      }
    }.collect()

    println(res.toBuffer)
    // Thread.sleep(10000000) // left disabled; presumably used to keep the application alive for inspection
    sc.stop()
  }
}
方法二
import java.text.SimpleDateFormat
import java.util.Date
import java.util.logging.SimpleFormatter
import doit.utils.SparkUtils
import org.apache.spark.rdd.RDD
/**
 * Approach 2: carry a running (window start, window end, accumulated value) along each
 * user's time-ordered records, then keep the largest end and largest accumulated value
 * seen for every (uid, window start) key.
 */
object RollUpInter {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext(true)
    val lines: RDD[String] = sc.textFile("src/main/scala/data/internet")

    // Parse "uid,start,end,value" into (uid, (startMillis, endMillis, value)).
    val parsed = lines.mapPartitions { rows =>
      val parser = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      rows.map { row =>
        val cols = row.split(",")
        val uid = cols(0)
        val beginMillis: Long = parser.parse(cols(1)).getTime
        val finishMillis: Long = parser.parse(cols(2)).getTime
        val amount = cols(3).toInt
        (uid, (beginMillis, finishMillis, amount))
      }
    }

    // For every record emit the state of the window it belongs to after absorbing it.
    val running = parsed.groupByKey().flatMapValues { records =>
      val ordered = records.toList.sortBy(_._1)
      // The seed (0L, 0L, 0) means "no window yet"; it is dropped again by `.tail`.
      ordered.scanLeft((0L, 0L, 0)) {
        case ((winStart, winEnd, total), (begin, finish, amount)) =>
          val windowOpen = winStart != 0L && winEnd != 0L
          if (windowOpen && begin - winEnd < (10 * 60 * 1000)) {
            // Gap below ten minutes: extend the current window.
            (winStart, finish, total + amount)
          } else {
            // First record, or gap too large: start a new window.
            (begin, finish, amount)
          }
      }.tail
    }

    val keyed = running.map {
      case (uid, (winStart, winEnd, total)) => ((uid, winStart), (winEnd, total))
    }

    // Per window, keep the maximum end time and the maximum accumulated value.
    val merged = keyed.reduceByKey { (left, right) =>
      val latestEnd = Math.max(left._1, right._1)
      val largestTotal = Math.max(left._2, right._2)
      (latestEnd, largestTotal)
    }

    val rdd1 = merged.mapPartitions { part =>
      val printer = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      part.map {
        case ((uid, winStart), (winEnd, total)) =>
          (uid, (printer.format(new Date(winStart)), printer.format(new Date(winEnd)), total))
      }
    }

    rdd1.foreach(println)
    //Thread.sleep(10000000)
    sc.stop()
  }
}
方法三(一次Shuffle)
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.rdd.RDD
import utils.SparkUtils
import scala.collection.mutable.ArrayBuffer
/**
 * Approach 3 (single shuffle): merge each user's records inside the grouped values.
 *
 * Requirement: total up each user's flow; two consecutive records of the same user are
 * rolled up into one when the gap between the end of the first and the start of the
 * second is less than ten minutes. Every output tuple is
 * (uid, formatted window start, formatted window end, total flow).
 */
object FlowTest {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.createContext()
    try {
      val lines = sc.textFile("data/anli3_flow")

      // Records are merged only when the gap is strictly below this many milliseconds.
      val mergeGapMillis = 600000L

      // Pre-process: parse "uid,start,end,flow" lines.
      val idTimeAndFlow: RDD[(String, Date, Date, Int)] = lines.mapPartitions { rows =>
        // One formatter per partition instead of one per record.
        val timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        rows.map { row =>
          val arr = row.split(",")
          val uid = arr(0)
          val startTime = timeFormat.parse(arr(1))
          val endTime = timeFormat.parse(arr(2))
          val flow = arr(3).toInt
          (uid, startTime, endTime, flow)
        }
      }

      val grouped: RDD[Iterable[(String, Date, Date, Int)]] = idTimeAndFlow.groupBy(_._1).values

      val result: RDD[(String, String, String, Int)] = grouped.flatMap { records =>
        val timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        // Sort by start time so the merge does not depend on the order in which the
        // records happen to arrive in the group.
        val ordered = records.toList.sortBy(_._2.getTime)
        // Windows are accumulated in reverse: the head of the list is the open window.
        val windows = ordered.foldLeft(List.empty[(String, Date, Date, Int)]) {
          case ((uid, winStart, winEnd, total) :: closed, (_, start, end, flow))
              if start.getTime - winEnd.getTime < mergeGapMillis =>
            // Gap below ten minutes: extend the open window and add the flow.
            (uid, winStart, end, total + flow) :: closed
          case (acc, record) =>
            // First record of the user, or gap of ten minutes or more: open a new window.
            record :: acc
        }
        windows.reverse.map {
          case (uid, winStart, winEnd, total) =>
            (uid, timeFormat.format(winStart), timeFormat.format(winEnd), total)
        }
      }

      println(result.collect().toBuffer)
      // Thread.sleep(10000000) // left disabled; presumably used to keep the application alive for inspection
    } finally {
      // Release the SparkContext even if the job fails.
      sc.stop()
    }
  }
}