项目中有一个需求:求取出租车(taxi)出行 OD 的最大峰值,即同一时段内相互重叠(交叉)的 OD 数量的最大值(taxi od 最大交叉值),采用队列处理:
需求示意图如下:
案例(自制测试数据):
// Hand-made sample trips for one car on one day: (carID, departTime, arrivalTime, date).
// Timestamps are fixed-width "yyyy-MM-dd HH:mm:ss" strings.
val data = Seq(
  ("A", "2019-01-05 00:23:20", "2019-01-05 00:27:20", "2019-1-05"),
  ("A", "2019-01-05 00:25:20", "2019-01-05 00:37:20", "2019-1-05"),
  ("A", "2019-01-05 00:35:20", "2019-01-05 00:40:20", "2019-1-05"),
  ("A", "2019-01-05 01:25:20", "2019-01-05 01:37:20", "2019-1-05"),
  ("A", "2019-01-05 02:25:20", "2019-01-05 02:37:20", "2019-1-05"),
  ("A", "2019-01-05 02:26:20", "2019-01-05 02:30:20", "2019-1-05"),
  ("A", "2019-01-05 02:27:20", "2019-01-05 02:28:20", "2019-1-05"),
  ("A", "2019-01-05 02:27:40", "2019-01-05 03:37:20", "2019-1-05"),
  ("A", "2019-01-05 04:25:20", "2019-01-05 04:57:20", "2019-1-05"),
  ("A", "2019-01-05 04:35:20", "2019-01-05 05:37:20", "2019-1-05"),
  ("A", "2019-01-05 05:00:20", "2019-01-05 06:37:20", "2019-1-05"),
  ("A", "2019-01-05 08:25:20", "2019-01-05 08:37:20", "2019-1-05"),
  ("A", "2019-01-05 09:25:20", "2019-01-05 09:37:20", "2019-1-05"),
  ("A", "2019-01-05 09:35:20", "2019-01-05 09:47:20", "2019-1-05"),
  ("A", "2019-01-05 10:35:20", "2019-01-05 10:47:20", "2019-1-05")
).toDF("carID", "departTime", "arrivalTime", "date")
/**
 * Extracts "big ODs": for every (carID, date) group, trips whose time spans overlap
 * (directly or transitively) are merged into one big OD, and for each big OD the
 * peak number of simultaneously active trips is reported.
 *
 * Timestamps are compared as plain strings, so `departTime` / `arrivalTime` are
 * assumed to be fixed-width, lexicographically sortable values such as
 * "yyyy-MM-dd HH:mm:ss". A trip that departs at exactly the moment another one
 * arrives is still counted as overlapping with it.
 *
 * @param data DataFrame with string columns `carID`, `departTime`, `arrivalTime`, `date`
 * @return DataFrame with columns `carID`, `date`, `maxOdNum` (peak overlap count of the
 *         big OD) and `endTime` (latest arrival time within the big OD); one row per big OD
 */
def getBigOD(data: DataFrame): DataFrame = {
  import data.sparkSession.implicits._
  import scala.collection.mutable

  data
    // Group on a tuple rather than a comma-joined string, so a comma inside carID
    // or an empty date cannot corrupt the key.
    .groupByKey(row => (row.getAs[String]("carID"), row.getAs[String]("date")))
    .flatMapGroups((key, rows) => {
      val (carID, date) = key

      // Sweep in departure order. Sorting by arrival time would mis-split groups
      // whenever a long trip envelopes shorter ones.
      val trips = rows
        .map(r => (r.getAs[String]("departTime"), r.getAs[String]("arrivalTime")))
        .toVector
        .sortBy(_._1)

      // Arrival times of the trips still in progress, earliest arrival first.
      val active = mutable.PriorityQueue.empty[String](Ordering[String].reverse)
      val res = mutable.ArrayBuffer.empty[(String, String, Int, String)]
      var maxOdNum = 0   // peak overlap count of the big OD currently being built
      var groupEnd = ""  // latest arrival time seen in the current big OD

      trips.foreach { case (departTime, arrivalTime) =>
        // Drop every trip that finished strictly before this one departs.
        while (active.nonEmpty && departTime > active.head) {
          active.dequeue()
        }

        if (active.isEmpty) {
          // Nothing overlaps the current trip: the previous big OD (if any) is complete.
          if (maxOdNum > 0) {
            res += ((carID, date, maxOdNum, groupEnd))
          }
          maxOdNum = 0
          groupEnd = arrivalTime
        } else if (arrivalTime > groupEnd) {
          groupEnd = arrivalTime
        }

        active.enqueue(arrivalTime)
        // Track the peak on every insertion so the last big OD reports its true
        // maximum, not just the number of trips still active at the end.
        maxOdNum = math.max(maxOdNum, active.size)
      }

      // Emit the trailing big OD.
      if (maxOdNum > 0) {
        res += ((carID, date, maxOdNum, groupEnd))
      }
      res
    })
    .toDF("carID", "date", "maxOdNum", "endTime")
}
result: