先将 RDD 按断面(前站 station_fore、后站 station_back)分组,再对每个分组按时间排序并去除 flow 连续重复的记录,最后展开生成新的 RDD:
// Group records by section (station_fore, station_back), clean each group, and flatten
// the per-group arrays so that every retained sample becomes its own record.
rdd
  .groupBy(record => (record.station_fore, record.station_back))
  .flatMap(group => cleandata(group))
/**
 * Removes consecutive duplicate-flow records from one section group.
 *
 * The group's samples are sorted by `deal_time` (compared as a String), then a
 * sample is kept only when it is the first one or its `flow` differs from the
 * sample immediately before it in that order.
 *
 * @param data a pair of (section key, samples belonging to that section); only the
 *             samples are used, the key is ignored
 * @return the retained samples in ascending `deal_time` order; an empty array when
 *         the group contains no samples
 */
def cleandata(data: ((String, String), Iterable[sample])): Array[sample] = {
  // Chronological order is required so that "consecutive" means adjacent in time.
  val ordered = data._2.toArray.sortBy(_.deal_time)
  // Comparing with the immediate predecessor is equivalent to comparing with the last
  // kept record: every skipped record has the same flow as the record kept before it.
  // Iterating over indices also makes an empty group yield an empty result instead of
  // failing on ordered(0).
  ordered.indices.collect {
    case i if i == 0 || ordered(i).flow != ordered(i - 1).flow => ordered(i)
  }.toArray
}
/**
 * One flow observation for a section, identified by its two stations.
 *
 * @param station_fore first station of the section key
 * @param station_back second station of the section key
 * @param flow         flow value of this record
 * @param deal_time    time of the record, kept as a String
 */
case class sample(station_fore: String, station_back: String, flow: Long, deal_time: String) {
  /** Renders the record as one comma-separated line, fields in declaration order. */
  override def toString: String = s"$station_fore,$station_back,$flow,$deal_time"
}