Here is a common scenario: a single device under one project reports multiple successful check records, but I only want the earliest reported record.
The code is as follows:
package com.spark

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

object LastestTime {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName(s"${this.getClass.getSimpleName}").master("local[4]").getOrCreate()
    import spark.implicits._

    // Load one day's worth of check records.
    val s = spark.read.json("/user/kafka/flume/ota_check/pt=2017-11-17")
    s.printSchema()

    // One window per (project, device, status) group, earliest create_time first;
    // nulls sort last so a missing timestamp never wins.
    val w = Window.partitionBy($"product_id", $"device_id", $"check_status").orderBy($"create_time".asc_nulls_last)

    // Number the rows within each window, keep only the first (earliest) row,
    // then drop the helper column.
    val filterResult = s.withColumn("rank", row_number.over(w)).where($"rank" === 1).drop("rank")
    filterResult.show()
  }
}
The idea is to partition by project ID, device ID, and report status, sort each partition by create_time ascending, and take the first record.
It works by first adding a rank column, filtering on it, and dropping that column at the end.
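To make this concrete, here is a minimal, self-contained sketch with invented sample data (the column names mirror the real dataset above; the values and the object name EarliestRecordDemo are made up for illustration) that you can run locally to see the dedup in action:

// A runnable sketch with invented sample data, for illustration only.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

object EarliestRecordDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("EarliestRecordDemo").master("local[2]").getOrCreate()
    import spark.implicits._

    // Two reports from device d1 (only the 09:00 row should survive)
    // and a single report from device d2.
    val records = Seq(
      ("p1", "d1", "success", "2017-11-17 09:00:00"),
      ("p1", "d1", "success", "2017-11-17 12:30:00"),
      ("p1", "d2", "success", "2017-11-17 10:15:00")
    ).toDF("product_id", "device_id", "check_status", "create_time")

    val w = Window.partitionBy($"product_id", $"device_id", $"check_status").orderBy($"create_time".asc_nulls_last)
    records.withColumn("rank", row_number.over(w)).where($"rank" === 1).drop("rank").show()
    // Expected output: the 09:00:00 row for d1 and the 10:15:00 row for d2.
  }
}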
Appendix: Java version
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.row_number;
....
WindowSpec w = Window.partitionBy("product_id", "device_id", "check_status").orderBy(col("create_time").asc_nulls_last());
Dataset<Row> userLastestOrder = tableData.withColumn("rank", row_number().over(w)).where(col("rank").equalTo(1)).drop("rank");
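For comparison, here is an alternative Scala sketch (not from the original post) that reaches the same result without a window function: aggregate the minimum create_time per group, then join back to recover the full rows. It assumes s is the DataFrame loaded in the Scala listing above. Note one behavioral difference: if two rows share the exact same earliest timestamp, the join keeps both, whereas row_number keeps exactly one.

// Alternative sketch: groupBy + min + join instead of a window function.
// Reuses the DataFrame `s` and the spark.implicits._ import from above.
import org.apache.spark.sql.functions._

val earliest = s
  .groupBy($"product_id", $"device_id", $"check_status")
  .agg(min($"create_time").as("create_time"))

// Join back on all four columns to recover the complete earliest rows.
val firstReports = s.join(earliest, Seq("product_id", "device_id", "check_status", "create_time"))
firstReports.show()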