今天重新修改了一版charge切片逻辑
NewChargeSlice 是基于原来的charge的注释的一版
package BatteryCareSparkJob.chargeslice
import java.text.SimpleDateFormat
import java.util.{Date, Properties}
import BatteryCareSparkJob.chargeslice
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.IntegerType
/**
* @Author: Houzhq
* @Date: 2019/8/6 9:42
* @Version 1.0
* @Note
*/
object NewChargeSlice {
def main(args: Array[String]): Unit = {
def writeSQL(df:DataFrame, sql_url:String, user:String, password:String, table:String): Unit ={
//val sql_url="jdbc:mysql://10.170.245.98:3306/bigdata?useUnicode=true&characterEncoding=utf8"
val prop=new Properties()
prop.setProperty("user", user) //"root"
prop.setProperty("password", password) //"reachauto"
prop.setProperty("driver","com.mysql.jdbc.Driver")
df.write.mode(SaveMode.Append).jdbc(sql_url,table,prop)
}
def initializationSpark(app_name:String): SparkSession ={
val sparkconf = new SparkConf()
.set( "spark.serializer", "org.apache.spark.serializer.KryoSerializer" )
val spark = SparkSession
.builder()
.appName(app_name)
.config(sparkconf)
// .master("local[*]")
.getOrCreate()
spark
}
val spark = initializationSpark("charge_slices")
import spark.implicits._
def tranTimeToString(tm:String) :String={
val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val tim = fm.format(new Date(tm.toLong*1000))
tim
}
val ctime2date = udf((ctime:String) => tranTimeToString(ctime))
/**last2hoursNotCompleteDFInputPath 上个两小时未充完电(未切分)的数据输入存储路径**/
val last2hoursNotCompleteDFInputPath = args(0)
/**originalDataInputPath 本个两小时输入的原始数据**/
val originalDataInputPath = args(1)
/**batchSize 确定同时从原始数据获取的车辆数**/
val batchSize = args(2).toInt
/**last2hoursNotCompleteDFOutputPath 上个两小时未充完电(未切分)的数据输出存储路径(这里需要脚本控制,每次程序跑完将此目录中的数据cp到last2hoursNotCompleteDFInputPath)**/
val last2hoursNotCompleteDFOutputPath = args(3)
/**historyIdDFInputPath 历史charge_id等数据的输入路径**/
val historyIdDFInputPath = args(4)
/**historyIdDFOutputPath 历史charge_id等数据的输出路径 (这里需要脚本控制,每次程序跑完将此目录中的数据cp到historyIdDFInputPath)**/
val historyIdDFOutputPath = args(5)
/**前十次充电的数据**/
val vdiff_10pre_output = args(6)
/**充电切片输出路径**/
val chargeSliceOutput = args(7)
val sql_url = args(8)
val user = args(9)
val password = args(10)
val table = args(11)
/**vin 车架号 可以用逗号连接各个车架号,可以为空,为空为默认从本个两小时中找到所有车辆**/
val vin = args.lift(12)
/**加载上两个小时未充完数据**/
val last2hoursNotCompleteDF: DataFrame = spark.read.parquet(last2hoursNotCompleteDFInputPath)
// .select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt", "lon", "lat","curr", "soc",
// "celldata_error", "probedata_error","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
// .drop("id")
.select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt","curr", "soc",
"celldata_error", "probedata_error","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
.drop("id")
val vins = vin match {
case None => FileSystem
.get(spark.sparkContext.hadoopConfiguration)
.listStatus(new Path(originalDataInputPath))
.flatMap(p => "(?<=id\\=).+".r.findFirstIn(p.getPath.toString))
// case None => FileSystem.get(spark.sparkContext.hadoopConfiguration).listStatus(new Path(originalDataInputPath)).flatMap(p => "(?<=id\\=).+".r.findFirstIn(p.getPath.toString))
case Some(v) => v.split(',')
}
val fDF = vins
.map{ v=> originalDataInputPath + "id=" + v }
.grouped(batchSize)
.map {
p =>
val vinArray = p.map{ l => "(?<=id=).+".r.findFirstIn( l.replace(originalDataInputPath, "")).get.toString}
val last2hoursNotCompleteDF_vin = last2hoursNotCompleteDF.filter($"vin".isin(vinArray:_*))
/** 当前两小时全部数据 **/
val parquet_DF: DataFrame = spark.read.parquet(p: _*)
.select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt", "lon", "lat","curr", "soc",
"celldata_error", "probedata_error", "temp_high", "temp_low","cell_high","cell_low","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
// .withColumnRenamed("miles", "miles")
.drop("lon", "lat","temp_high", "temp_low","cell_high","cell_low")
.union(last2hoursNotCompleteDF_vin)
.withColumn("last_charge_state",last("charge_state").over(Windows.w_vin_unboundedFollowing))
val notCompleteDF =parquet_DF
.filter($"last_charge_state" === 1)
.drop("last_charge_state")
notCompleteDF.withColumn("id", $"vin").repartition($"id").write.mode(SaveMode.Append).partitionBy("id").parquet(last2hoursNotCompleteDFOutputPath)
val todayDF = parquet_DF.filter($"last_charge_state" =!= 1).drop("last_charge_state")
val charge_unsliced = todayDF
.withColumn("time_interval_pre", $"ctime" - lag("ctime",1,0).over(Windows.w_vin))
.withColumn("time_interval_post", $"ctime" - lead("ctime", 1, 0).over(Windows.w_vin))
.filter($"charge_state" === 1)
.withColumn("charge_num_label",
when(($"ctime" - lag("ctime", 1, 0).over(Windows.w_vin)) > 1800, 1)
.otherwise(0))
.withColumn("charge_id", sum("charge_num_label").over(Windows.w_Preceding))
.drop("charge_num_label")
.withColumn("start_time", first("ctime").over(Windows.w_chargectime))
.withColumn("end_time", last("ctime").over(Windows.w_chargectime))
.withColumn("st+40",$"start_time" + 40)
.withColumn("et-30",$"end_time" - 30)
.withColumn("real_st",when($"ctime" >= $"st+40" && $"ctime" <= $"et-30",1).otherwise(0))
.filter($"real_st" === 1)
.drop("real_st","st+40","et-30")
.filter($"celldata_error" === 0)
.filter($"probedata_error" === 0)
.drop("celldata_error","probedata_error")
// charge_unsliced.show(1000,false)
// val aSeriesMax = new chargeslice.AnotherSeriesMax(IntegerType, 34)
// val aSeriesMin = new chargeslice.AnotherSeriesMin(IntegerType, 34)
// val array_sub_int = udf((a: Seq[Int], b: Int) => a.map(_ - b))
//
//
// val pack_warmtime_ctime = new MyAnotherMax(IntegerType)
// val pack_cooltime_ctime = new MyAnotherMin(IntegerType)
val charge_unsliced_2 = charge_unsliced
.withColumn("celldata_array", chargeslice.Windows.fun_data2array($"celldata"))
.withColumn("probedata_array", chargeslice.Windows.fun_probe2array($"probedata"))
.drop("celldata","probedata")
.withColumn("power", $"volt" * $"curr")
.withColumn("curr_last", lag("curr", 1, 0).over(Windows.w_vin))
.withColumn("Ah_each_record", ($"curr" + $"curr_last") * 0.5 * $"time_interval_pre" / 3600)
.drop("time_interval_pre","time_interval_post","curr_last")
.withColumn("Ah",sum("Ah_each_record").over(Windows.w_chargectime))
.drop("Ah_each_record")
.withColumn("start_soc", first("soc").over(Windows.w_chargectime))
.withColumn("end_soc", last("soc").over(Windows.w_chargectime))
.drop("soc")
// .withColumn("usage_time",max($"ctime").over(Windows.w_chargeid) - min($"ctime").over(Windows.w_chargeid))
// .withColumn("cell_volt_before_charge_array", first("celldata_array").over(Windows.w_chargectime))
// .withColumn("cell_volt_after_charge_array", last("celldata_array").over(Windows.w_chargectime))
// .withColumn("probe_temp_before_charge_array", first("probedata_array").over(Windows.w_chargectime))
// .withColumn("probe_temp_after_charge_array", last("probedata_array").over(Windows.w_chargectime))
.withColumn("charge_soc", $"end_soc" - $"start_soc")
// .withColumn("cell_mean", Windows.fun_mean($"celldata_array"))
// .withColumn("cell_std", Windows.fun_std($"celldata_array"))
// .withColumn("cell_diff", $"cell_high_evcc" - $"cell_low_evcc")
// .withColumn("temp_mean", Windows.fun_mean($"probedata_array"))
// .withColumn("temp_std", Windows.fun_std($"probedata_array"))
.withColumn("temp_diff", $"temp_high_evcc" - $"temp_low_evcc")
// .withColumn("max-fifthMax", Windows.fun_diff_among_max($"celldata_array"))
// .withColumn("min-fifthMin", Windows.fun_diff_among_min($"celldata_array"))
// .withColumn("max-mean", Windows.fun_max_diff_with_mean($"celldata_array"))
// .withColumn("min-mean", Windows.fun_min_diff_with_mean($"celldata_array"))
// .withColumn("probe(median-min)", Windows.fun_median($"probedata_array") - Windows.fun_min($"probedata_array"))
// .withColumn("probe(max-median)", Windows.fun_max($"probedata_array") - Windows.fun_median($"probedata_array"))
.drop("celldata_array")
val charge_sliced = charge_unsliced_2.groupBy("vin", "charge_id")
.agg(
first("miles").as("miles"),
// first("lon").as("lon"),
// first("lat").as("lat"),
first("start_time").as("start_time"),
first("end_time").as("end_time"),
first("Ah").as("Ah"),
// first("usage_time").as("usage_time"),
first("start_soc").as("start_soc"),
first("end_soc").as("end_soc"),
max("curr").as("curr_max"),
min("curr").as("curr_min"),
mean("curr").as("curr_mean"),
max("volt").as("volt_max"),
min("volt").as("volt_min"),
mean("volt").as("volt_mean"),
max("power").as("power_max"),
min("power").as("power_min"),
mean("power").as("power_mean"),
max($"temp_high_evcc").as("temp_high_max"),
// mean($"temp_high_evcc").as("temp_high_mean"),
min($"temp_low_evcc").as("temp_low_min"),
// mean($"temp_low_evcc").as("temp_low_mean"),
// max($"temp_low_evcc").as("temp_low_evcc_max"),
min($"temp_high_evcc").as("temp_high_evcc_min"),
first("charge_soc").as("charge_soc"),
// first("cell_volt_before_charge_array").as("cell_volt_before_charge_array"),
// first("cell_volt_after_charge_array").as("cell_volt_after_charge_array"),
// first("probe_temp_before_charge_array").as("probe_temp_before_charge_array"),
// first("probe_temp_after_charge_array").as("probe_temp_after_charge_array"),
// first("temp_high_evcc").as("temp_high_evcc"),
// first("temp_low_evcc").as("temp_low_evcc"),
// array_sub_int(aSeriesMax($"ctime", $"probedata_array"), min($"ctime")).as("probe_warmtime"),
// array_sub_int(aSeriesMin($"ctime", $"probedata_array"), min($"ctime")).as("probe_cooltime"),
// (pack_warmtime_ctime($"ctime", $"temp_high_evcc") - min($"ctime")).as("pack_warmtime"),
// (pack_cooltime_ctime($"ctime", $"temp_low_evcc") - min($"ctime")).as("pack_cooltime"),
// max("cell_mean").as("cell_volt_mean_max"),
// min("cell_mean").as("cell_volt_mean_min"),
// PercentileApprox.percentile_approx($"cell_mean", lit(0.5)).as("cell_volt_mean_median"),
// max($"cell_std").as("cell_volt_std_max"),
// min($"cell_std").as("cell_volt_std_min"),
// PercentileApprox.percentile_approx($"cell_std", lit(0.5)).as("cell_volt_std_median"),
// max($"cell_diff").as("cell_volt_diff_max"),
// min($"cell_diff").as("cell_volt_diff_min"),
// PercentileApprox.percentile_approx($"cell_diff", lit(0.5)).as("cell_volt_diff_median"),
// max($"temp_mean").as("probe_mean_max"),
// min($"temp_mean").as("probe_mean_min"),
// PercentileApprox.percentile_approx($"temp_mean", lit(0.5)).as("probe_mean_median"),
// max($"temp_std").as("probe_std_max"),
// min($"temp_std").as("probe_std_min"),
// PercentileApprox.percentile_approx($"temp_std", lit(0.5)).as("probe_std_median"),
max($"temp_diff").as("probe_diff_max")
// min($"temp_diff").as("probe_diff_min"),
// PercentileApprox.percentile_approx($"temp_diff", lit(0.5)).as("probe_diff_median"),
// max("max-fifthMax").as("diff_among_max"),
// max("min-fifthMin").as("diff_among_min"),
// max("max-mean").as("max_diff_with_mean"),
// max("min-mean").as("min_diff_with_mean"),
// max("probe(median-min)").as("probe_diff_med_low_max"),
// max("probe(max-median)").as("probe_diff_high_med_max"),
// PercentileApprox.percentile_approx($"cell_mean", lit(0.25)).as("cell_volt_mean_qu1"),
// PercentileApprox.percentile_approx($"cell_mean", lit(0.75)).as("cell_volt_mean_qu3"),
// PercentileApprox.percentile_approx($"cell_std", lit(0.25)).as("cell_volt_std_qu1"),
// PercentileApprox.percentile_approx($"cell_std", lit(0.75)).as("cell_volt_std_qu3"),
// PercentileApprox.percentile_approx($"cell_diff", lit(0.25)).as("cell_volt_diff_qu1"),
// PercentileApprox.percentile_approx($"cell_diff", lit(0.75)).as("cell_volt_diff_qu3"),
// PercentileApprox.percentile_approx($"temp_mean", lit(0.25)).as("probe_mean_qu1"),
// PercentileApprox.percentile_approx($"temp_mean", lit(0.75)).as("probe_mean_qu3"),
// PercentileApprox.percentile_approx($"temp_std", lit(0.25)).as("probe_std_qu1"),
// PercentileApprox.percentile_approx($"temp_std", lit(0.75)).as("probe_std_qu3"),
// PercentileApprox.percentile_approx($"temp_diff", lit(0.25)).as("probe_diff_qu1"),
// PercentileApprox.percentile_approx($"temp_diff", lit(0.75)).as("probe_diff_qu3"),
// PercentileApprox.percentile_approx($"temp_high_evcc", lit(0.5)).as("temp_high_median"),
// PercentileApprox.percentile_approx($"temp_high_evcc", lit(0.25)).as("temp_high_qu1"),
// PercentileApprox.percentile_approx($"temp_high_evcc", lit(0.75)).as("temp_high_qu3"),
// PercentileApprox.percentile_approx($"temp_low_evcc", lit(0.5)).as("temp_low_median"),
// PercentileApprox.percentile_approx($"temp_low_evcc", lit(0.25)).as("temp_low_qu1"),
// PercentileApprox.percentile_approx($"temp_low_evcc", lit(0.75)).as("temp_low_qu3"),
// PercentileApprox.percentile_approx($"curr", lit(0.5)).as("curr_median"),
// PercentileApprox.percentile_approx($"curr", lit(0.25)).as("curr_qu1"),
// PercentileApprox.percentile_approx($"curr", lit(0.75)).as("curr_qu3"),
// PercentileApprox.percentile_approx($"volt", lit(0.5)).as("volt_median"),
// PercentileApprox.percentile_approx($"volt", lit(0.25)).as("volt_qu1"),
// PercentileApprox.percentile_approx($"volt", lit(0.75)).as("volt_qu3"),
// PercentileApprox.percentile_approx($"power", lit(0.5)).as("power_median"),
// PercentileApprox.percentile_approx($"power", lit(0.25)).as("power_qu1"),
// PercentileApprox.percentile_approx($"power", lit(0.75)).as("power_qu3")
)
.withColumn("Current_first_starttime", first("start_time").over(Windows.w_chargeid))
// .withColumn("probe_temp_before_charge_median", Windows.fun_median($"probe_temp_before_charge_array"))
// .withColumn("probe_temp_after_charge_median", Windows.fun_median($"probe_temp_after_charge_array"))
// .withColumn("ptbc_D", Windows.fun_ptac_D($"probe_temp_before_charge_array", $"probe_temp_before_charge_median"))
// .withColumn("ptac_D", Windows.fun_ptac_D($"probe_temp_after_charge_array", $"probe_temp_after_charge_median"))
// .withColumn("ptac_Dm", Windows.fun_mean($"ptac_D"))
// .withColumn("ptac_Ds", Windows.fun_std($"ptac_D"))
/** 加载历史chargeid **/
var history_id_df = spark.read.format("com.databricks.spark.csv")
.option("header", "false").option("delimiter", ",").option("inferSchema", "true").load(historyIdDFInputPath)
.toDF("history_vin", "history_id")
// .toDF("history_vin", "history_id", "xxx")
// .withColumn("history_first_starttime", split($"xxx", "\\$")(0))
// .withColumn("histroy_ptac_Dm_10", split($"xxx", "\\$")(1))
// .withColumn("histroy_ptac_Ds_10", split($"xxx", "\\$")(2))
// .withColumn("histroy_ptac_Dm_all_num", split($"xxx", "\\$")(3))
// .withColumn("histroy_ptac_Ds_all_num", split($"xxx", "\\$")(4))
// .withColumn("histroy_ptac_Dm_all", split($"xxx", "\\$")(5))
// .withColumn("histroy_ptac_Ds_all", split($"xxx", "\\$")(6))
// .drop("xxx")
// val p10 = udf((p: Seq[Double]) => {
// p.mkString(";")
// })
val dfa = charge_sliced
// .withColumn("ptac_Dm_array",collect_list("ptac_Dm").over(Windows.w_chargeid))
// .withColumn("ptac_Ds_array",collect_list("ptac_Ds").over(Windows.w_chargeid))
// .withColumn("current_ptac_Dm",p10($"ptac_Dm_array"))
// .withColumn("current_ptac_Ds",p10($"ptac_Ds_array"))
// .drop("ptac_Dm_array","ptac_Ds_array")
// .drop("vin_g")
// val get10 = udf((histroy_ptac_Dm_10_null: String, current_ptac_Dm_null: String) => {
// var ptac_Dm_10 = ""
// if (histroy_ptac_Dm_10_null.equals("null")){
// ptac_Dm_10 = current_ptac_Dm_null
// }else if(current_ptac_Dm_null.equals("null")){
// ptac_Dm_10 = histroy_ptac_Dm_10_null
// }
// else if (!histroy_ptac_Dm_10_null.equals("null") && !current_ptac_Dm_null.equals("null")){
// val all = histroy_ptac_Dm_10_null + ";" + current_ptac_Dm_null
// val all_arr = all.split(";")
// val length = all_arr.length
// if (length > 10){
// ptac_Dm_10 = all_arr.takeRight(10).mkString(";")
// }else{
// ptac_Dm_10 = all
// }
//
// }
// ptac_Dm_10
// })
//
//
// val getHistory = udf((current_ptac_Dm_null: String, histroy_ptac_Dm_all_num_null: Double) => {
// var history_all = 0.0
// if(current_ptac_Dm_null.equals("null")){
//
// history_all = histroy_ptac_Dm_all_num_null
// }
// else if(histroy_ptac_Dm_all_num_null == -999.0){
// val sum = current_ptac_Dm_null.split(";").map(_.toDouble).sum
//
// history_all = sum
// }
//
// else if(!current_ptac_Dm_null.equals("null") && histroy_ptac_Dm_all_num_null != -999.0){
// val sum = current_ptac_Dm_null.split(";").map(_.toDouble).sum + histroy_ptac_Dm_all_num_null
// history_all = sum
// }
// history_all
// })
//
//
// val getHistoryAllSum = udf((histroy_ptac_Dm_null: Double, current_ptac_Dm_null: String) => {
// var a = 0.0
// if (histroy_ptac_Dm_null == -999.0){
// val str_arr = current_ptac_Dm_null.split(";")
// val arr = str_arr.map{
// v=>
// val t = v.toDouble
// val x = t * t
// x
// }
// a = arr.sum
// }else if(current_ptac_Dm_null.equals("null")){
// a = histroy_ptac_Dm_null
// }
// else if (histroy_ptac_Dm_null != -999.0 && !current_ptac_Dm_null.equals("null")){
// val str_arr = current_ptac_Dm_null.split(";")
// val arr = str_arr.map{
// v=>
// val t = v.toDouble
// val x = t * t
// x
// }
// a = arr.sum + histroy_ptac_Dm_null
// }
// a
// })
val df = dfa.join(history_id_df, dfa("vin") === history_id_df("history_vin"), "outer")
.withColumn("history_vin_null2x", when($"history_vin".isNull, "x").otherwise($"history_vin"))
.withColumn("vin_null2x", when($"vin".isNull, "x").otherwise($"vin"))
.withColumn("v", Windows.vin($"vin_null2x", $"history_vin_null2x"))
.drop("history_vin", "vin", "history_vin_null2x", "vin_null2x")
.withColumnRenamed("v", "vin")
.withColumn("charge_id_null", when($"charge_id".isNull, 0).otherwise($"charge_id"))
.withColumn("history_id_null", when($"history_id".isNull, 0).otherwise($"history_id"))
.withColumn("charge_id_now", $"charge_id_null" + $"history_id_null")
.drop("charge_id_null", "history_id_null", "charge_id", "history_id")
.withColumnRenamed("charge_id_now", "charge_id")
// .withColumn("Current_first_starttime_null", when($"Current_first_starttime".isNull, -1).otherwise($"Current_first_starttime"))
// .withColumn("history_first_starttime_null", when($"history_first_starttime".isNull, -1).otherwise($"history_first_starttime"))
// .withColumn("first_starttime", Windows.first_starttime($"Current_first_starttime_null", $"history_first_starttime_null"))
// .drop("Current_first_starttime_null", "history_first_starttime_null", "history_first_starttime", "Current_first_starttime")
// .withColumn("current_ptac_Dm_null", when($"current_ptac_Dm".isNull, "null").otherwise($"current_ptac_Dm"))
// .withColumn("histroy_ptac_Dm_10_null", when($"histroy_ptac_Dm_10".isNull, "null").otherwise($"histroy_ptac_Dm_10"))
// .drop("current_ptac_Dm","histroy_ptac_Dm_10")
// .withColumn("ptac_Dm_10",get10($"histroy_ptac_Dm_10_null",$"current_ptac_Dm_null"))
// .withColumn("current_ptac_Ds_null", when($"current_ptac_Ds".isNull, "null").otherwise($"current_ptac_Ds"))
// .withColumn("histroy_ptac_Ds_10_null", when($"histroy_ptac_Ds_10".isNull, "null").otherwise($"histroy_ptac_Ds_10"))
// .drop("current_ptac_Ds","histroy_ptac_Ds_10")
// .withColumn("ptac_Ds_10",get10($"histroy_ptac_Ds_10_null",$"current_ptac_Ds_null"))
// .withColumn("histroy_ptac_Dm_all_num_null", when($"histroy_ptac_Dm_all_num".isNull, -999.0).otherwise($"histroy_ptac_Dm_all_num"))
// .withColumn("histroy_ptac_Ds_all_num_null", when($"histroy_ptac_Ds_all_num".isNull, -999.0).otherwise($"histroy_ptac_Ds_all_num"))
// .drop("histroy_ptac_Dm_all_num","histroy_ptac_Ds_all_num")
// .withColumn("ptac_Dm_all_num",getHistory($"current_ptac_Dm_null",$"histroy_ptac_Dm_all_num_null"))
// .withColumn("ptac_Ds_all_num",getHistory($"current_ptac_Ds_null",$"histroy_ptac_Ds_all_num_null"))
// .withColumn("histroy_ptac_Dm_all_null", when($"histroy_ptac_Dm_all".isNull, -999.0).otherwise($"histroy_ptac_Dm_all"))
// .drop("histroy_ptac_Dm_all")
// .withColumn("ptac_Dm_all",getHistoryAllSum($"histroy_ptac_Dm_all_num_null",$"current_ptac_Dm_null"))
// .withColumn("histroy_ptac_Ds_all_null", when($"histroy_ptac_Ds_all".isNull, "null").otherwise($"histroy_ptac_Ds_all"))
// .drop("histroy_ptac_Ds_all")
// .withColumn("ptac_Ds_all",getHistoryAllSum($"histroy_ptac_Ds_all_num_null",$"current_ptac_Ds_null"))
// /** 存储chargeId和需要的数据 **/
// val buffer = df.select("vin", "charge_id", "first_starttime", "ptac_Dm_10", "ptac_Ds_10", "ptac_Dm_all_num", "ptac_Ds_all_num", "ptac_Dm_all", "ptac_Ds_all")
// .withColumn("xxx", concat_ws("$", $"first_starttime", $"ptac_Dm_10", $"ptac_Ds_10", $"ptac_Dm_all", $"ptac_Ds_all", $"ptac_Dm_all", $"ptac_Ds_all"))
// .groupBy("vin")
// .agg(
// max("charge_id").as("charge_id"),
// last("xxx").as("xx")
// )
// buffer.coalesce(1).write.mode(SaveMode.Append).option("header", "false").csv(historyIdDFOutputPath)
// buffer.write.mode(SaveMode.Append).option("header", "true").csv(historyIdDFOutputPath)
/** 存储chargeId和需要的数据 **/
val buffer = df.select("vin", "charge_id")
// .withColumn("xxx", concat_ws("$", $"first_starttime", $"ptac_Dm_10", $"ptac_Ds_10", $"ptac_Dm_all", $"ptac_Ds_all", $"ptac_Dm_all", $"ptac_Ds_all"))
.groupBy("vin")
.agg(
max("charge_id").as("charge_id")
// last("xxx").as("xx")
)
buffer.coalesce(1).write.mode(SaveMode.Append).option("header", "false").csv(historyIdDFOutputPath)
val DF = df
// .withColumn("ptac_Dm_10_array", Windows.fun_2array($"ptac_Dm_10"))
// .withColumn("ptac_Dmm10", Windows.fun_mean($"ptac_Dm_10_array"))
// .withColumn("ptac_Dms10", Windows.fun_std($"ptac_Dm_10_array"))
// .drop("ptac_Dm_10_array")
// .withColumn("ptac_Ds_10_array", Windows.fun_2array($"ptac_Ds_10"))
// .withColumn("ptac_Dsm10", Windows.fun_mean($"ptac_Ds_10_array"))
// .withColumn("ptac_Dss10", Windows.fun_std($"ptac_Ds_10_array"))
// .drop("ptac_Ds_10_array")
// .withColumn("ptac_DmmAll", $"ptac_Dm_all_num"/$"charge_id")
// .withColumn("ptac_DsmAll", $"ptac_Ds_all_num"/$"charge_id")
// .withColumn("ptac_Dm_all_array", Windows.fun_2array($"ptac_Dm_all"))
// .withColumn("ptac_DmsAll", sqrt(($"ptac_Dm_all" - $"ptac_DmmAll" * $"ptac_DmmAll")/$"charge_id") )
//
// .drop("ptac_Dm_all_array")
// .withColumn("ptac_Ds_all_array", Windows.fun_2array($"ptac_Ds_all"))
//
// .withColumn("ptac_DssAll", sqrt(($"ptac_Ds_all" - $"ptac_DsmAll" * $"ptac_DsmAll")/$"charge_id") )
// .drop("ptac_Ds_all_array")
// .withColumn("ptac_DmmAll_abs", abs($"ptac_DmmAll"))
// .withColumn("ptac_Dmm10_abs", abs($"ptac_Dmm10"))
// .withColumn("ptac_Dm_abs", abs($"ptac_Dm"))
// .filter($"ptac_Dm".isNotNull)
// .drop("current_ptac_Dm_null","histroy_ptac_Dm_10_null","ptac_Dm_10","current_ptac_Ds_null","histroy_ptac_Ds_10_null","ptac_Ds_10","histroy_ptac_Dm_all_num_null","histroy_ptac_Ds_all_num_null",
// "ptac_Dm_all_num","ptac_Ds_all_num","histroy_ptac_Dm_all_null","histroy_ptac_Ds_all_null","ptac_Dm_all","ptac_Ds_all")
// .withColumn("d_T_med", abs($"probe_temp_after_charge_median" - $"probe_temp_before_charge_median"))
// .withColumn("temp_raising_rate", Windows.temp_raising_rate($"ptac_D", $"ptbc_D", $"d_T_med"))
val b = DF
// .withColumn("temp_high_range", $"temp_high_max" - $"temp_high_evcc_min")
// .withColumn("temp_low_range", $"temp_low_evcc_max" - $"temp_low_min")
// .withColumn("temp_mean_before_charge", Windows.fun_mean($"probe_temp_before_charge_array"))
// .withColumn("temp_std_before_charge", Windows.fun_std($"probe_temp_before_charge_array"))
// .withColumn("temp_max_before_charge", Windows.fun_max($"probe_temp_before_charge_array"))
// .withColumn("temp_min_before_charge", Windows.fun_min($"probe_temp_before_charge_array"))
// .withColumn("probe_temp_before_charge_diff", $"temp_max_before_charge" - $"temp_min_before_charge")
// .withColumn("temp_mean_after_charge", Windows.fun_mean($"probe_temp_after_charge_array"))
// .withColumn("temp_std_after_charge", Windows.fun_std($"probe_temp_after_charge_array"))
// .withColumn("temp_max_after_charge", Windows.fun_max($"probe_temp_after_charge_array"))
// .withColumn("temp_min_after_charge", Windows.fun_min($"probe_temp_after_charge_array"))
// .withColumn("probe_temp_after_charge_diff", $"temp_max_after_charge" - $"temp_min_after_charge")
// .withColumn("cell_volt_rise", Windows.fun_rise($"cell_volt_before_charge_array", $"cell_volt_after_charge_array"))
// .withColumn("volt_mean_before_charge", Windows.fun_mean($"cell_volt_before_charge_array"))
// .withColumn("probe_temp_rise", Windows.fun_rise($"probe_temp_before_charge_array", $"probe_temp_after_charge_array"))
// .withColumn("volt_std_before_charge", Windows.fun_std($"cell_volt_before_charge_array"))
// .withColumn("volt_mean_after_charge", Windows.fun_mean($"cell_volt_after_charge_array"))
// .withColumn("volt_std_after_charge", Windows.fun_std($"cell_volt_after_charge_array"))
// .withColumn("cell_volt_after_charge_diff", Windows.fun_max($"cell_volt_after_charge_array") - Windows.fun_min($"cell_volt_after_charge_array"))
// .withColumn("end_diff_among_max", Windows.fun_max_diff_with_mean($"cell_volt_after_charge_array"))
// .withColumn("end_diff_among_min", Windows.fun_min_diff_with_mean($"cell_volt_after_charge_array"))
.withColumn("Dod", lag("end_soc", 1, Double.NaN).over(Windows.w_chargeid) - $"start_soc")
// .withColumn("diff_volt_between_max_and_end", $"max_diff_with_mean" - $"end_diff_among_max")
// .withColumn("charge_freq", $"charge_id" * 1000 / $"miles")
// .withColumn("fullCharge", when($"end_soc" > 99, 1).otherwise(0))
// .withColumn("dvdiffdt", when($"end_soc" > 99, $"cell_volt_diff_max" / ($"end_time" - $"first_starttime")).otherwise(Double.NaN))
// .withColumnRenamed("cell_volt_before_charge_array","cell_volt_before_charge")
// .withColumnRenamed("cell_volt_after_charge_array","cell_volt_after_charge")
// .withColumnRenamed("probe_temp_before_charge_array","probe_temp_before_charge")
// .withColumnRenamed("probe_temp_after_charge_array","probe_temp_after_charge")
.withColumn("charge_interval", $"end_time" - $"start_time")
.withColumn("charge_type", Windows.charge_type($"charge_interval", $"charge_soc"))
.withColumn("start_time_str", ctime2date($"start_time"))
.withColumn("end_time_str", ctime2date($"end_time"))
b
}
.reduce(_ union _)
// val w_10= Window
// .partitionBy("vin")
//
// val w_101 = fDF
// .filter($"fullCharge" === 1)
// .select("vin","charge_id","fullCharge","cell_volt_diff_max","first_starttime")
//
// w_101
// .createOrReplaceTempView("data")
//
// val vdiff_10pre = spark.sql(
// """SELECT * from data order by charge_id limit 10
// """.stripMargin)
// .groupBy("vin")
// .agg(
// mean("cell_volt_diff_max").as("vdiff_10pre"),
// first("first_starttime").as("first_charge_time")
// )
//
//
// vdiff_10pre.write.mode(SaveMode.Overwrite).parquet(vdiff_10pre_output)
val chargedf = fDF.select("vin","charge_id","start_time","end_time","start_time_str","end_time_str","charge_interval","start_soc","end_soc","Dod")
.withColumn("id",$"vin").repartition($"id").write.mode(SaveMode.Overwrite).partitionBy("id").parquet(chargeSliceOutput)
val charge_table = fDF.select("vin", "charge_id", "start_time", "end_time", "charge_interval", "charge_type", "charge_soc", "curr_max", "curr_min", "curr_mean", "start_soc", "end_soc", "temp_high_max", "temp_low_min",
"probe_diff_max", "volt_max", "volt_min","volt_mean","miles", "power_max", "power_min", "power_mean","Ah","probe_mean_median")
//写入数据库
writeSQL(charge_table,sql_url,user,password,table)
}
}
FirstChargeSlice 为测试完成的新版charge切片,去掉了很多字段
package BatteryCareSparkJob.chargeslice
import java.text.SimpleDateFormat
import java.util.{Date, Properties}
import BatteryCareSparkJob.chargeslice
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions._
/**
* @Author: Houzhq
* @Date: 2019/8/6 11:20
* @Version 1.0
* @Note
*/
object FirstChargeSlice {
def main(args: Array[String]): Unit = {
def writeSQL(df:DataFrame, sql_url:String, user:String, password:String, table:String): Unit ={
//val sql_url="jdbc:mysql://10.170.245.98:3306/bigdata?useUnicode=true&characterEncoding=utf8"
val prop=new Properties()
prop.setProperty("user", user) //"root"
prop.setProperty("password", password) //"reachauto"
prop.setProperty("driver","com.mysql.jdbc.Driver")
df.write.mode(SaveMode.Append).jdbc(sql_url,table,prop)
}
def initializationSpark(app_name:String): SparkSession ={
val sparkconf = new SparkConf()
.set( "spark.serializer", "org.apache.spark.serializer.KryoSerializer" )
val spark = SparkSession
.builder()
.appName(app_name)
.config(sparkconf)
// .master("local[*]")
.getOrCreate()
spark
}
val spark = initializationSpark("charge_slices")
import spark.implicits._
def tranTimeToString(tm:String) :String={
val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
val tim = fm.format(new Date(tm.toLong*1000))
tim
}
val ctime2date = udf((ctime:String) => tranTimeToString(ctime))
/**last2hoursNotCompleteDFInputPath 上个两小时未充完电(未切分)的数据输入存储路径**/
val last2hoursNotCompleteDFInputPath = args(0)
/**originalDataInputPath 本个两小时输入的原始数据**/
val originalDataInputPath = args(1)
/**batchSize 确定同时从原始数据获取的车辆数**/
val batchSize = args(2).toInt
/**last2hoursNotCompleteDFOutputPath 上个两小时未充完电(未切分)的数据输出存储路径(这里需要脚本控制,每次程序跑完将此目录中的数据cp到last2hoursNotCompleteDFInputPath)**/
val last2hoursNotCompleteDFOutputPath = args(3)
/**historyIdDFInputPath 历史charge_id等数据的输入路径**/
val historyIdDFInputPath = args(4)
/**historyIdDFOutputPath 历史charge_id等数据的输出路径 (这里需要脚本控制,每次程序跑完将此目录中的数据cp到historyIdDFInputPath)**/
val historyIdDFOutputPath = args(5)
/**前十次充电的数据**/
val vdiff_10pre_output = args(6)
/**充电切片输出路径**/
val chargeSliceOutput = args(7)
val sql_url = args(8)
val user = args(9)
val password = args(10)
val table = args(11)
/**vin 车架号 可以用逗号连接各个车架号,可以为空,为空为默认从本个两小时中找到所有车辆**/
val vin = args.lift(12)
/**加载上两个小时未充完数据**/
val last2hoursNotCompleteDF: DataFrame = spark.read.parquet(last2hoursNotCompleteDFInputPath)
.select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt","curr", "soc",
"celldata_error", "probedata_error","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
.drop("id")
val vins = vin match {
case None => FileSystem
.get(spark.sparkContext.hadoopConfiguration)
.listStatus(new Path(originalDataInputPath))
.flatMap(p => "(?<=id\\=).+".r.findFirstIn(p.getPath.toString))
// case None => FileSystem.get(spark.sparkContext.hadoopConfiguration).listStatus(new Path(originalDataInputPath)).flatMap(p => "(?<=id\\=).+".r.findFirstIn(p.getPath.toString))
case Some(v) => v.split(',')
}
val fDF = vins
.map{ v=> originalDataInputPath + "id=" + v }
.grouped(batchSize)
.map {
p =>
val vinArray = p.map { l => "(?<=id=).+".r.findFirstIn(l.replace(originalDataInputPath, "")).get.toString }
val last2hoursNotCompleteDF_vin = last2hoursNotCompleteDF.filter($"vin".isin(vinArray: _*))
/** 当前两小时全部数据 **/
val parquet_DF: DataFrame = spark.read.parquet(p: _*)
.select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt", "lon", "lat","curr", "soc",
"celldata_error", "probedata_error", "temp_high", "temp_low","cell_high","cell_low","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
.drop("lon", "lat","temp_high", "temp_low","cell_high","cell_low")
.union(last2hoursNotCompleteDF_vin)
.withColumn("last_charge_state",last("charge_state").over(Windows.w_vin_unboundedFollowing))
val notCompleteDF =parquet_DF
.filter($"last_charge_state" === 1)
.drop("last_charge_state")
notCompleteDF.withColumn("id", $"vin").repartition($"id").write.mode(SaveMode.Append).partitionBy("id").parquet(last2hoursNotCompleteDFOutputPath)
val todayDF = parquet_DF.filter($"last_charge_state" =!= 1).drop("last_charge_state")
val charge_unsliced = todayDF
.withColumn("time_interval_pre", $"ctime" - lag("ctime",1,0).over(Windows.w_vin))
.withColumn("time_interval_post", $"ctime" - lead("ctime", 1, 0).over(Windows.w_vin))
.filter($"charge_state" === 1)
.withColumn("charge_num_label",
when(($"ctime" - lag("ctime", 1, 0).over(Windows.w_vin)) > 1800, 1)
.otherwise(0))
.withColumn("charge_id", sum("charge_num_label").over(Windows.w_Preceding))
.drop("charge_num_label")
.withColumn("start_time", first("ctime").over(Windows.w_chargectime))
.withColumn("end_time", last("ctime").over(Windows.w_chargectime))
.withColumn("st+40",$"start_time" + 40)
.withColumn("et-30",$"end_time" - 30)
.withColumn("real_st",when($"ctime" >= $"st+40" && $"ctime" <= $"et-30",1).otherwise(0))
.filter($"real_st" === 1)
.drop("real_st","st+40","et-30")
.filter($"celldata_error" === 0)
.filter($"probedata_error" === 0)
.drop("celldata_error","probedata_error")
val charge_unsliced_2 = charge_unsliced
.withColumn("celldata_array", chargeslice.Windows.fun_data2array($"celldata"))
.withColumn("probedata_array", chargeslice.Windows.fun_probe2array($"probedata"))
.withColumn("probe_ctime_mean", chargeslice.Windows.fun_mean($"probedata_array"))
.drop("celldata","probedata")
.withColumn("power", $"volt" * $"curr")
.withColumn("curr_last", lag("curr", 1, 0).over(Windows.w_vin))
.withColumn("Ah_each_record", ($"curr" + $"curr_last") * 0.5 * $"time_interval_pre" / 3600)
.drop("time_interval_pre","time_interval_post","curr_last")
.withColumn("Ah",sum("Ah_each_record").over(Windows.w_chargectime))
.drop("Ah_each_record")
.withColumn("start_soc", first("soc").over(Windows.w_chargectime))
.withColumn("end_soc", last("soc").over(Windows.w_chargectime))
.drop("soc")
.withColumn("charge_soc", $"end_soc" - $"start_soc")
.withColumn("temp_diff", $"temp_high_evcc" - $"temp_low_evcc")
.drop("celldata_array")
val charge_sliced = charge_unsliced_2.groupBy("vin", "charge_id")
.agg(
mean("probe_ctime_mean").as("temp_mean"),
first("miles").as("miles"),
first("start_time").as("start_time"),
first("end_time").as("end_time"),
first("Ah").as("Ah"),
first("start_soc").as("start_soc"),
first("end_soc").as("end_soc"),
max("curr").as("curr_max"),
min("curr").as("curr_min"),
mean("curr").as("curr_mean"),
max("volt").as("volt_max"),
min("volt").as("volt_min"),
mean("volt").as("volt_mean"),
max("power").as("power_max"),
min("power").as("power_min"),
mean("power").as("power_mean"),
max($"temp_high_evcc").as("temp_high_max"),
min($"temp_low_evcc").as("temp_low_min"),
min($"temp_high_evcc").as("temp_high_evcc_min"),
first("charge_soc").as("charge_soc"),
max($"temp_diff").as("probe_diff_max")
)
.withColumn("Current_first_starttime", first("start_time").over(Windows.w_chargeid))
.withColumn("start_time_str", ctime2date($"start_time"))
.withColumn("end_time_str", ctime2date($"end_time"))
/** 加载历史chargeid **/
var history_id_df = spark.read.format("com.databricks.spark.csv")
.option("header", "false").option("delimiter", ",").option("inferSchema", "true").load(historyIdDFInputPath)
.toDF("history_vin", "history_id")
val df = charge_sliced.join(history_id_df, charge_sliced("vin") === history_id_df("history_vin"), "outer")
.withColumn("history_vin_null2x", when($"history_vin".isNull, "x").otherwise($"history_vin"))
.withColumn("vin_null2x", when($"vin".isNull, "x").otherwise($"vin"))
.withColumn("v", Windows.vin($"vin_null2x", $"history_vin_null2x"))
.drop("history_vin", "vin", "history_vin_null2x", "vin_null2x")
.withColumnRenamed("v", "vin")
.withColumn("charge_id_null", when($"charge_id".isNull, 0).otherwise($"charge_id"))
.withColumn("history_id_null", when($"history_id".isNull, 0).otherwise($"history_id"))
.withColumn("charge_id_now", $"charge_id_null" + $"history_id_null")
.drop("charge_id_null", "history_id_null", "charge_id", "history_id")
.withColumnRenamed("charge_id_now", "charge_id")
/** 存储chargeId和需要的数据 **/
val buffer = df.select("vin", "charge_id")
.groupBy("vin")
.agg(
max("charge_id").as("charge_id")
)
buffer.coalesce(1).write.mode(SaveMode.Append).option("header", "false").csv(historyIdDFOutputPath)
val b = df
.withColumn("Dod", lag("end_soc", 1, Double.NaN).over(Windows.w_chargeid) - $"start_soc")
.withColumn("charge_interval", $"end_time" - $"start_time")
.withColumn("charge_type", Windows.charge_type($"charge_interval", $"charge_soc"))
b
}
.reduce(_ union _)
val chargedf = fDF.select("vin","charge_id","start_time","end_time","start_time_str","end_time_str","charge_interval","start_soc","end_soc","Dod")
chargedf.withColumn("id",$"vin").repartition($"id").write.mode(SaveMode.Overwrite).partitionBy("id").parquet(chargeSliceOutput)
val charge_table = fDF.select("vin", "charge_id", "start_time", "end_time", "charge_interval", "charge_type", "charge_soc", "curr_max", "curr_min", "curr_mean", "start_soc", "end_soc", "temp_high_max", "temp_low_min",
"probe_diff_max", "volt_max", "volt_min","volt_mean","miles", "power_max", "power_min", "power_mean","Ah","temp_mean")
//写入数据库
writeSQL(charge_table,sql_url,user,password,table)
}
}