2019.08.06

Today I reworked the charge-slicing logic again.

NewChargeSlice is based on the original charge job, with the removed logic kept as commented-out code.

package BatteryCareSparkJob.chargeslice

import java.text.SimpleDateFormat
import java.util.{Date, Properties}

import BatteryCareSparkJob.chargeslice
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.IntegerType

/**
  * @Author: Houzhq
  * @Date: 2019/8/6 9:42
  * @Version 1.0
  * @Note
  */
object NewChargeSlice {

  def main(args: Array[String]): Unit = {
    def writeSQL(df:DataFrame, sql_url:String, user:String, password:String, table:String): Unit ={

      //val sql_url="jdbc:mysql://10.170.245.98:3306/bigdata?useUnicode=true&characterEncoding=utf8"
      val prop=new Properties()
      prop.setProperty("user", user) //"root"
      prop.setProperty("password", password) //"reachauto"
      prop.setProperty("driver","com.mysql.jdbc.Driver")
      df.write.mode(SaveMode.Append).jdbc(sql_url,table,prop)
    }


    def initializationSpark(app_name:String): SparkSession ={
      val sparkconf = new SparkConf()
        .set( "spark.serializer", "org.apache.spark.serializer.KryoSerializer" )

      val spark = SparkSession
        .builder()
        .appName(app_name)
        .config(sparkconf)
        //      .master("local[*]")
        .getOrCreate()
      spark
    }

    val spark = initializationSpark("charge_slices")
    import spark.implicits._


    // ctime is a Unix timestamp in seconds; format it as "yyyy-MM-dd HH:mm:ss".
    def tranTimeToString(tm: String): String = {
      val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      fm.format(new Date(tm.toLong * 1000))
    }
    val ctime2date = udf((ctime: String) => tranTimeToString(ctime))




    /** last2hoursNotCompleteDFInputPath: input path for data from the previous two-hour window whose charging had not finished (not yet sliced) **/
    val last2hoursNotCompleteDFInputPath = args(0)

    /** originalDataInputPath: raw input data for the current two-hour window **/
    val originalDataInputPath = args(1)

    /** batchSize: how many vehicles' raw data to read at a time **/
    val batchSize = args(2).toInt

    /** last2hoursNotCompleteDFOutputPath: output path for this window's unfinished-charge (unsliced) data; a script must cp this directory to last2hoursNotCompleteDFInputPath after each run **/
    val last2hoursNotCompleteDFOutputPath = args(3)

    /** historyIdDFInputPath: input path for the historical charge_id data **/
    val historyIdDFInputPath = args(4)

    /** historyIdDFOutputPath: output path for the historical charge_id data; a script must cp this directory to historyIdDFInputPath after each run **/
    val historyIdDFOutputPath = args(5)

    /** vdiff_10pre_output: output path for the first ten charges' data **/
    val vdiff_10pre_output = args(6)

    /** chargeSliceOutput: output path for the charge slices **/
    val chargeSliceOutput = args(7)

    val sql_url = args(8)
    val user = args(9)
    val password = args(10)
    val table = args(11)

    /** vin: vehicle identification numbers, comma-separated; optional, and when absent every vehicle found in the current two-hour window is processed **/
    val vin = args.lift(12)
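
    // A hypothetical invocation, for illustration only (paths and credentials made up):
    //   spark-submit --class BatteryCareSparkJob.chargeslice.NewChargeSlice job.jar \
    //     /data/notcomplete/in/ /data/raw/2019080600/ 50 /data/notcomplete/out/ \
    //     /data/historyid/in/ /data/historyid/out/ /data/vdiff10/ /data/slices/ \
    //     "jdbc:mysql://host:3306/bigdata" user pass charge_slice vin1,vin2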





    /** Load the unfinished-charge data carried over from the previous two hours **/
    val last2hoursNotCompleteDF: DataFrame = spark.read.parquet(last2hoursNotCompleteDFInputPath)

//      .select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt",  "lon", "lat","curr", "soc",
//        "celldata_error", "probedata_error","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
//      .drop("id")

      .select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt","curr", "soc",
        "celldata_error", "probedata_error","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
      .drop("id")


    val vins = vin match {
      case None => FileSystem
        .get(spark.sparkContext.hadoopConfiguration)
        .listStatus(new Path(originalDataInputPath))
        .flatMap(p => "(?<=id\\=).+".r.findFirstIn(p.getPath.toString))

      case Some(v) => v.split(',')
    }


    val fDF = vins
      .map { v => originalDataInputPath + "id=" + v } // assumes originalDataInputPath ends with "/"
      .grouped(batchSize)
      .map {
        p =>

          val vinArray = p.map { l => "(?<=id=).+".r.findFirstIn(l.replace(originalDataInputPath, "")).get }

          val last2hoursNotCompleteDF_vin = last2hoursNotCompleteDF.filter($"vin".isin(vinArray:_*))


          /** All data for the current two-hour window **/
          val parquet_DF: DataFrame = spark.read.parquet(p: _*)
            .select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt",  "lon", "lat","curr", "soc",
              "celldata_error", "probedata_error", "temp_high", "temp_low","cell_high","cell_low","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
            .drop("lon", "lat","temp_high", "temp_low","cell_high","cell_low")

            .union(last2hoursNotCompleteDF_vin)
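            // last_charge_state: the vehicle's final charge_state in this window;
            // rows for vehicles still charging at the end carry over to the next run.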
            .withColumn("last_charge_state",last("charge_state").over(Windows.w_vin_unboundedFollowing))


          val notCompleteDF = parquet_DF
            .filter($"last_charge_state" === 1)
            .drop("last_charge_state")

          // Persist the still-charging rows so the next run can pick them up.
          notCompleteDF.withColumn("id", $"vin")
            .repartition($"id")
            .write.mode(SaveMode.Append)
            .partitionBy("id")
            .parquet(last2hoursNotCompleteDFOutputPath)




          val todayDF = parquet_DF.filter($"last_charge_state" =!= 1).drop("last_charge_state")



          val charge_unsliced = todayDF
            .withColumn("time_interval_pre", $"ctime" - lag("ctime",1,0).over(Windows.w_vin))
            .withColumn("time_interval_post", $"ctime" - lead("ctime", 1, 0).over(Windows.w_vin))
            .filter($"charge_state" === 1)
            .withColumn("charge_num_label",
              when(($"ctime" - lag("ctime", 1, 0).over(Windows.w_vin)) > 1800, 1)
                .otherwise(0))
            .withColumn("charge_id", sum("charge_num_label").over(Windows.w_Preceding))
            .drop("charge_num_label")

            .withColumn("start_time", first("ctime").over(Windows.w_chargectime))
            .withColumn("end_time", last("ctime").over(Windows.w_chargectime))

            .withColumn("st+40",$"start_time" + 40)
            .withColumn("et-30",$"end_time" - 30)
            .withColumn("real_st",when($"ctime" >=  $"st+40" && $"ctime" <=  $"et-30",1).otherwise(0))
            .filter($"real_st" === 1)
            .drop("real_st","st+40","et-30")
            .filter($"celldata_error" === 0)
            .filter($"probedata_error" === 0)
            .drop("celldata_error","probedata_error")

//          charge_unsliced.show(1000,false)

//          val aSeriesMax = new chargeslice.AnotherSeriesMax(IntegerType, 34)
//          val aSeriesMin = new chargeslice.AnotherSeriesMin(IntegerType, 34)
//          val array_sub_int = udf((a: Seq[Int], b: Int) => a.map(_ - b))
//
//
//          val pack_warmtime_ctime = new MyAnotherMax(IntegerType)
//          val pack_cooltime_ctime = new MyAnotherMin(IntegerType)


          val charge_unsliced_2 = charge_unsliced

            .withColumn("celldata_array", chargeslice.Windows.fun_data2array($"celldata"))
            .withColumn("probedata_array", chargeslice.Windows.fun_probe2array($"probedata"))
            .drop("celldata","probedata")
            .withColumn("power", $"volt" * $"curr")
            .withColumn("curr_last", lag("curr", 1, 0).over(Windows.w_vin))
            .withColumn("Ah_each_record", ($"curr" + $"curr_last") * 0.5 * $"time_interval_pre" / 3600)
            .drop("time_interval_pre","time_interval_post","curr_last")
            .withColumn("Ah",sum("Ah_each_record").over(Windows.w_chargectime))
            .drop("Ah_each_record")
            .withColumn("start_soc", first("soc").over(Windows.w_chargectime))
            .withColumn("end_soc", last("soc").over(Windows.w_chargectime))
            .drop("soc")
//            .withColumn("usage_time",max($"ctime").over(Windows.w_chargeid) - min($"ctime").over(Windows.w_chargeid))
//            .withColumn("cell_volt_before_charge_array", first("celldata_array").over(Windows.w_chargectime))
//            .withColumn("cell_volt_after_charge_array", last("celldata_array").over(Windows.w_chargectime))
//            .withColumn("probe_temp_before_charge_array", first("probedata_array").over(Windows.w_chargectime))
//            .withColumn("probe_temp_after_charge_array", last("probedata_array").over(Windows.w_chargectime))
            .withColumn("charge_soc", $"end_soc" - $"start_soc")
//            .withColumn("cell_mean", Windows.fun_mean($"celldata_array"))
//            .withColumn("cell_std", Windows.fun_std($"celldata_array"))
//            .withColumn("cell_diff", $"cell_high_evcc" - $"cell_low_evcc")
//            .withColumn("temp_mean", Windows.fun_mean($"probedata_array"))
//            .withColumn("temp_std", Windows.fun_std($"probedata_array"))
            .withColumn("temp_diff", $"temp_high_evcc" - $"temp_low_evcc")
//            .withColumn("max-fifthMax", Windows.fun_diff_among_max($"celldata_array"))
//            .withColumn("min-fifthMin", Windows.fun_diff_among_min($"celldata_array"))
//            .withColumn("max-mean", Windows.fun_max_diff_with_mean($"celldata_array"))
//            .withColumn("min-mean", Windows.fun_min_diff_with_mean($"celldata_array"))
//            .withColumn("probe(median-min)", Windows.fun_median($"probedata_array") - Windows.fun_min($"probedata_array"))
//            .withColumn("probe(max-median)", Windows.fun_max($"probedata_array") - Windows.fun_median($"probedata_array"))
            .drop("celldata_array")

          val charge_sliced = charge_unsliced_2.groupBy("vin", "charge_id")
            .agg(
              first("miles").as("miles"),
//              first("lon").as("lon"),
//              first("lat").as("lat"),
              first("start_time").as("start_time"),
              first("end_time").as("end_time"),
              first("Ah").as("Ah"),
//              first("usage_time").as("usage_time"),
              first("start_soc").as("start_soc"),
              first("end_soc").as("end_soc"),
              max("curr").as("curr_max"),
              min("curr").as("curr_min"),
              mean("curr").as("curr_mean"),
              max("volt").as("volt_max"),
              min("volt").as("volt_min"),
              mean("volt").as("volt_mean"),

              max("power").as("power_max"),
              min("power").as("power_min"),
              mean("power").as("power_mean"),

              max($"temp_high_evcc").as("temp_high_max"),
//              mean($"temp_high_evcc").as("temp_high_mean"),
              min($"temp_low_evcc").as("temp_low_min"),
//              mean($"temp_low_evcc").as("temp_low_mean"),
//              max($"temp_low_evcc").as("temp_low_evcc_max"),
              min($"temp_high_evcc").as("temp_high_evcc_min"),
              first("charge_soc").as("charge_soc"),
//              first("cell_volt_before_charge_array").as("cell_volt_before_charge_array"),
//              first("cell_volt_after_charge_array").as("cell_volt_after_charge_array"),
//              first("probe_temp_before_charge_array").as("probe_temp_before_charge_array"),
//              first("probe_temp_after_charge_array").as("probe_temp_after_charge_array"),
//              first("temp_high_evcc").as("temp_high_evcc"),
//              first("temp_low_evcc").as("temp_low_evcc"),
//              array_sub_int(aSeriesMax($"ctime", $"probedata_array"), min($"ctime")).as("probe_warmtime"),
//              array_sub_int(aSeriesMin($"ctime", $"probedata_array"), min($"ctime")).as("probe_cooltime"),
//              (pack_warmtime_ctime($"ctime", $"temp_high_evcc") - min($"ctime")).as("pack_warmtime"),
//              (pack_cooltime_ctime($"ctime", $"temp_low_evcc") - min($"ctime")).as("pack_cooltime"),
//              max("cell_mean").as("cell_volt_mean_max"),
//              min("cell_mean").as("cell_volt_mean_min"),
//              PercentileApprox.percentile_approx($"cell_mean", lit(0.5)).as("cell_volt_mean_median"),
//              max($"cell_std").as("cell_volt_std_max"),
//              min($"cell_std").as("cell_volt_std_min"),
//              PercentileApprox.percentile_approx($"cell_std", lit(0.5)).as("cell_volt_std_median"),
//              max($"cell_diff").as("cell_volt_diff_max"),
//              min($"cell_diff").as("cell_volt_diff_min"),
//              PercentileApprox.percentile_approx($"cell_diff", lit(0.5)).as("cell_volt_diff_median"),
//              max($"temp_mean").as("probe_mean_max"),
//              min($"temp_mean").as("probe_mean_min"),
//              PercentileApprox.percentile_approx($"temp_mean", lit(0.5)).as("probe_mean_median"),
//              max($"temp_std").as("probe_std_max"),
//              min($"temp_std").as("probe_std_min"),
//              PercentileApprox.percentile_approx($"temp_std", lit(0.5)).as("probe_std_median"),
              max($"temp_diff").as("probe_diff_max")
//              min($"temp_diff").as("probe_diff_min"),
//              PercentileApprox.percentile_approx($"temp_diff", lit(0.5)).as("probe_diff_median"),
//              max("max-fifthMax").as("diff_among_max"),
//              max("min-fifthMin").as("diff_among_min"),
//              max("max-mean").as("max_diff_with_mean"),
//              max("min-mean").as("min_diff_with_mean"),
//              max("probe(median-min)").as("probe_diff_med_low_max"),
//              max("probe(max-median)").as("probe_diff_high_med_max"),
//              PercentileApprox.percentile_approx($"cell_mean", lit(0.25)).as("cell_volt_mean_qu1"),
//              PercentileApprox.percentile_approx($"cell_mean", lit(0.75)).as("cell_volt_mean_qu3"),
//              PercentileApprox.percentile_approx($"cell_std", lit(0.25)).as("cell_volt_std_qu1"),
//              PercentileApprox.percentile_approx($"cell_std", lit(0.75)).as("cell_volt_std_qu3"),
//              PercentileApprox.percentile_approx($"cell_diff", lit(0.25)).as("cell_volt_diff_qu1"),
//              PercentileApprox.percentile_approx($"cell_diff", lit(0.75)).as("cell_volt_diff_qu3"),
//              PercentileApprox.percentile_approx($"temp_mean", lit(0.25)).as("probe_mean_qu1"),
//              PercentileApprox.percentile_approx($"temp_mean", lit(0.75)).as("probe_mean_qu3"),
//              PercentileApprox.percentile_approx($"temp_std", lit(0.25)).as("probe_std_qu1"),
//              PercentileApprox.percentile_approx($"temp_std", lit(0.75)).as("probe_std_qu3"),
//              PercentileApprox.percentile_approx($"temp_diff", lit(0.25)).as("probe_diff_qu1"),
//              PercentileApprox.percentile_approx($"temp_diff", lit(0.75)).as("probe_diff_qu3"),
//              PercentileApprox.percentile_approx($"temp_high_evcc", lit(0.5)).as("temp_high_median"),
//              PercentileApprox.percentile_approx($"temp_high_evcc", lit(0.25)).as("temp_high_qu1"),
//              PercentileApprox.percentile_approx($"temp_high_evcc", lit(0.75)).as("temp_high_qu3"),
//              PercentileApprox.percentile_approx($"temp_low_evcc", lit(0.5)).as("temp_low_median"),
//              PercentileApprox.percentile_approx($"temp_low_evcc", lit(0.25)).as("temp_low_qu1"),
//              PercentileApprox.percentile_approx($"temp_low_evcc", lit(0.75)).as("temp_low_qu3"),
//              PercentileApprox.percentile_approx($"curr", lit(0.5)).as("curr_median"),
//              PercentileApprox.percentile_approx($"curr", lit(0.25)).as("curr_qu1"),
//              PercentileApprox.percentile_approx($"curr", lit(0.75)).as("curr_qu3"),
//              PercentileApprox.percentile_approx($"volt", lit(0.5)).as("volt_median"),
//              PercentileApprox.percentile_approx($"volt", lit(0.25)).as("volt_qu1"),
//              PercentileApprox.percentile_approx($"volt", lit(0.75)).as("volt_qu3"),
//              PercentileApprox.percentile_approx($"power", lit(0.5)).as("power_median"),
//              PercentileApprox.percentile_approx($"power", lit(0.25)).as("power_qu1"),
//              PercentileApprox.percentile_approx($"power", lit(0.75)).as("power_qu3")
            )
            .withColumn("Current_first_starttime", first("start_time").over(Windows.w_chargeid))
//            .withColumn("probe_temp_before_charge_median", Windows.fun_median($"probe_temp_before_charge_array"))
//            .withColumn("probe_temp_after_charge_median", Windows.fun_median($"probe_temp_after_charge_array"))

//            .withColumn("ptbc_D", Windows.fun_ptac_D($"probe_temp_before_charge_array", $"probe_temp_before_charge_median"))
//            .withColumn("ptac_D", Windows.fun_ptac_D($"probe_temp_after_charge_array", $"probe_temp_after_charge_median"))
//            .withColumn("ptac_Dm", Windows.fun_mean($"ptac_D"))
//            .withColumn("ptac_Ds", Windows.fun_std($"ptac_D"))


          /** Load the historical charge_id data **/
          val history_id_df = spark.read.format("com.databricks.spark.csv")
            .option("header", "false").option("delimiter", ",").option("inferSchema", "true").load(historyIdDFInputPath)
            .toDF("history_vin", "history_id")

          //            .toDF("history_vin", "history_id", "xxx")
//            .withColumn("history_first_starttime", split($"xxx", "\\$")(0))
//            .withColumn("histroy_ptac_Dm_10", split($"xxx", "\\$")(1))
//            .withColumn("histroy_ptac_Ds_10", split($"xxx", "\\$")(2))
//            .withColumn("histroy_ptac_Dm_all_num", split($"xxx", "\\$")(3))
//            .withColumn("histroy_ptac_Ds_all_num", split($"xxx", "\\$")(4))
//            .withColumn("histroy_ptac_Dm_all", split($"xxx", "\\$")(5))
//            .withColumn("histroy_ptac_Ds_all", split($"xxx", "\\$")(6))
//            .drop("xxx")

//          val p10 = udf((p: Seq[Double]) => {
//            p.mkString(";")
//          })

          val dfa = charge_sliced
//            .withColumn("ptac_Dm_array",collect_list("ptac_Dm").over(Windows.w_chargeid))
//            .withColumn("ptac_Ds_array",collect_list("ptac_Ds").over(Windows.w_chargeid))
//            .withColumn("current_ptac_Dm",p10($"ptac_Dm_array"))
//            .withColumn("current_ptac_Ds",p10($"ptac_Ds_array"))
//            .drop("ptac_Dm_array","ptac_Ds_array")
//            .drop("vin_g")


//          val get10 = udf((histroy_ptac_Dm_10_null: String, current_ptac_Dm_null: String) => {
//            var ptac_Dm_10 = ""
//            if (histroy_ptac_Dm_10_null.equals("null")){
//              ptac_Dm_10 = current_ptac_Dm_null
//            }else if(current_ptac_Dm_null.equals("null")){
//              ptac_Dm_10 = histroy_ptac_Dm_10_null
//            }
//            else if (!histroy_ptac_Dm_10_null.equals("null") && !current_ptac_Dm_null.equals("null")){
//              val all = histroy_ptac_Dm_10_null + ";" + current_ptac_Dm_null
//              val all_arr = all.split(";")
//              val length  = all_arr.length
//              if (length > 10){
//                ptac_Dm_10 = all_arr.takeRight(10).mkString(";")
//              }else{
//                ptac_Dm_10 = all
//              }
//
//            }
//            ptac_Dm_10
//          })
//
//
//          val getHistory = udf((current_ptac_Dm_null: String, histroy_ptac_Dm_all_num_null: Double) => {
//            var history_all = 0.0
//            if(current_ptac_Dm_null.equals("null")){
//
//              history_all = histroy_ptac_Dm_all_num_null
//            }
//            else if(histroy_ptac_Dm_all_num_null == -999.0){
//              val sum = current_ptac_Dm_null.split(";").map(_.toDouble).sum
//
//              history_all = sum
//            }
//
//            else if(!current_ptac_Dm_null.equals("null") && histroy_ptac_Dm_all_num_null != -999.0){
//              val sum = current_ptac_Dm_null.split(";").map(_.toDouble).sum + histroy_ptac_Dm_all_num_null
//              history_all = sum
//            }
//            history_all
//          })
//
//
//          val getHistoryAllSum = udf((histroy_ptac_Dm_null: Double, current_ptac_Dm_null: String) => {
//            var a = 0.0
//            if (histroy_ptac_Dm_null == -999.0){
//              val str_arr = current_ptac_Dm_null.split(";")
//              val arr = str_arr.map{
//                v=>
//                  val t = v.toDouble
//                  val x = t * t
//                  x
//              }
//              a = arr.sum
//            }else if(current_ptac_Dm_null.equals("null")){
//              a = histroy_ptac_Dm_null
//            }
//            else if (histroy_ptac_Dm_null != -999.0 && !current_ptac_Dm_null.equals("null")){
//              val str_arr = current_ptac_Dm_null.split(";")
//              val arr = str_arr.map{
//                v=>
//                  val t = v.toDouble
//                  val x = t * t
//                  x
//              }
//              a = arr.sum + histroy_ptac_Dm_null
//            }
//            a
//          })
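
          // Outer-join the batch against the per-vin historical max charge_id so ids
          // keep increasing across runs: vin is coalesced from whichever side of the
          // join is non-null ("x" stands in for null), and the history id is added
          // as an offset to this batch's charge_id.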

          val df = dfa.join(history_id_df, dfa("vin") === history_id_df("history_vin"), "outer")
            .withColumn("history_vin_null2x", when($"history_vin".isNull, "x").otherwise($"history_vin"))
            .withColumn("vin_null2x", when($"vin".isNull, "x").otherwise($"vin"))
            .withColumn("v", Windows.vin($"vin_null2x", $"history_vin_null2x"))
            .drop("history_vin", "vin", "history_vin_null2x", "vin_null2x")
            .withColumnRenamed("v", "vin")
            .withColumn("charge_id_null", when($"charge_id".isNull, 0).otherwise($"charge_id"))
            .withColumn("history_id_null", when($"history_id".isNull, 0).otherwise($"history_id"))
            .withColumn("charge_id_now", $"charge_id_null" + $"history_id_null")
            .drop("charge_id_null", "history_id_null", "charge_id", "history_id")
            .withColumnRenamed("charge_id_now", "charge_id")
//            .withColumn("Current_first_starttime_null", when($"Current_first_starttime".isNull, -1).otherwise($"Current_first_starttime"))
//            .withColumn("history_first_starttime_null", when($"history_first_starttime".isNull, -1).otherwise($"history_first_starttime"))
//            .withColumn("first_starttime", Windows.first_starttime($"Current_first_starttime_null", $"history_first_starttime_null"))
//            .drop("Current_first_starttime_null", "history_first_starttime_null", "history_first_starttime", "Current_first_starttime")
//            .withColumn("current_ptac_Dm_null", when($"current_ptac_Dm".isNull, "null").otherwise($"current_ptac_Dm"))
//            .withColumn("histroy_ptac_Dm_10_null", when($"histroy_ptac_Dm_10".isNull, "null").otherwise($"histroy_ptac_Dm_10"))
//            .drop("current_ptac_Dm","histroy_ptac_Dm_10")
//            .withColumn("ptac_Dm_10",get10($"histroy_ptac_Dm_10_null",$"current_ptac_Dm_null"))
//            .withColumn("current_ptac_Ds_null", when($"current_ptac_Ds".isNull, "null").otherwise($"current_ptac_Ds"))
//            .withColumn("histroy_ptac_Ds_10_null", when($"histroy_ptac_Ds_10".isNull, "null").otherwise($"histroy_ptac_Ds_10"))
//            .drop("current_ptac_Ds","histroy_ptac_Ds_10")
//            .withColumn("ptac_Ds_10",get10($"histroy_ptac_Ds_10_null",$"current_ptac_Ds_null"))
//            .withColumn("histroy_ptac_Dm_all_num_null", when($"histroy_ptac_Dm_all_num".isNull, -999.0).otherwise($"histroy_ptac_Dm_all_num"))
//            .withColumn("histroy_ptac_Ds_all_num_null", when($"histroy_ptac_Ds_all_num".isNull, -999.0).otherwise($"histroy_ptac_Ds_all_num"))
//            .drop("histroy_ptac_Dm_all_num","histroy_ptac_Ds_all_num")
//            .withColumn("ptac_Dm_all_num",getHistory($"current_ptac_Dm_null",$"histroy_ptac_Dm_all_num_null"))
//            .withColumn("ptac_Ds_all_num",getHistory($"current_ptac_Ds_null",$"histroy_ptac_Ds_all_num_null"))
//            .withColumn("histroy_ptac_Dm_all_null", when($"histroy_ptac_Dm_all".isNull, -999.0).otherwise($"histroy_ptac_Dm_all"))
//            .drop("histroy_ptac_Dm_all")
//            .withColumn("ptac_Dm_all",getHistoryAllSum($"histroy_ptac_Dm_all_num_null",$"current_ptac_Dm_null"))
//            .withColumn("histroy_ptac_Ds_all_null", when($"histroy_ptac_Ds_all".isNull, "null").otherwise($"histroy_ptac_Ds_all"))
//            .drop("histroy_ptac_Ds_all")
//            .withColumn("ptac_Ds_all",getHistoryAllSum($"histroy_ptac_Ds_all_num_null",$"current_ptac_Ds_null"))


//          /** Store the charge_id and associated data **/
//          val buffer = df.select("vin", "charge_id", "first_starttime", "ptac_Dm_10", "ptac_Ds_10", "ptac_Dm_all_num", "ptac_Ds_all_num", "ptac_Dm_all", "ptac_Ds_all")
//            .withColumn("xxx", concat_ws("$", $"first_starttime", $"ptac_Dm_10", $"ptac_Ds_10", $"ptac_Dm_all", $"ptac_Ds_all", $"ptac_Dm_all", $"ptac_Ds_all"))
//            .groupBy("vin")
//            .agg(
//              max("charge_id").as("charge_id"),
//              last("xxx").as("xx")
//            )
//          buffer.coalesce(1).write.mode(SaveMode.Append).option("header", "false").csv(historyIdDFOutputPath)
          //    buffer.write.mode(SaveMode.Append).option("header", "true").csv(historyIdDFOutputPath)


          /** Store the charge_id and associated data **/
          val buffer = df.select("vin", "charge_id")
//            .withColumn("xxx", concat_ws("$", $"first_starttime", $"ptac_Dm_10", $"ptac_Ds_10", $"ptac_Dm_all", $"ptac_Ds_all", $"ptac_Dm_all", $"ptac_Ds_all"))
            .groupBy("vin")
            .agg(
              max("charge_id").as("charge_id")
//              last("xxx").as("xx")
            )
          buffer.coalesce(1).write.mode(SaveMode.Append).option("header", "false").csv(historyIdDFOutputPath)


          val DF = df
//            .withColumn("ptac_Dm_10_array", Windows.fun_2array($"ptac_Dm_10"))
//            .withColumn("ptac_Dmm10", Windows.fun_mean($"ptac_Dm_10_array"))
//            .withColumn("ptac_Dms10", Windows.fun_std($"ptac_Dm_10_array"))
//            .drop("ptac_Dm_10_array")
//            .withColumn("ptac_Ds_10_array", Windows.fun_2array($"ptac_Ds_10"))
//            .withColumn("ptac_Dsm10", Windows.fun_mean($"ptac_Ds_10_array"))
//            .withColumn("ptac_Dss10", Windows.fun_std($"ptac_Ds_10_array"))
//            .drop("ptac_Ds_10_array")
//            .withColumn("ptac_DmmAll", $"ptac_Dm_all_num"/$"charge_id")
//            .withColumn("ptac_DsmAll", $"ptac_Ds_all_num"/$"charge_id")
//            .withColumn("ptac_Dm_all_array", Windows.fun_2array($"ptac_Dm_all"))


//            .withColumn("ptac_DmsAll", sqrt(($"ptac_Dm_all" - $"ptac_DmmAll" * $"ptac_DmmAll")/$"charge_id") )
//
//            .drop("ptac_Dm_all_array")
//            .withColumn("ptac_Ds_all_array", Windows.fun_2array($"ptac_Ds_all"))
//
//            .withColumn("ptac_DssAll", sqrt(($"ptac_Ds_all" - $"ptac_DsmAll" * $"ptac_DsmAll")/$"charge_id") )
//            .drop("ptac_Ds_all_array")
//            .withColumn("ptac_DmmAll_abs", abs($"ptac_DmmAll"))
//            .withColumn("ptac_Dmm10_abs", abs($"ptac_Dmm10"))
//            .withColumn("ptac_Dm_abs", abs($"ptac_Dm"))
//            .filter($"ptac_Dm".isNotNull)

//            .drop("current_ptac_Dm_null","histroy_ptac_Dm_10_null","ptac_Dm_10","current_ptac_Ds_null","histroy_ptac_Ds_10_null","ptac_Ds_10","histroy_ptac_Dm_all_num_null","histroy_ptac_Ds_all_num_null",
//              "ptac_Dm_all_num","ptac_Ds_all_num","histroy_ptac_Dm_all_null","histroy_ptac_Ds_all_null","ptac_Dm_all","ptac_Ds_all")

//            .withColumn("d_T_med", abs($"probe_temp_after_charge_median" - $"probe_temp_before_charge_median"))

//            .withColumn("temp_raising_rate", Windows.temp_raising_rate($"ptac_D", $"ptbc_D", $"d_T_med"))

          val b = DF
//            .withColumn("temp_high_range", $"temp_high_max" - $"temp_high_evcc_min")
//            .withColumn("temp_low_range", $"temp_low_evcc_max" - $"temp_low_min")
//            .withColumn("temp_mean_before_charge", Windows.fun_mean($"probe_temp_before_charge_array"))
//            .withColumn("temp_std_before_charge", Windows.fun_std($"probe_temp_before_charge_array"))
//            .withColumn("temp_max_before_charge", Windows.fun_max($"probe_temp_before_charge_array"))
//            .withColumn("temp_min_before_charge", Windows.fun_min($"probe_temp_before_charge_array"))
//            .withColumn("probe_temp_before_charge_diff", $"temp_max_before_charge" - $"temp_min_before_charge")
//            .withColumn("temp_mean_after_charge", Windows.fun_mean($"probe_temp_after_charge_array"))
//            .withColumn("temp_std_after_charge", Windows.fun_std($"probe_temp_after_charge_array"))
//            .withColumn("temp_max_after_charge", Windows.fun_max($"probe_temp_after_charge_array"))
//            .withColumn("temp_min_after_charge", Windows.fun_min($"probe_temp_after_charge_array"))
//            .withColumn("probe_temp_after_charge_diff", $"temp_max_after_charge" - $"temp_min_after_charge")
//            .withColumn("cell_volt_rise", Windows.fun_rise($"cell_volt_before_charge_array", $"cell_volt_after_charge_array"))
//            .withColumn("volt_mean_before_charge", Windows.fun_mean($"cell_volt_before_charge_array"))
//            .withColumn("probe_temp_rise", Windows.fun_rise($"probe_temp_before_charge_array", $"probe_temp_after_charge_array"))
//            .withColumn("volt_std_before_charge", Windows.fun_std($"cell_volt_before_charge_array"))
//            .withColumn("volt_mean_after_charge", Windows.fun_mean($"cell_volt_after_charge_array"))
//            .withColumn("volt_std_after_charge", Windows.fun_std($"cell_volt_after_charge_array"))
//            .withColumn("cell_volt_after_charge_diff", Windows.fun_max($"cell_volt_after_charge_array") - Windows.fun_min($"cell_volt_after_charge_array"))
//            .withColumn("end_diff_among_max", Windows.fun_max_diff_with_mean($"cell_volt_after_charge_array"))
//            .withColumn("end_diff_among_min", Windows.fun_min_diff_with_mean($"cell_volt_after_charge_array"))
            .withColumn("Dod", lag("end_soc", 1, Double.NaN).over(Windows.w_chargeid) - $"start_soc")
//            .withColumn("diff_volt_between_max_and_end", $"max_diff_with_mean" - $"end_diff_among_max")
//            .withColumn("charge_freq", $"charge_id" * 1000 / $"miles")
//            .withColumn("fullCharge", when($"end_soc" > 99, 1).otherwise(0))
//            .withColumn("dvdiffdt", when($"end_soc" > 99, $"cell_volt_diff_max" / ($"end_time" - $"first_starttime")).otherwise(Double.NaN))
//            .withColumnRenamed("cell_volt_before_charge_array","cell_volt_before_charge")
//            .withColumnRenamed("cell_volt_after_charge_array","cell_volt_after_charge")
//            .withColumnRenamed("probe_temp_before_charge_array","probe_temp_before_charge")
//            .withColumnRenamed("probe_temp_after_charge_array","probe_temp_after_charge")
            .withColumn("charge_interval", $"end_time" - $"start_time")
            .withColumn("charge_type", Windows.charge_type($"charge_interval", $"charge_soc"))
            .withColumn("start_time_str", ctime2date($"start_time"))
            .withColumn("end_time_str", ctime2date($"end_time"))

          b
      }
      .reduce(_ union _)

    //    val w_10= Window
    //      .partitionBy("vin")
    //
    //    val w_101 = fDF
    //      .filter($"fullCharge" === 1)
    //      .select("vin","charge_id","fullCharge","cell_volt_diff_max","first_starttime")
    //
    //    w_101
    //      .createOrReplaceTempView("data")
    //
    //    val vdiff_10pre = spark.sql(
    //      """SELECT * from data order by charge_id limit 10
    //      """.stripMargin)
    //      .groupBy("vin")
    //      .agg(
    //        mean("cell_volt_diff_max").as("vdiff_10pre"),
    //        first("first_starttime").as("first_charge_time")
    //      )
    //
    //
    //    vdiff_10pre.write.mode(SaveMode.Overwrite).parquet(vdiff_10pre_output)



    val chargedf = fDF.select("vin", "charge_id", "start_time", "end_time", "start_time_str", "end_time_str", "charge_interval", "start_soc", "end_soc", "Dod")

    chargedf.withColumn("id", $"vin").repartition($"id").write.mode(SaveMode.Overwrite).partitionBy("id").parquet(chargeSliceOutput)

    // Note: probe_mean_median is omitted here because the aggregation that produced
    // it is commented out above; selecting it would fail analysis.
    val charge_table = fDF.select("vin", "charge_id", "start_time", "end_time", "charge_interval", "charge_type", "charge_soc", "curr_max", "curr_min", "curr_mean", "start_soc", "end_soc", "temp_high_max", "temp_low_min",
      "probe_diff_max", "volt_max", "volt_min", "volt_mean", "miles", "power_max", "power_min", "power_mean", "Ah")

    // Write to the database
    writeSQL(charge_table,sql_url,user,password,table)
  }
}

FirstChargeSlice is the tested new version of the charge slicer; it drops many of the fields above.

package BatteryCareSparkJob.chargeslice

import java.text.SimpleDateFormat
import java.util.{Date, Properties}

import BatteryCareSparkJob.chargeslice
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import org.apache.spark.sql.functions._

/**
  * @Author: Houzhq
  * @Date: 2019/8/6 11:20
  * @Version 1.0
  * @Note
  */
object FirstChargeSlice {

  def main(args: Array[String]): Unit = {

    def writeSQL(df:DataFrame, sql_url:String, user:String, password:String, table:String): Unit ={

      //val sql_url="jdbc:mysql://10.170.245.98:3306/bigdata?useUnicode=true&characterEncoding=utf8"
      val prop=new Properties()
      prop.setProperty("user", user) //"root"
      prop.setProperty("password", password) //"reachauto"
      prop.setProperty("driver","com.mysql.jdbc.Driver")
      df.write.mode(SaveMode.Append).jdbc(sql_url,table,prop)
    }


    def initializationSpark(app_name:String): SparkSession ={
      val sparkconf = new SparkConf()
        .set( "spark.serializer", "org.apache.spark.serializer.KryoSerializer" )

      val spark = SparkSession
        .builder()
        .appName(app_name)
        .config(sparkconf)
        //      .master("local[*]")
        .getOrCreate()
      spark
    }

    val spark = initializationSpark("charge_slices")
    import spark.implicits._


    def tranTimeToString(tm:String) :String={
      val fm = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
      val tim = fm.format(new Date(tm.toLong*1000))
      tim
    }
    val ctime2date = udf((ctime:String) => tranTimeToString(ctime))

    /** last2hoursNotCompleteDFInputPath: input path for data from the previous two-hour window whose charging had not finished (not yet sliced) **/
    val last2hoursNotCompleteDFInputPath = args(0)

    /** originalDataInputPath: raw input data for the current two-hour window **/
    val originalDataInputPath = args(1)

    /** batchSize: how many vehicles' raw data to read at a time **/
    val batchSize = args(2).toInt

    /** last2hoursNotCompleteDFOutputPath: output path for this window's unfinished-charge (unsliced) data; a script must cp this directory to last2hoursNotCompleteDFInputPath after each run **/
    val last2hoursNotCompleteDFOutputPath = args(3)

    /** historyIdDFInputPath: input path for the historical charge_id data **/
    val historyIdDFInputPath = args(4)

    /** historyIdDFOutputPath: output path for the historical charge_id data; a script must cp this directory to historyIdDFInputPath after each run **/
    val historyIdDFOutputPath = args(5)

    /** vdiff_10pre_output: output path for the first ten charges' data **/
    val vdiff_10pre_output = args(6)

    /** chargeSliceOutput: output path for the charge slices **/
    val chargeSliceOutput = args(7)

    val sql_url = args(8)
    val user = args(9)
    val password = args(10)
    val table = args(11)

    /** vin: vehicle identification numbers, comma-separated; optional, and when absent every vehicle found in the current two-hour window is processed **/
    val vin = args.lift(12)


    /** Load the unfinished-charge data carried over from the previous two hours **/
    val last2hoursNotCompleteDF: DataFrame = spark.read.parquet(last2hoursNotCompleteDFInputPath)
      .select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt","curr", "soc",
        "celldata_error", "probedata_error","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
      .drop("id")

    val vins = vin match {
      case None => FileSystem
        .get(spark.sparkContext.hadoopConfiguration)
        .listStatus(new Path(originalDataInputPath))
        .flatMap(p => "(?<=id\\=).+".r.findFirstIn(p.getPath.toString))

      case Some(v) => v.split(',')
    }


    val fDF = vins
      .map { v => originalDataInputPath + "id=" + v }
      .grouped(batchSize)
      .map {
        p =>

          val vinArray = p.map { l => "(?<=id=).+".r.findFirstIn(l.replace(originalDataInputPath, "")).get.toString }

          val last2hoursNotCompleteDF_vin = last2hoursNotCompleteDF.filter($"vin".isin(vinArray: _*))

          /** All data for the current two-hour window **/
          val parquet_DF: DataFrame = spark.read.parquet(p: _*)
            .select("vin", "ctime", "celldata", "probedata","miles", "charge_state_evcc", "charge_state", "volt",  "lon", "lat","curr", "soc",
              "celldata_error", "probedata_error", "temp_high", "temp_low","cell_high","cell_low","temp_high_evcc","temp_low_evcc","cell_high_evcc","cell_low_evcc" )
            .drop("lon", "lat","temp_high", "temp_low","cell_high","cell_low")

            .union(last2hoursNotCompleteDF_vin)
            .withColumn("last_charge_state",last("charge_state").over(Windows.w_vin_unboundedFollowing))


          val notCompleteDF = parquet_DF

            .filter($"last_charge_state" === 1)
            .drop("last_charge_state")
          notCompleteDF.withColumn("id", $"vin").repartition($"id").write.mode(SaveMode.Append).partitionBy("id").parquet(last2hoursNotCompleteDFOutputPath)




          val todayDF = parquet_DF.filter($"last_charge_state" =!= 1).drop("last_charge_state")



          val charge_unsliced = todayDF
            .withColumn("time_interval_pre", $"ctime" - lag("ctime",1,0).over(Windows.w_vin))
            .withColumn("time_interval_post", $"ctime" - lead("ctime", 1, 0).over(Windows.w_vin))
            .filter($"charge_state" === 1)
            .withColumn("charge_num_label",
              when(($"ctime" - lag("ctime", 1, 0).over(Windows.w_vin)) > 1800, 1)
                .otherwise(0))
            .withColumn("charge_id", sum("charge_num_label").over(Windows.w_Preceding))
            .drop("charge_num_label")

            .withColumn("start_time", first("ctime").over(Windows.w_chargectime))
            .withColumn("end_time", last("ctime").over(Windows.w_chargectime))

            .withColumn("st+40",$"start_time" + 40)
            .withColumn("et-30",$"end_time" - 30)
            .withColumn("real_st",when($"ctime" >=  $"st+40" && $"ctime" <=  $"et-30",1).otherwise(0))
            .filter($"real_st" === 1)
            .drop("real_st","st+40","et-30")
            .filter($"celldata_error" === 0)
            .filter($"probedata_error" === 0)
            .drop("celldata_error","probedata_error")



          val charge_unsliced_2 = charge_unsliced
            .withColumn("celldata_array", chargeslice.Windows.fun_data2array($"celldata"))
            .withColumn("probedata_array", chargeslice.Windows.fun_probe2array($"probedata"))

            .withColumn("probe_ctime_mean", chargeslice.Windows.fun_mean($"probedata_array"))

            .drop("celldata","probedata")
            .withColumn("power", $"volt" * $"curr")
            .withColumn("curr_last", lag("curr", 1, 0).over(Windows.w_vin))
            .withColumn("Ah_each_record", ($"curr" + $"curr_last") * 0.5 * $"time_interval_pre" / 3600)
            .drop("time_interval_pre","time_interval_post","curr_last")
            .withColumn("Ah",sum("Ah_each_record").over(Windows.w_chargectime))
            .drop("Ah_each_record")
            .withColumn("start_soc", first("soc").over(Windows.w_chargectime))
            .withColumn("end_soc", last("soc").over(Windows.w_chargectime))
            .drop("soc")
            .withColumn("charge_soc", $"end_soc" - $"start_soc")
            .withColumn("temp_diff", $"temp_high_evcc" - $"temp_low_evcc")
            .drop("celldata_array")

          val charge_sliced = charge_unsliced_2.groupBy("vin", "charge_id")
            .agg(
              mean("probe_ctime_mean").as("temp_mean"),
              first("miles").as("miles"),
              first("start_time").as("start_time"),
              first("end_time").as("end_time"),
              first("Ah").as("Ah"),
              first("start_soc").as("start_soc"),
              first("end_soc").as("end_soc"),
              max("curr").as("curr_max"),
              min("curr").as("curr_min"),
              mean("curr").as("curr_mean"),
              max("volt").as("volt_max"),
              min("volt").as("volt_min"),
              mean("volt").as("volt_mean"),
              max("power").as("power_max"),
              min("power").as("power_min"),
              mean("power").as("power_mean"),
              max($"temp_high_evcc").as("temp_high_max"),
              min($"temp_low_evcc").as("temp_low_min"),
              min($"temp_high_evcc").as("temp_high_evcc_min"),
              first("charge_soc").as("charge_soc"),
              max($"temp_diff").as("probe_diff_max")
            )
            .withColumn("Current_first_starttime", first("start_time").over(Windows.w_chargeid))
            .withColumn("start_time_str", ctime2date($"start_time"))
            .withColumn("end_time_str", ctime2date($"end_time"))

          /** Load the historical charge_id data **/
          val history_id_df = spark.read.format("com.databricks.spark.csv")
            .option("header", "false").option("delimiter", ",").option("inferSchema", "true").load(historyIdDFInputPath)
            .toDF("history_vin", "history_id")


          val df = charge_sliced.join(history_id_df, charge_sliced("vin") === history_id_df("history_vin"), "outer")
            .withColumn("history_vin_null2x", when($"history_vin".isNull, "x").otherwise($"history_vin"))
            .withColumn("vin_null2x", when($"vin".isNull, "x").otherwise($"vin"))
            .withColumn("v", Windows.vin($"vin_null2x", $"history_vin_null2x"))
            .drop("history_vin", "vin", "history_vin_null2x", "vin_null2x")
            .withColumnRenamed("v", "vin")
            .withColumn("charge_id_null", when($"charge_id".isNull, 0).otherwise($"charge_id"))
            .withColumn("history_id_null", when($"history_id".isNull, 0).otherwise($"history_id"))
            .withColumn("charge_id_now", $"charge_id_null" + $"history_id_null")
            .drop("charge_id_null", "history_id_null", "charge_id", "history_id")
            .withColumnRenamed("charge_id_now", "charge_id")


          /** Store the charge_id and associated data **/
          val buffer = df.select("vin", "charge_id")
            .groupBy("vin")
            .agg(
              max("charge_id").as("charge_id")
            )
          buffer.coalesce(1).write.mode(SaveMode.Append).option("header", "false").csv(historyIdDFOutputPath)


          val b = df
            .withColumn("Dod", lag("end_soc", 1, Double.NaN).over(Windows.w_chargeid) - $"start_soc")
            .withColumn("charge_interval", $"end_time" - $"start_time")
            .withColumn("charge_type", Windows.charge_type($"charge_interval", $"charge_soc"))

          b
      }
      .reduce(_ union _)


    val chargedf = fDF.select("vin","charge_id","start_time","end_time","start_time_str","end_time_str","charge_interval","start_soc","end_soc","Dod")

    chargedf.withColumn("id",$"vin").repartition($"id").write.mode(SaveMode.Overwrite).partitionBy("id").parquet(chargeSliceOutput)


    val charge_table = fDF.select("vin", "charge_id", "start_time", "end_time", "charge_interval", "charge_type", "charge_soc", "curr_max", "curr_min", "curr_mean", "start_soc", "end_soc", "temp_high_max", "temp_low_min",
      "probe_diff_max", "volt_max", "volt_min","volt_mean","miles", "power_max", "power_min", "power_mean","Ah","temp_mean")

    // Write to the database
    writeSQL(charge_table,sql_url,user,password,table)

  }
}
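
Both jobs reference a chargeslice.Windows helper object that is not shown in this post. For reference, here is a minimal sketch of what it plausibly contains, inferred purely from how it is used above. The window frames are what the first/last/sum calls require, but the celldata/probedata string format (underscore-separated readings), the charge_type thresholds, and every other detail here are assumptions, not the real implementation.

package BatteryCareSparkJob.chargeslice

import org.apache.spark.sql.expressions.{Window, WindowSpec}
import org.apache.spark.sql.functions.udf

object Windows {

  // Per-vehicle window ordered by time (lag/lead run over it).
  val w_vin: WindowSpec = Window.partitionBy("vin").orderBy("ctime")

  // Current row to end of partition, so last("charge_state") yields the
  // vehicle's final state in this batch.
  val w_vin_unboundedFollowing: WindowSpec = Window.partitionBy("vin").orderBy("ctime")
    .rowsBetween(Window.currentRow, Window.unboundedFollowing)

  // Running frame from the start of the partition; the cumulative sum over it
  // turns the gap flags into charge_id.
  val w_Preceding: WindowSpec = Window.partitionBy("vin").orderBy("ctime")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

  // One whole charge session, ordered by time, with an unbounded frame so
  // first/last/sum see the entire session.
  val w_chargectime: WindowSpec = Window.partitionBy("vin", "charge_id").orderBy("ctime")
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

  // All sessions of one vehicle, ordered by charge_id (lag for Dod).
  val w_chargeid: WindowSpec = Window.partitionBy("vin").orderBy("charge_id")

  // Coalesce the vin from either side of the outer join; "x" stands in for null.
  val vin = udf((v: String, hv: String) => if (v != "x") v else hv)

  // Classify a session; the interval/SOC thresholds here are placeholders.
  val charge_type = udf((interval: Long, chargeSoc: Double) =>
    if (interval < 7200 && chargeSoc > 50) "fast" else "slow")

  // Parse the raw celldata/probedata strings into numeric arrays; the
  // underscore delimiter is a guess at the upstream format.
  val fun_data2array = udf((s: String) => s.split("_").map(_.toDouble))
  val fun_probe2array = udf((s: String) => s.split("_").map(_.toDouble))

  // Mean of an array column.
  val fun_mean = udf((a: Seq[Double]) => if (a.isEmpty) Double.NaN else a.sum / a.size)
}

Only the members the active code paths use are sketched; the commented-out paths reference more helpers (fun_std, fun_median, PercentileApprox, etc.) that are omitted here.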

 
