Spark高级操作之json复杂和嵌套数据结构的操作二_mob604756ed02fe的技术博客_51CTO博客
SparkSql处理嵌套json数据_aitiejue5146的博客-CSDN博客
spark sql操作复杂嵌套数组 json数据_mys_35088的博客-CSDN博客
Spark高级操作之json复杂和嵌套数据结构的操作二 - 云+社区 - 腾讯云 (tencent.com)
代码示例:
package com.bosch.xlong.dwd
import com.bosch.xlong.dwd.utils._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{MapType, StringType}
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
import scala.collection.mutable.ListBuffer
object VehicleStreamToDailyCsv {

  /**
   * Entry point. Mounts the ods (input) and dwd (output) Data Lake containers
   * under /mnt when they are not already mounted, then exports yesterday's
   * vehicle stream data to per-vehicle daily CSV files.
   */
  def main(args: Array[String]): Unit = {
    val inputPath: String = "/mnt/stvcdpbatchvehicledev/ods/ods_vehicleStream_deltaTable/"
    val outPutPath: String = "/mnt/stvcdpbatchvehicledev/dwd/dwd_vehicle_csv"
    // Storage-account mount settings (the access key is redacted here;
    // NOTE(review): move it to a secret scope rather than source code).
    val storageAccountName: String = "stvcdpbatchvehicledev"
    val storageAccountKey: String = "******"
    val inputContainer: String = "ods"
    val outputContainer: String = "dwd"
    if (!CheckPathExists(inputPath)) MountDatalakeContainer(storageAccountName, storageAccountKey, inputContainer)
    println(s"---->> mount $inputPath successful <<----")
    if (!CheckPathExists(outPutPath)) MountDatalakeContainer(storageAccountName, storageAccountKey, outputContainer)
    println(s"---->> mount $outPutPath successful <<----")
    VehicleToDailyCsv(inputPath)
  }

  /**
   * Reads yesterday's records from the delta table at `inputPath`, flattens
   * the nested JSON payload (`body`, then the inner `body.data` object) into
   * flat columns, and writes one header-ed, single-part CSV per vehicle that
   * was active yesterday, under the dwd vehicle-csv mount.
   *
   * @param inputPath mount path of the ods vehicle-stream delta table
   */
  def VehicleToDailyCsv(inputPath: String): Unit = {
    // NOTE(review): master("local[*]") is hard-coded; on a Databricks cluster
    // this should normally be left to the environment — confirm intent.
    val spark: SparkSession = SparkSession.builder().appName("ccuPoc").master("local[*]").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    // Yesterday's partition filters and date string.
    // NOTE(review): getYesterdayPath is assumed to return SQL predicate
    // fragments (e.g. "year = 2023") usable in both OPTIMIZE ... WHERE and
    // DataFrame.where — confirm against the utils package.
    val (yesterdayYear, yesterdayMonth, yesterdayDay) = getYesterdayPath
    val yesterdayDate: String = getYesterdayDate
    // Step 1: compact yesterday's small streaming files and co-locate rows
    // by createAt for faster downstream scans.
    // BUG FIX: the original concatenation produced "delta.`path`WHERE ..."
    // with no space between the quoted table name and the WHERE keyword.
    spark.sql(
      s"OPTIMIZE delta.`$inputPath` " +
        s"WHERE $yesterdayYear AND $yesterdayMonth AND $yesterdayDay " +
        s"ZORDER BY (createAt)")
    println(s"---->> 合并昨天小文件:$yesterdayYear/$yesterdayMonth/$yesterdayDay/ 合并成功<<----")
    import spark.implicits._
    // Step 2: determine which vehicles actually produced data yesterday.
    val vehicleNameList: ListBuffer[String] = GetVehicleName(inputPath)
    val dailyTrueVehicleNameFilePathList: ListBuffer[String] =
      GetDailyVehicleName(inputPath, vehicleNameList, yesterdayYear, yesterdayMonth, yesterdayDay)
    println(s"---->> Active vehicles yesterday :$dailyTrueVehicleNameFilePathList <<----")
    // Step 3: per vehicle, flatten the JSON payload and write a daily CSV.
    for (vehicleName <- dailyTrueVehicleNameFilePathList) {
      val outputPath: String =
        s"/mnt/stvcdpbatchvehicledev/dwd/dwd_vehicle_csv/vehicleName=$vehicleName/$yesterdayYear/$yesterdayMonth/$yesterdayDay"
      // NOTE(review): vehicleName is spliced unescaped into a SQL predicate.
      // Values originate from the table itself, but a name containing a
      // single quote would break the filter — consider $"vehicleName" === ....
      val ccuDeltaDF: DataFrame = spark.read.format("delta").load(inputPath)
        .where(s"vehicleName = '$vehicleName'")
        .where(yesterdayYear)
        .where(yesterdayMonth)
        .where(yesterdayDay)
        .select(get_json_object($"body".cast("string"), "$").alias("body"))
      // Flatten the top-level body JSON: parse it as Map[String, String],
      // collect the distinct keys, and promote each key to a column.
      val bodyColumn: Column = from_json($"body", MapType(StringType, StringType))
      val bodyKeys: Array[String] = ccuDeltaDF
        .select(explode(map_keys(bodyColumn)))
        .distinct()
        .collect()
        .map(_.get(0).toString)
        .sorted
      val bodyKeyCols: Array[Column] = bodyKeys.map(k => bodyColumn.getItem(k).as(k))
      // Derive partition-style date parts from the "createAt" timestamp
      // string (positions assume an ISO-like "yyyy-MM-ddTHH..." layout).
      val bodyDataframe: DataFrame = ccuDeltaDF.select(bodyKeyCols: _*)
        .withColumn("createAtYear", substring($"createAt", 1, 4))
        .withColumn("createAtMonth", substring($"createAt", 6, 2))
        .withColumn("createAtDay", substring($"createAt", 9, 2))
        .withColumn("createAtHour", substring($"createAt", 12, 2))
      // Flatten the nested body.data JSON the same way.
      val dataColumn: Column = from_json($"data", MapType(StringType, StringType))
      val dataKeys: Array[String] = bodyDataframe
        .select(explode(map_keys(dataColumn)))
        .distinct()
        .collect()
        .map(_.get(0).toString)
        .sorted
      val dataKeyCols: Array[Column] = dataKeys.map(k => dataColumn.getItem(k).as(k))
      val outputDataframe: DataFrame = bodyDataframe.select($"*" +: dataKeyCols: _*)
      // Drop raw/redundant and location-sensitive columns before export.
      val colsToRemove: List[String] = List("rawData", "data", "latitude", "longitude", "gpsDirection", "gpsSpeed")
      val cleanedOutputDataframe: DataFrame = outputDataframe.drop(colsToRemove: _*)
      cleanedOutputDataframe
        .sort("createAt")
        .coalesce(1) // single part file so the rename in Step 4 is well-defined
        .write
        // NOTE(review): "mergeSchema" is a Delta/Parquet option and is
        // ignored by the CSV writer — kept for parity, likely removable.
        .option("mergeSchema", "true")
        .option("header", "true")
        .format("csv")
        .save(outputPath)
      println(s"---->> 成功生成 ${yesterdayDate} 车辆: ${vehicleName}.csv<<----")
      // Step 4: rename Spark's part-xxxx output file to a stable name.
      val beforeModifyFilePath: String = outputPath + "/" + GetOutputCsvFileName(outputPath)
      val afterModifyFilePath: String = s"$outputPath/vehicle_${vehicleName}_${yesterdayDate}.csv"
      ModifyCsvFileName(beforeModifyFilePath, afterModifyFilePath)
      println(s"---->> 成功重命名 /vehicle_${vehicleName}_${yesterdayDate}.csv <<----")
    }
  }
}