spark解析嵌套JSON数组
一 数据示例
数据是带有时间戳的json数组
数据格式: xxx|[{},{}]
1610352196000|[{"cp_game_id":1658,"category":"cp_api","event":{"event_time":1610348596000,"event_name":"dungeon_flow"},"data":{"role_name":"xiaohao","role_vip":10,"dungeon_type":"主线关卡","dungeon_id":10916,"dungeon_name":"关卡23-24","chapter_id":33,"chapter_name":"23. 异化之地"}},{"cp_game_id":1658,"category":"cp_api","event":{"event_time":1610350804000,"event_name":"dungeon_flow"},"data":{"role_name":"我们一样酷","role_vip":8,"dungeon_type":"主线关卡","dungeon_id":10911,"dungeon_name":"关卡23-19","chapter_id":33,"chapter_name":"23. 异化之地"}}]
读取
val tmp = sc.textFile("in/test_Json.log")
tmp.foreach(println) //打印数据
二 拆分数据
json数组可根据 },{ 来切分数据。
分为两个步骤:
1 实现将 },{ 替换成 }\n timeServer| {
2 再扁平化数据
object Json_DataFrame {
  /**
   * Parses lines of the form "timestamp|[{json},{json},...]": each JSON array
   * element is turned into its own "timestamp|{json}" row and shown as a
   * single-column DataFrame.
   */
  def main(args: Array[String]): Unit = {
    val start = System.currentTimeMillis()
    // Initialize the SparkSession (local mode) via the project helper.
    val spark = SparkUtil.getSparkSession(this.getClass.getSimpleName, Constants.SPARK_LOCAL_MODE)
    val sc = spark.sparkContext

    // Read the raw input: one "timestamp|[{...},{...}]" record per line.
    val tmp = sc.textFile("in/test_Json.log")
    tmp.foreach(println) // NOTE(review): on a real cluster this prints on executors, not the driver

    // Implicit conversions for .toDF() and column syntax.
    import org.apache.spark.sql.functions._
    import spark.implicits._

    /**
     * Split the JSON array: replace "},{"  with "}\n<timeServer>|{" so each
     * array element becomes its own "<timeServer>|{...}" line, then flatten
     * on "\n".
     */
    val df = tmp.map(
      line => {
        // limit = 2: split only on the FIRST '|' so a '|' character inside
        // the JSON payload cannot corrupt the timestamp/payload split.
        val data = line.split("\\|", 2)
        val timeServer = data(0)
        val dataStr = data(1).replaceAll("\\]|\\[", "") // strip the surrounding [ ]
        val result = timeServer + "|" + dataStr
        // Re-prefix every array element with the record's timestamp.
        result.replaceAll("\\}\\,\\{", s"\\}\\\n$timeServer\\|\\{")
      })
      .flatMap(_.split("\n")).toDF()
    df.show(false) // print the flattened rows

    // Stop the session (also stops the underlying SparkContext) so resources
    // are released before the program exits.
    spark.stop()

    val end = System.currentTimeMillis()
    println(s"=================== 耗时: ${(end - start) / 1000} 秒 ===================")
  }
}
单条数据被拆分出来
再通过map算子将时间戳timeServer提取出来就可以
// Split each "timeServer|{json}" line on '|' and pair the timestamp with the
// JSON payload; name the resulting DataFrame columns.
.map(
line => {
// NOTE(review): split has no limit — this assumes the JSON payload itself
// never contains a '|' character; otherwise prefer split("\\|", 2).
val data = line.split("\\|")
(data(0), data(1))
}).toDF("timeServer", "value")
三 拆分嵌套子json
使用get_json_object获取json里面字段的值
object Json_DataFrame {
  /**
   * Parses lines of the form "timestamp|[{json},{json},...]": flattens the
   * array into one "timestamp|{json}" row per element, then uses
   * get_json_object to extract top-level and nested JSON fields.
   */
  def main(args: Array[String]): Unit = {
    val start = System.currentTimeMillis()
    // Initialize the SparkSession (local mode) via the project helper.
    val spark = SparkUtil.getSparkSession(this.getClass.getSimpleName, Constants.SPARK_LOCAL_MODE)
    val sc = spark.sparkContext

    // Read the raw input: one "timestamp|[{...},{...}]" record per line.
    val tmp = sc.textFile("in/test_Json.log")

    // Implicit conversions for column syntax ($"...") and .toDF().
    import org.apache.spark.sql.functions._
    import spark.implicits._

    /**
     * Split the JSON array: replace "},{"  with "}\n<timeServer>|{" so each
     * array element becomes its own "<timeServer>|{...}" line, flatten on
     * "\n", then lift into a (timeServer, value) DataFrame.
     */
    val df = tmp.map(
      line => {
        // limit = 2: split only on the FIRST '|' so a '|' character inside
        // the JSON payload cannot corrupt the timestamp/payload split.
        val data = line.split("\\|", 2)
        val timeServer = data(0)
        val dataStr = data(1).replaceAll("\\]|\\[", "") // strip the surrounding [ ]
        val result = timeServer + "|" + dataStr
        // Re-prefix every array element with the record's timestamp.
        result.replaceAll("\\}\\,\\{", s"\\}\\\n$timeServer\\|\\{")
      })
      .flatMap(_.split("\n"))
      .map(
        line => {
          val data = line.split("\\|", 2) // limit = 2, same reason as above
          (data(0), data(1))              // (timestamp, single JSON object string)
        }).toDF("timeServer", "value")

    // Pull the top-level fields out of each JSON object; "event" and "data"
    // remain nested JSON strings at this stage.
    val jsonDf = df.select(
      $"timeServer",
      get_json_object($"value", "$.cp_game_id").alias("cp_game_id"),
      get_json_object($"value", "$.event").alias("event"),
      get_json_object($"value", "$.data").alias("data")
    )
    jsonDf.show(false)

    // Extract the fields of the nested "event" JSON with get_json_object.
    val resultDf = jsonDf.select(
      $"timeServer",
      $"cp_game_id",
      get_json_object($"event", "$.event_time").alias("event_time"),
      get_json_object($"event", "$.event_name").alias("event_name"),
      $"data" // kept as raw JSON; extract its sub-fields the same way if needed
    )
    resultDf.show()

    // spark.stop() also stops the underlying SparkContext, so a separate
    // sc.stop() is unnecessary.
    spark.stop()

    val end = System.currentTimeMillis()
    println(s"=================== 耗时: ${(end - start) / 1000} 秒 ===================")
  }
}
效果
到这里,就完成了特殊JSON数组的解析。