Spark解析json文件
现在有一个文件名为 op.log 的json文件内容是:
1593136280858|{"cm":{"ln":"-55.0","sv":"V2.9.6","os":"8.0.4","g":"C6816QZ0@gmail.com","mid":"489","nw":"3G","l":"es","vc":"4","hw":"640*960","ar":"MX","uid":"489","t":"1593123253541","la":"5.2","md":"sumsung-18","vn":"1.3.4","ba":"Sumsung","sr":"I"},"ap":"app","et":[{"ett":"1593050051366","en":"loading","kv":{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}},{"ett":"1593108791764","en":"ad","kv":{"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}},{"ett":"1593111271266","en":"notification","kv":{"ap_time":"1593097087883","action":"1","type":"1","content":""}},{"ett":"1593066033562","en":"active_background","kv":{"active_source":"3"}},{"ett":"1593135644347","en":"comment","kv":{"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}}]}
1593136280858|{"cm":{"ln":"-114.9","sv":"V2.7.8","os":"8.0.4","g":"NW0S962J@gmail.com","mid":"490","nw":"3G","l":"pt","vc":"8","hw":"640*1136","ar":"MX","uid":"490","t":"1593121224789","la":"-44.4","md":"Huawei-8","vn":"1.0.1","ba":"Huawei","sr":"O"},"ap":"app","et":[{"ett":"1593063223807","en":"loading","kv":{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"}},{"ett":"1593095105466","en":"ad","kv":{"activityId":"1","displayMills":"1966","entry":"3","action":"2","contentType":"0"}},{"ett":"1593051718208","en":"notification","kv":{"ap_time":"1593095336265","action":"2","type":"3","content":""}},{"ett":"1593100021275","en":"comment","kv":{"p_comment_id":4,"addtime":"1593098946009","praise_count":220,"other_id":4,"comment_id":9,"reply_count":151,"userid":4,"content":"抄应螟皮釉倔掉汉蛋蕾街羡晶"}},{"ett":"1593105344120","en":"praise","kv":{"target_id":9,"id":7,"type":1,"add_time":"1593098545976","userid":8}}]}
1.找json在线解析看一下结构,是这样的
1593136280858|{
"cm": {
"ln": "-55.0",
"sv": "V2.9.6",
"os": "8.0.4",
"g": "C6816QZ0@gmail.com",
"mid": "489",
"nw": "3G",
"l": "es",
"vc": "4",
"hw": "640*960",
"ar": "MX",
"uid": "489",
"t": "1593123253541",
"la": "5.2",
"md": "sumsung-18",
"vn": "1.3.4",
"ba": "Sumsung",
"sr": "I"
},
"ap": "app",
"et": [{
"ett": "1593050051366",
"en": "loading",
"kv": {
"extend2": "",
"loading_time": "14",
"action": "3",
"extend1": "",
"type": "2",
"type1": "201",
"loading_way": "1"
}
}, {
"ett": "1593108791764",
"en": "ad",
"kv": {
"activityId": "1",
"displayMills": "78522",
"entry": "1",
"action": "1",
"contentType": "0"
}
}, {
"ett": "1593111271266",
"en": "notification",
"kv": {
"ap_time": "1593097087883",
"action": "1",
"type": "1",
"content": ""
}
}, {
"ett": "1593066033562",
"en": "active_background",
"kv": {
"active_source": "3"
}
}, {
"ett": "1593135644347",
"en": "comment",
"kv": {
"p_comment_id": 1,
"addtime": "1593097573725",
"praise_count": 973,
"other_id": 5,
"comment_id": 9,
"reply_count": 40,
"userid": 7,
"content": "辑赤蹲慰鸽抿肘捎"
}
}]
}
2.spark 读取json并转成DataFrame进行数据表格化 进而分析数据
spark-shell版本
//加载存储在hdfs上的json文件
scala> val fileRDD=sc.textFile("hdfs://192.168.116.60:9000/kb09file/op.log")
//按照"|" 切割
scala> val jsonStrRDD=fileRDD.map(x=>x.split('|')).map(x=>(x(0),x(1)))
//将手机号作为id保留放在最后
scala> val jsonRDD=jsonStrRDD.map(x=>{var jsonStr=x._2;jsonStr
= jsonStr.substring(0,jsonStr.length-1);jsonStr+",\"id\":\""+x._1+"\"}"}) //加双引号才是json字符串 很重要
//变成DataFrame 进行后续操作
scala> val jsonDF=jsonRDD.toDF
scala> jsonDF.printSchema
root
|-- value: string (nullable = true)
//导入包 便于使用get_json_object函数
scala> import spark.implicits._
scala> import org.apache.spark.sql.functions._
scala> import org.apache.spark.sql._
scala> import org.apache.spark.sql.types._
scala> jsonDF.select(get_json_object($"value","$.cm").alias("cm")).printSchema
root
|-- cm: string (nullable = true)
scala> jsonDF.select(get_json_object($"value","$.cm").alias("cm")).show
+--------------------+
| cm|
+--------------------+
|{"ln":"-55.0","sv...|
|{"ln":"-114.9","s...|
+--------------------+
scala> jsonDF.select(get_json_object($"value","$.cm").alias("cm"),get_json_object($"value","$.ap").alias("ap"),
get_json_object($"value","$.et").alias("et"),get_json_object($"value","$.id").alias("id")).show()
+--------------------+---+--------------------+-------------+
| cm| ap| et| id|
+--------------------+---+--------------------+-------------+
|{"ln":"-55.0","sv...|app|[{"ett":"15930500...|1593136280858|
|{"ln":"-114.9","s...|app|[{"ett":"15930632...|1593136280858|
+--------------------+---+--------------------+-------------+
//将json字符串结构化
//表头 cm ap et id
//列 a1 b1 c1 d1
scala> val jsonDF2 = jsonDF.select(get_json_object($"value","$.cm").alias("cm"),get_json_object($"value","$.ap").alias("ap")
,get_json_object($"value","$.et").alias("et"),get_json_object($"value","$.id").alias("id"))
scala> jsonDF2.printSchema
root
|-- cm: string (nullable = true)
|-- ap: string (nullable = true)
|-- et: string (nullable = true)
|-- id: string (nullable = true)
scala> jsonDF2.select($"id",$"ap",$"cm",$"et").show
+-------------+---+--------------------+--------------------+
| id| ap| cm| et|
+-------------+---+--------------------+--------------------+
|1593136280858|app|{"ln":"-55.0","sv...|[{"ett":"15930500...|
|1593136280858|app|{"ln":"-114.9","s...|[{"ett":"15930632...|
+-------------+---+--------------------+--------------------+
scala> val jsonDF3 = jsonDF2.select($"id",$"ap",get_json_object($"cm","$.ln") alias("ln"),
get_json_object($"cm","$.sv") alias("sv"),get_json_object($"cm","$.os") alias("os"),
get_json_object($"cm","$.g") alias("g"),get_json_object($"cm","$.mid") alias("mid"),
get_json_object($"cm","$.nw") alias("nw"),get_json_object($"cm","$.l") alias("l"),
get_json_object($"cm","$.vc") alias("vc"),get_json_object($"cm","$.hw") alias("hw"),
get_json_object($"cm","$.ar") alias("ar"),get_json_object($"cm","$.uid") alias("uid"),
get_json_object($"cm","$.t") alias("t"),get_json_object($"cm","$.la") alias("la"),
get_json_object($"cm","$.md") alias("md"),get_json_object($"cm","$.vn") alias("vn"),
get_json_object($"cm","$.ba") alias("ba"),get_json_object($"cm","$.sr") alias("sr"),$"et")
scala> jsonDF3.printSchema
root
|-- id: string (nullable = true)
|-- ap: string (nullable = true)
|-- ln: string (nullable = true)
|-- sv: string (nullable = true)
|-- os: string (nullable = true)
|-- g: string (nullable = true)
|-- mid: string (nullable = true)
|-- nw: string (nullable = true)
|-- l: string (nullable = true)
|-- vc: string (nullable = true)
|-- hw: string (nullable = true)
|-- ar: string (nullable = true)
|-- uid: string (nullable = true)
|-- t: string (nullable = true)
|-- la: string (nullable = true)
|-- md: string (nullable = true)
|-- vn: string (nullable = true)
|-- ba: string (nullable = true)
|-- sr: string (nullable = true)
|-- et: string (nullable = true)
scala> jsonDF3.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| et|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[{"ett":"15930500...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[{"ett":"15930632...|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
//from_json把字符串
[
{"ett":"a1","en":"a2","kv":"a3"},
{"ett":"b1","en":"b2","kv":"b3"},
{"ett":"c1","en":"c2","kv":"c3"}
] 结构化
ett en kv
a1 a2 a3
b1 b2 b3
c1 c2 c3
val jsonDF4 = jsonDF3.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
from_json($"et",ArrayType(StructType(StructField("ett",StringType)::StructField("en",StringType)::StructField("kv",StringType)::Nil))).alias("event"))
jsonDF4大脑想到的结构是这样:
id ap mid nw *** event
1593136280858 app 12345 dsdwe **** [[ett en kv], [b1 b2 b3],[c1 c2 c3] ]
id ap mid nw *** ett en kv
1593136280858 app 12345 dsdwe **** a1 a2 a3
1593136280858 app 12345 dsdwe **** b1 b2 b3
1593136280858 app 12345 dsdwe **** c1 c2 c3
------------思考结束-----------------
开干。。。。。
第一步
id ap mid nw *** event
1593136280858 app 12345 dsdwe **** [[ett en kv], [b1 b2 b3],[c1 c2 c3] ]
搞成
id ap mid nw *** event
1593136280858 app 12345 dsdwe **** [ett en kv]
1593136280858 app 12345 dsdwe **** [b1 b2 b3]
1593136280858 app 12345 dsdwe **** [c1 c2 c3]
scala> val jsonDF5 = jsonDF4.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",explode($"event").alias("event"))
第二步
id ap mid nw *** ett en kv
1593136280858 app 12345 dsdwe **** a1 a2 a3
1593136280858 app 12345 dsdwe **** b1 b2 b3
1593136280858 app 12345 dsdwe **** c1 c2 c3
scala> val jsonDF5 = jsonDF4.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
explode($"event").alias("event"))
scala> val jsonDF6 = jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
($"event.kv").alias("kv"),($"event.ett").alias("ett"),($"event.en").alias("en"))
scala> jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en"
,$"kv").filter($"en"==="loading").show(false)
scala> val loadingDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr"
,$"ett",$"en",$"kv").filter($"en"==="loading")
scala> loadingDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.extend2") alias("extend2"),get_json_object($"kv","$.loading_time") alias("loading_time"),get_json_object($"kv","$.action") alias("action"),get_json_object($"kv","$.extend1") alias("extend1"),get_json_object($"kv","$.type") alias("type"),get_json_object($"kv","$.type1") alias("type1"),get_json_object($"kv","$.loading_way") alias("loading_way")).printSchema
root
|-- id: string (nullable = true)
|-- ap: string (nullable = true)
|-- ln: string (nullable = true)
|-- sv: string (nullable = true)
|-- os: string (nullable = true)
|-- g: string (nullable = true)
|-- mid: string (nullable = true)
|-- nw: string (nullable = true)
|-- l: string (nullable = true)
|-- vc: string (nullable = true)
|-- hw: string (nullable = true)
|-- ar: string (nullable = true)
|-- uid: string (nullable = true)
|-- t: string (nullable = true)
|-- la: string (nullable = true)
|-- md: string (nullable = true)
|-- vn: string (nullable = true)
|-- ba: string (nullable = true)
|-- sr: string (nullable = true)
|-- ett: string (nullable = true)
|-- en: string (nullable = true)
|-- extend2: string (nullable = true)
|-- loading_time: string (nullable = true)
|-- action: string (nullable = true)
|-- extend1: string (nullable = true)
|-- type: string (nullable = true)
|-- type1: string (nullable = true)
|-- loading_way: string (nullable = true)
scala> val loadingDF1 = loadingDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
$"sr",$"ett",$"en",get_json_object($"kv","$.extend2") alias("extend2"),get_json_object($"kv","$.loading_time") alias("loading_time"),get_json_object($"kv","$.action") alias("action"),get_json_object($"kv","$.extend1") alias("extend1"),get_json_object($"kv","$.type") alias("type"),get_json_object($"kv","$.type1") alias("type1"),get_json_object($"kv","$.loading_way") alias("loading_way"))
scala> loadingDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| ett| en|extend2|loading_time|action|extend1|type|type1|loading_way|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593050051366|loading| | 14| 3| | 2| 201| 1|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|1593063223807|loading| | 0| 3| | 1| 102| 1|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+
scala> val adDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="ad")
scala> val adDF1 = adDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.activityId") alias("activityId"),get_json_object($"kv","$.displayMills") alias("displayMills"),get_json_object($"kv","$.entry") alias("entry"),get_json_object($"kv","$.action") alias("action"),get_json_object($"kv","$.contentType") alias("contentType"))
scala> adDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| ett| en|activityId|displayMills|entry|action|contentType|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593108791764| ad| 1| 78522| 1| 1| 0|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|1593095105466| ad| 1| 1966| 3| 2| 0|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
scala> val notificationDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="notification")
scala> val notificationDF1 = notificationDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.ap_time") alias("ap_time"),get_json_object($"kv","$.action") alias("action"),get_json_object($"kv","$.type") alias("type"),get_json_object($"kv","$.content") alias("content"))
scala> notificationDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| ett| en| ap_time|action|type|content|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593111271266|notification|1593097087883| 1| 1| |
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|1593051718208|notification|1593095336265| 2| 3| |
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
scala> val active_backgroundDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="active_background")
scala> val active_backgroundDF1 = active_backgroundDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.active_source") alias("active_source"))
scala> active_backgroundDF1.show
+-------------+---+-----+------+-----+------------------+---+---+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| ett| en|active_source|
+-------------+---+-----+------+-----+------------------+---+---+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
|1593136280858|app|-55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4|640*960| MX|489|1593123253541|5.2|sumsung-18|1.3.4|Sumsung| I|1593066033562|active_background| 3|
+-------------+---+-----+------+-----+------------------+---+---+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
scala> val commentDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="comment")
scala> val commentDF1 = commentDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.p_comment_id") alias("p_comment_id"),get_json_object($"kv","$.addtime") alias("addtime"),get_json_object($"kv","$.praise_count") alias("praise_count"),get_json_object($"kv","$.other_id") alias("other_id"),get_json_object($"kv","$.comment_id") alias("comment_id"),get_json_object($"kv","$.reply_count") alias("reply_count"),get_json_object($"kv","$.userid") alias("userid"),get_json_object($"kv","$.content") alias("content"))
scala> commentDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+--------------------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| ett| en|p_comment_id| addtime|praise_count|other_id|comment_id|reply_count|userid| content|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+--------------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593135644347|comment| 1|1593097573725| 973| 5| 9| 40| 7| 辑赤蹲慰鸽抿肘捎|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|1593100021275|comment| 4|1593098946009| 220| 4| 9| 151| 4|抄应螟皮釉倔掉汉蛋蕾街羡晶|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+--------------------------+
scala> val praiseDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="praise")
scala> val praiseDF1 = praiseDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba"
,$"sr",$"ett",$"en",get_json_object($"kv","$.target_id") alias("target_id"),get_json_object($"kv","$.id") alias("id1"),
get_json_object($"kv","$.type") alias("type"),get_json_object($"kv","$.add_time") alias("add_time"),get_json_object($"kv","$.userid") alias("userid"))
scala> praiseDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| ett| en|target_id|id1|type| add_time|userid|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4|Huawei-8|1.0.1|Huawei| O|1593105344120|praise| 9| 7| 1|1593098545976| 8|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
scala> loadingDF1.join(adDF1,Seq("id","id"),"inner").join(notificationDF1,Seq("id","id"),"inner").join(active_backgroundDF1,Seq("id","id"),"inner").join(commentDF1,Seq("id","id"),"inner").show
Idea scala版本
package Json
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType
/**
* @Author Bright
* @Date 2020/11/25
* @Description
*/
object jsonToDF {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]").appName("jsonToDF").getOrCreate()
val sc = spark.sparkContext
import spark.implicits._
//加载文件RDD
val fileRDD = sc.textFile("in/op.log")
val opjsonStr = fileRDD.map(x=>x.split('|')).map(x=>(x(0),x(1)))
.map(x=>{
var jsonStr = x._2;
jsonStr = jsonStr.substring(0, jsonStr.length - 1);
jsonStr+",\"id\":\""+x._1+"\"}"})
//RDD转成DF
val opjsonDF = opjsonStr.toDF()
//opjsonDF.printSchema()
//opjsonDF.show(false)
//json最内层结构
val et = new StructType()
.add($"ett".string)
.add($"en".string)
.add($"kv".string)
//json中层结构
val cm = new StructType()
.add($"ln".string).add($"sv".string).add($"os".string).add($"g".string).add($"mid".string).add($"nw".string)
.add($"hw".string).add($"ar".string).add($"uid".string).add($"t".string).add($"la".string).add($"md".string)
.add($"vn".string).add($"ba".string).add($"sr".string)
//json 外层套中层 完成 最外层结构
val schema = new StructType()
.add($"cm".struct(cm))
.add($"ap".string)
.add($"et".array(et))
.add($"id".string)
//使用from_json方法 由json数据 与结构匹配
val frame = opjsonDF.select(from_json($"value",schema).alias("values"))
// frame.printSchema()
// frame.show(false)
//将列名数据查询出来 et层使用explode拆开并取别名为event
val frame2 = frame.select(
$"values.ap", $"values.cm.ln", $"values.cm.sv", $"values.cm.os", $"values.cm.g", $"values.cm.mid", $"values.cm.nw",
$"values.cm.hw", $"values.cm.ar", $"values.cm.uid", $"values.cm.t", $"values.cm.la", $"values.cm.md", $"values.cm.vn",
$"values.cm.ba", $"values.cm.sr",
explode($"values.et").alias("event")
)
// frame2.printSchema()
// frame2.show(false)
//将ett en kv 单独查出来
val frame3 = frame2.select(
$"ap", $"ln", $"sv", $"os", $"g", $"mid", $"nw", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr",
$"event.ett", $"event.en", $"event.kv"
)
// frame3.printSchema()
// frame3.show(false)
//过滤出来en层数据的6种情况 分别查询出来
val loadingDF = frame3.where($"en" === "loading")
.select(
$"ap", $"ln", $"sv", $"os", $"g", $"mid", $"nw", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba",
$"sr", $"ett", $"en",
get_json_object($"kv", "$.extend2").alias("extend2"),
get_json_object($"kv", "$.loading_time").alias("loading_time"),
get_json_object($"kv", "$.action").alias("action"),
get_json_object($"kv", "$.extend1").alias("extend1"),
get_json_object($"kv", "$.type").alias("type"),
get_json_object($"kv", "$.loading_way").alias("loading_way")
)
loadingDF.show(false)
//写入保存到hdfs上面 文件端口:9000 需要赋权限且文件夹不存在才行 会自动生成新文件夹
loadingDF.write.save("hdfs://192.168.116.60:9000/kb09file2/output/loadingDF")
val adDF: DataFrame = frame3.where($"en" === "ad")
.select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
$"sr",$"ett",$"en",
get_json_object($"kv", "$.activityId").alias("activityId"),
get_json_object($"kv", "$.displayMills").alias("displayMills"),
get_json_object($"kv", "$.entry").alias("entry"),
get_json_object($"kv", "$.action").alias("action"),
get_json_object($"kv", "$.contentType").alias("contentType")
)
adDF.show(false)
val notificationDF: DataFrame = frame3.where($"en" === "notification")
.select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
$"sr",$"ett",$"en",
get_json_object($"kv", "$.ap_time").alias("ap_time"),
get_json_object($"kv", "$.action").alias("action"),
get_json_object($"kv", "$.type").alias("type"),
get_json_object($"kv", "$.content").alias("content")
)
notificationDF.show(false)
val activeBackgroundDF: DataFrame = frame3.where($"en" === "active_background")
.select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
$"sr",$"ett",$"en",
get_json_object($"kv", "$.active_source").alias("active_source")
)
activeBackgroundDF.show(false)
val commentDF: DataFrame = frame3.where($"en" === "comment")
.select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
$"sr",$"ett",$"en",
get_json_object($"kv", "$.p_comment_id").alias("p_comment_id"),
get_json_object($"kv", "$.addtime").alias("addtime"),
get_json_object($"kv", "$.praise_count").alias("praise_count"),
get_json_object($"kv", "$.other_id").alias("other_id"),
get_json_object($"kv", "$.comment_id").alias("comment_id"),
get_json_object($"kv", "$.reply_count").alias("reply_count"),
get_json_object($"kv", "$.userid").alias("userid"),
get_json_object($"kv", "$.content").alias("content")
)
commentDF.show(false)
val praiseDF: DataFrame = frame3.where($"en" === "praise")
.select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
$"sr",$"ett",$"en",
get_json_object($"kv", "$.target_id").alias("target_id"),
get_json_object($"kv", "$.id").alias("id1"),
get_json_object($"kv", "$.type").alias("type"),
get_json_object($"kv", "$.add_time").alias("add_time"),
get_json_object($"kv", "$.userid").alias("userid")
)
praiseDF.show(false)
}
}
//输出结果:
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----------+
|ap |ln |sv |os |g |mid|nw |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |extend2|loading_time|action|extend1|type|loading_way|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----------+
|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593050051366|loading| |14 |3 | |2 |1 |
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593063223807|loading| |0 |3 | |1 |1 |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----------+
SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|ap |ln |sv |os |g |mid|nw |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |activityId|displayMills|entry|action|contentType|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593108791764|ad |1 |78522 |1 |1 |0 |
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593095105466|ad |1 |1966 |3 |2 |0 |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
|ap |ln |sv |os |g |mid|nw |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |ap_time |action|type|content|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593111271266|notification|1593097087883|1 |1 | |
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593051718208|notification|1593095336265|2 |3 | |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
+---+-----+------+-----+------------------+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
|ap |ln |sv |os |g |mid|nw |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |active_source|
+---+-----+------+-----+------------------+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
|app|-55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960|MX |489|1593123253541|5.2|sumsung-18|1.3.4|Sumsung|I |1593066033562|active_background|3 |
+---+-----+------+-----+------------------+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+-------------+
|ap |ln |sv |os |g |mid|nw |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |p_comment_id|addtime |praise_count|other_id|comment_id|reply_count|userid|content |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+-------------+
|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593135644347|comment|1 |1593097573725|973 |5 |9 |40 |7 |辑赤蹲慰鸽抿肘捎 |
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8 |1.0.1|Huawei |O |1593100021275|comment|4 |1593098946009|220 |4 |9 |151 |4 |抄应螟皮釉倔掉汉蛋蕾街羡晶|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+-------------+
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
|ap |ln |sv |os |g |mid|nw |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |target_id|id1|type|add_time |userid|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8|1.0.1|Huawei|O |1593105344120|praise|9 |7 |1 |1593098545976|8 |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+