Spark解析json格式文件(spark-shell和Idea scala版本)


现在有一个文件名为 op.log 的json文件内容是:

1593136280858|{"cm":{"ln":"-55.0","sv":"V2.9.6","os":"8.0.4","g":"C6816QZ0@gmail.com","mid":"489","nw":"3G","l":"es","vc":"4","hw":"640*960","ar":"MX","uid":"489","t":"1593123253541","la":"5.2","md":"sumsung-18","vn":"1.3.4","ba":"Sumsung","sr":"I"},"ap":"app","et":[{"ett":"1593050051366","en":"loading","kv":{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}},{"ett":"1593108791764","en":"ad","kv":{"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}},{"ett":"1593111271266","en":"notification","kv":{"ap_time":"1593097087883","action":"1","type":"1","content":""}},{"ett":"1593066033562","en":"active_background","kv":{"active_source":"3"}},{"ett":"1593135644347","en":"comment","kv":{"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}}]}
1593136280858|{"cm":{"ln":"-114.9","sv":"V2.7.8","os":"8.0.4","g":"NW0S962J@gmail.com","mid":"490","nw":"3G","l":"pt","vc":"8","hw":"640*1136","ar":"MX","uid":"490","t":"1593121224789","la":"-44.4","md":"Huawei-8","vn":"1.0.1","ba":"Huawei","sr":"O"},"ap":"app","et":[{"ett":"1593063223807","en":"loading","kv":{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"}},{"ett":"1593095105466","en":"ad","kv":{"activityId":"1","displayMills":"1966","entry":"3","action":"2","contentType":"0"}},{"ett":"1593051718208","en":"notification","kv":{"ap_time":"1593095336265","action":"2","type":"3","content":""}},{"ett":"1593100021275","en":"comment","kv":{"p_comment_id":4,"addtime":"1593098946009","praise_count":220,"other_id":4,"comment_id":9,"reply_count":151,"userid":4,"content":"抄应螟皮釉倔掉汉蛋蕾街羡晶"}},{"ett":"1593105344120","en":"praise","kv":{"target_id":9,"id":7,"type":1,"add_time":"1593098545976","userid":8}}]}

1.找json在线解析看一下结构,是这样的


	 1593136280858|{
	"cm": {
		"ln": "-55.0",
		"sv": "V2.9.6",
		"os": "8.0.4",
		"g": "C6816QZ0@gmail.com",
		"mid": "489",
		"nw": "3G",
		"l": "es",
		"vc": "4",
		"hw": "640*960",
		"ar": "MX",
		"uid": "489",
		"t": "1593123253541",
		"la": "5.2",
		"md": "sumsung-18",
		"vn": "1.3.4",
		"ba": "Sumsung",
		"sr": "I"
	},
	"ap": "app",
	"et": [{
		"ett": "1593050051366",
		"en": "loading",
		"kv": {
			"extend2": "",
			"loading_time": "14",
			"action": "3",
			"extend1": "",
			"type": "2",
			"type1": "201",
			"loading_way": "1"
		}
	}, {
		"ett": "1593108791764",
		"en": "ad",
		"kv": {
			"activityId": "1",
			"displayMills": "78522",
			"entry": "1",
			"action": "1",
			"contentType": "0"
		}
	}, {
		"ett": "1593111271266",
		"en": "notification",
		"kv": {
			"ap_time": "1593097087883",
			"action": "1",
			"type": "1",
			"content": ""
		}
	}, {
		"ett": "1593066033562",
		"en": "active_background",
		"kv": {
			"active_source": "3"
		}
	}, {
		"ett": "1593135644347",
		"en": "comment",
		"kv": {
			"p_comment_id": 1,
			"addtime": "1593097573725",
			"praise_count": 973,
			"other_id": 5,
			"comment_id": 9,
			"reply_count": 40,
			"userid": 7,
			"content": "辑赤蹲慰鸽抿肘捎"
		}
	}]
}

2.spark 读取json并转成DataFrame进行数据表格化 进而分析数据

spark-shell版本

//加载存储在hdfs上的json文件
scala> val fileRDD=sc.textFile("hdfs://192.168.116.60:9000/kb09file/op.log")

//按照"|" 切割 
scala> val jsonStrRDD=fileRDD.map(x=>x.split('|')).map(x=>(x(0),x(1)))

//将手机号作为id保留放在最后
scala> val jsonRDD=jsonStrRDD.map(x=>{var jsonStr=x._2;jsonStr 
= jsonStr.substring(0,jsonStr.length-1);jsonStr+",\"id\":\""+x._1+"\"}"})   //加双引号才是json字符串 很重要

//变成DataFrame 进行后续操作
scala> val jsonDF=jsonRDD.toDF

scala> jsonDF.printSchema
root
 |-- value: string (nullable = true)

//导入包 便于使用get_json_object函数
scala> import spark.implicits._
scala> import org.apache.spark.sql.functions._
scala> import org.apache.spark.sql._
scala> import org.apache.spark.sql.types._

scala> jsonDF.select(get_json_object($"value","$.cm").alias("cm")).printSchema
root
 |-- cm: string (nullable = true)

scala> jsonDF.select(get_json_object($"value","$.cm").alias("cm")).show
+--------------------+
|                  cm|
+--------------------+
|{"ln":"-55.0","sv...|
|{"ln":"-114.9","s...|
+--------------------+

scala> jsonDF.select(get_json_object($"value","$.cm").alias("cm"),get_json_object($"value","$.ap").alias("ap"),
get_json_object($"value","$.et").alias("et"),get_json_object($"value","$.id").alias("id")).show()
+--------------------+---+--------------------+-------------+
|                  cm| ap|                  et|           id|
+--------------------+---+--------------------+-------------+
|{"ln":"-55.0","sv...|app|[{"ett":"15930500...|1593136280858|
|{"ln":"-114.9","s...|app|[{"ett":"15930632...|1593136280858|
+--------------------+---+--------------------+-------------+

//将json字符串结构化
//表头   cm   ap   et  id  
//列     a1   b1   c1  d1
scala> val jsonDF2 = jsonDF.select(get_json_object($"value","$.cm").alias("cm"),get_json_object($"value","$.ap").alias("ap")
,get_json_object($"value","$.et").alias("et"),get_json_object($"value","$.id").alias("id"))

scala> jsonDF2.printSchema
root
 |-- cm: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- et: string (nullable = true)
 |-- id: string (nullable = true)

scala> jsonDF2.select($"id",$"ap",$"cm",$"et").show
+-------------+---+--------------------+--------------------+
|           id| ap|                  cm|                  et|
+-------------+---+--------------------+--------------------+
|1593136280858|app|{"ln":"-55.0","sv...|[{"ett":"15930500...|
|1593136280858|app|{"ln":"-114.9","s...|[{"ett":"15930632...|
+-------------+---+--------------------+--------------------+


scala> val jsonDF3 = jsonDF2.select($"id",$"ap",get_json_object($"cm","$.ln") alias("ln"),
get_json_object($"cm","$.sv") alias("sv"),get_json_object($"cm","$.os") alias("os"),
get_json_object($"cm","$.g") alias("g"),get_json_object($"cm","$.mid") alias("mid"),
get_json_object($"cm","$.nw") alias("nw"),get_json_object($"cm","$.l") alias("l"),
get_json_object($"cm","$.vc") alias("vc"),get_json_object($"cm","$.hw") alias("hw"),
get_json_object($"cm","$.ar") alias("ar"),get_json_object($"cm","$.uid") alias("uid"),
get_json_object($"cm","$.t") alias("t"),get_json_object($"cm","$.la") alias("la"),
get_json_object($"cm","$.md") alias("md"),get_json_object($"cm","$.vn") alias("vn"),
get_json_object($"cm","$.ba") alias("ba"),get_json_object($"cm","$.sr") alias("sr"),$"et")

scala> jsonDF3.printSchema
root
 |-- id: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- ln: string (nullable = true)
 |-- sv: string (nullable = true)
 |-- os: string (nullable = true)
 |-- g: string (nullable = true)
 |-- mid: string (nullable = true)
 |-- nw: string (nullable = true)
 |-- l: string (nullable = true)
 |-- vc: string (nullable = true)
 |-- hw: string (nullable = true)
 |-- ar: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- t: string (nullable = true)
 |-- la: string (nullable = true)
 |-- md: string (nullable = true)
 |-- vn: string (nullable = true)
 |-- ba: string (nullable = true)
 |-- sr: string (nullable = true)
 |-- et: string (nullable = true)

scala> jsonDF3.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
|           id| ap|    ln|    sv|   os|                 g|mid| nw|  l| vc|      hw| ar|uid|            t|   la|        md|   vn|     ba| sr|                  et|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es|  4| 640*960| MX|489|1593123253541|  5.2|sumsung-18|1.3.4|Sumsung|  I|[{"ett":"15930500...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt|  8|640*1136| MX|490|1593121224789|-44.4|  Huawei-8|1.0.1| Huawei|  O|[{"ett":"15930632...|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+

//from_json把字符串 
[
{"ett":"a1","en":"a2","kv":"a3"},
{"ett":"b1","en":"b2","kv":"b3"},
{"ett":"c1","en":"c2","kv":"c3"}
]   结构化
    ett    en    kv 
    a1     a2    a3
	b1	   b2    b3
	c1     c2    c3
	
val jsonDF4 =  jsonDF3.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
from_json($"et",ArrayType(StructType(StructField("ett",StringType)::StructField("en",StringType)::StructField("kv",StringType)::Nil))).alias("event"))

jsonDF4大脑想到的结构是这样:
       id           ap      mid     nw     ***     event
1593136280858      app      12345   dsdwe  ****   [[ett en kv],	[b1 b2 b3],[c1 c2 c3] ] 


       id           ap      mid     nw     ***     ett  en  kv
1593136280858      app      12345   dsdwe  ****    a1   a2  a3	
1593136280858      app      12345   dsdwe  ****    b1   b2   b3  
1593136280858      app      12345   dsdwe  ****    c1   c2   c3  
------------思考结束-----------------

开干。。。。。
第一步
       id           ap      mid     nw     ***     event
1593136280858      app      12345   dsdwe  ****   [[ett en kv],	[b1 b2 b3],[c1 c2 c3] ]
搞成
		id           ap      mid     nw     ***     event
 1593136280858      app      12345   dsdwe  ****   [ett en kv]
 1593136280858      app      12345   dsdwe  ****   [b1 b2 b3]
 1593136280858      app      12345   dsdwe  ****   [c1 c2 c3]
scala> val jsonDF5 = jsonDF4.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",explode($"event").alias("event"))
第二步
		id           ap      mid     nw     ***     ett    en   kv
 1593136280858      app      12345   dsdwe  ****   a1    a2     a3
 1593136280858      app      12345   dsdwe  ****   b1    b2      b3
 1593136280858      app      12345   dsdwe  ****   c1    c2     c3

		
scala> val jsonDF5 = jsonDF4.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
explode($"event").alias("event"))

scala> val jsonDF6 = jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
($"event.kv").alias("kv"),($"event.ett").alias("ett"),($"event.en").alias("en"))

scala> jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en"
,$"kv").filter($"en"==="loading").show(false)

scala> val loadingDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr"
,$"ett",$"en",$"kv").filter($"en"==="loading")

scala> loadingDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.extend2") alias("extend2"),get_json_object($"kv","$.loading_time") alias("loading_time"),get_json_object($"kv","$.action") alias("action"),get_json_object($"kv","$.extend1") alias("extend1"),get_json_object($"kv","$.type") alias("type"),get_json_object($"kv","$.type1") alias("type1"),get_json_object($"kv","$.loading_way") alias("loading_way")).printSchema
root
 |-- id: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- ln: string (nullable = true)
 |-- sv: string (nullable = true)
 |-- os: string (nullable = true)
 |-- g: string (nullable = true)
 |-- mid: string (nullable = true)
 |-- nw: string (nullable = true)
 |-- l: string (nullable = true)
 |-- vc: string (nullable = true)
 |-- hw: string (nullable = true)
 |-- ar: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- t: string (nullable = true)
 |-- la: string (nullable = true)
 |-- md: string (nullable = true)
 |-- vn: string (nullable = true)
 |-- ba: string (nullable = true)
 |-- sr: string (nullable = true)
 |-- ett: string (nullable = true)
 |-- en: string (nullable = true)
 |-- extend2: string (nullable = true)
 |-- loading_time: string (nullable = true)
 |-- action: string (nullable = true)
 |-- extend1: string (nullable = true)
 |-- type: string (nullable = true)
 |-- type1: string (nullable = true)
 |-- loading_way: string (nullable = true)


scala> val loadingDF1 = loadingDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
$"sr",$"ett",$"en",get_json_object($"kv","$.extend2") alias("extend2"),get_json_object($"kv","$.loading_time") alias("loading_time"),get_json_object($"kv","$.action") alias("action"),get_json_object($"kv","$.extend1") alias("extend1"),get_json_object($"kv","$.type") alias("type"),get_json_object($"kv","$.type1") alias("type1"),get_json_object($"kv","$.loading_way") alias("loading_way"))
scala> loadingDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+
|           id| ap|    ln|    sv|   os|                 g|mid| nw|  l| vc|      hw| ar|uid|            t|   la|        md|   vn|     ba| sr|          ett|     en|extend2|loading_time|action|extend1|type|type1|loading_way|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es|  4| 640*960| MX|489|1593123253541|  5.2|sumsung-18|1.3.4|Sumsung|  I|1593050051366|loading|       |          14|     3|       |   2|  201|          1|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt|  8|640*1136| MX|490|1593121224789|-44.4|  Huawei-8|1.0.1| Huawei|  O|1593063223807|loading|       |           0|     3|       |   1|  102|          1|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----+-----------+



scala> val adDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="ad")
scala> val adDF1 = adDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.activityId") alias("activityId"),get_json_object($"kv","$.displayMills") alias("displayMills"),get_json_object($"kv","$.entry") alias("entry"),get_json_object($"kv","$.action") alias("action"),get_json_object($"kv","$.contentType") alias("contentType"))
scala> adDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|           id| ap|    ln|    sv|   os|                 g|mid| nw|  l| vc|      hw| ar|uid|            t|   la|        md|   vn|     ba| sr|          ett| en|activityId|displayMills|entry|action|contentType|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es|  4| 640*960| MX|489|1593123253541|  5.2|sumsung-18|1.3.4|Sumsung|  I|1593108791764| ad|         1|       78522|    1|     1|          0|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt|  8|640*1136| MX|490|1593121224789|-44.4|  Huawei-8|1.0.1| Huawei|  O|1593095105466| ad|         1|        1966|    3|     2|          0|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+


scala> val notificationDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="notification")
scala> val notificationDF1 = notificationDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.ap_time") alias("ap_time"),get_json_object($"kv","$.action") alias("action"),get_json_object($"kv","$.type") alias("type"),get_json_object($"kv","$.content") alias("content"))
scala> notificationDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
|           id| ap|    ln|    sv|   os|                 g|mid| nw|  l| vc|      hw| ar|uid|            t|   la|        md|   vn|     ba| sr|          ett|          en|      ap_time|action|type|content|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es|  4| 640*960| MX|489|1593123253541|  5.2|sumsung-18|1.3.4|Sumsung|  I|1593111271266|notification|1593097087883|     1|   1|       |
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt|  8|640*1136| MX|490|1593121224789|-44.4|  Huawei-8|1.0.1| Huawei|  O|1593051718208|notification|1593095336265|     2|   3|       |
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+


scala> val active_backgroundDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="active_background")
scala> val active_backgroundDF1 = active_backgroundDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.active_source") alias("active_source"))
scala> active_backgroundDF1.show
+-------------+---+-----+------+-----+------------------+---+---+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
|           id| ap|   ln|    sv|   os|                 g|mid| nw|  l| vc|     hw| ar|uid|            t| la|        md|   vn|     ba| sr|          ett|               en|active_source|
+-------------+---+-----+------+-----+------------------+---+---+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
|1593136280858|app|-55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es|  4|640*960| MX|489|1593123253541|5.2|sumsung-18|1.3.4|Sumsung|  I|1593066033562|active_background|            3|
+-------------+---+-----+------+-----+------------------+---+---+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+


scala> val commentDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="comment")
scala> val commentDF1 = commentDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.p_comment_id") alias("p_comment_id"),get_json_object($"kv","$.addtime") alias("addtime"),get_json_object($"kv","$.praise_count") alias("praise_count"),get_json_object($"kv","$.other_id") alias("other_id"),get_json_object($"kv","$.comment_id") alias("comment_id"),get_json_object($"kv","$.reply_count") alias("reply_count"),get_json_object($"kv","$.userid") alias("userid"),get_json_object($"kv","$.content") alias("content"))
scala> commentDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+--------------------------+
|           id| ap|    ln|    sv|   os|                 g|mid| nw|  l| vc|      hw| ar|uid|            t|   la|        md|   vn|     ba| sr|          ett|     en|p_comment_id|      addtime|praise_count|other_id|comment_id|reply_count|userid|                   content|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+--------------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es|  4| 640*960| MX|489|1593123253541|  5.2|sumsung-18|1.3.4|Sumsung|  I|1593135644347|comment|           1|1593097573725|         973|       5|         9|         40|     7|          辑赤蹲慰鸽抿肘捎|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt|  8|640*1136| MX|490|1593121224789|-44.4|  Huawei-8|1.0.1| Huawei|  O|1593100021275|comment|           4|1593098946009|         220|       4|         9|        151|     4|抄应螟皮釉倔掉汉蛋蕾街羡晶|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+--------------------------+

scala> val praiseDF = jsonDF6.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv").filter($"en"==="praise")
scala> val praiseDF1 = praiseDF.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba"
,$"sr",$"ett",$"en",get_json_object($"kv","$.target_id") alias("target_id"),get_json_object($"kv","$.id") alias("id1"),
get_json_object($"kv","$.type") alias("type"),get_json_object($"kv","$.add_time") alias("add_time"),get_json_object($"kv","$.userid") alias("userid"))

scala>  praiseDF1.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
|           id| ap|    ln|    sv|   os|                 g|mid| nw|  l| vc|      hw| ar|uid|            t|   la|      md|   vn|    ba| sr|          ett|    en|target_id|id1|type|     add_time|userid|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt|  8|640*1136| MX|490|1593121224789|-44.4|Huawei-8|1.0.1|Huawei|  O|1593105344120|praise|        9|  7|   1|1593098545976|     8|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+






scala> loadingDF1.join(adDF1,Seq("id","id"),"inner").join(notificationDF1,Seq("id","id"),"inner").join(active_backgroundDF1,Seq("id","id"),"inner").join(commentDF1,Seq("id","id"),"inner").show


Idea scala版本

package Json

import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

/**
  * @Author Bright
  * @Date 2020/11/25
  * @Description
  */
object jsonToDF {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("jsonToDF").getOrCreate()
    val sc = spark.sparkContext

    import spark.implicits._

    //加载文件RDD
    val fileRDD = sc.textFile("in/op.log")
    val opjsonStr = fileRDD.map(x=>x.split('|')).map(x=>(x(0),x(1)))
      .map(x=>{
        var jsonStr = x._2;
        jsonStr = jsonStr.substring(0, jsonStr.length - 1);
        jsonStr+",\"id\":\""+x._1+"\"}"})

    //RDD转成DF
    val opjsonDF = opjsonStr.toDF()
    //opjsonDF.printSchema()
    //opjsonDF.show(false)

    //json最内层结构
    val et = new StructType()
      .add($"ett".string)
      .add($"en".string)
      .add($"kv".string)

    //json中层结构
    val cm = new StructType()
      .add($"ln".string).add($"sv".string).add($"os".string).add($"g".string).add($"mid".string).add($"nw".string)
      .add($"hw".string).add($"ar".string).add($"uid".string).add($"t".string).add($"la".string).add($"md".string)
      .add($"vn".string).add($"ba".string).add($"sr".string)

    //json 外层套中层 完成 最外层结构
    val schema = new StructType()
      .add($"cm".struct(cm))
      .add($"ap".string)
      .add($"et".array(et))
      .add($"id".string)

    //使用from_json方法 由json数据 与结构匹配
    val frame = opjsonDF.select(from_json($"value",schema).alias("values"))
//    frame.printSchema()
//    frame.show(false)

    //将列名数据查询出来 et层使用explode拆开并取别名为event
    val frame2 = frame.select(
      $"values.ap", $"values.cm.ln", $"values.cm.sv", $"values.cm.os", $"values.cm.g", $"values.cm.mid", $"values.cm.nw",
      $"values.cm.hw", $"values.cm.ar", $"values.cm.uid", $"values.cm.t", $"values.cm.la", $"values.cm.md", $"values.cm.vn",
      $"values.cm.ba", $"values.cm.sr",
      explode($"values.et").alias("event")
    )

//    frame2.printSchema()
//    frame2.show(false)

    //将ett en kv 单独查出来
    val frame3 = frame2.select(
      $"ap", $"ln", $"sv", $"os", $"g", $"mid", $"nw", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr",
      $"event.ett", $"event.en", $"event.kv"
    )

//    frame3.printSchema()
//    frame3.show(false)

    //过滤出来en层数据的6种情况  分别查询出来
    val loadingDF = frame3.where($"en" === "loading")
      .select(
        $"ap", $"ln", $"sv", $"os", $"g", $"mid", $"nw", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba",
        $"sr", $"ett", $"en",
        get_json_object($"kv", "$.extend2").alias("extend2"),
        get_json_object($"kv", "$.loading_time").alias("loading_time"),
        get_json_object($"kv", "$.action").alias("action"),
        get_json_object($"kv", "$.extend1").alias("extend1"),
        get_json_object($"kv", "$.type").alias("type"),
        get_json_object($"kv", "$.loading_way").alias("loading_way")
      )
    loadingDF.show(false)

    //写入保存到hdfs上面  文件端口:9000  需要赋权限且文件夹不存在才行  会自动生成新文件夹
    loadingDF.write.save("hdfs://192.168.116.60:9000/kb09file2/output/loadingDF")

    val adDF: DataFrame = frame3.where($"en" === "ad")
      .select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
        $"sr",$"ett",$"en",
        get_json_object($"kv", "$.activityId").alias("activityId"),
        get_json_object($"kv", "$.displayMills").alias("displayMills"),
        get_json_object($"kv", "$.entry").alias("entry"),
        get_json_object($"kv", "$.action").alias("action"),
        get_json_object($"kv", "$.contentType").alias("contentType")
      )
    adDF.show(false)

    val notificationDF: DataFrame = frame3.where($"en" === "notification")
      .select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
        $"sr",$"ett",$"en",
        get_json_object($"kv", "$.ap_time").alias("ap_time"),
        get_json_object($"kv", "$.action").alias("action"),
        get_json_object($"kv", "$.type").alias("type"),
        get_json_object($"kv", "$.content").alias("content")
      )
    notificationDF.show(false)

    val activeBackgroundDF: DataFrame = frame3.where($"en" === "active_background")
      .select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
        $"sr",$"ett",$"en",
        get_json_object($"kv", "$.active_source").alias("active_source")
      )
    activeBackgroundDF.show(false)

    val commentDF: DataFrame = frame3.where($"en" === "comment")
      .select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
        $"sr",$"ett",$"en",
        get_json_object($"kv", "$.p_comment_id").alias("p_comment_id"),
        get_json_object($"kv", "$.addtime").alias("addtime"),
        get_json_object($"kv", "$.praise_count").alias("praise_count"),
        get_json_object($"kv", "$.other_id").alias("other_id"),
        get_json_object($"kv", "$.comment_id").alias("comment_id"),
        get_json_object($"kv", "$.reply_count").alias("reply_count"),
        get_json_object($"kv", "$.userid").alias("userid"),
        get_json_object($"kv", "$.content").alias("content")
      )
    commentDF.show(false)

    val praiseDF: DataFrame = frame3.where($"en" === "praise")
      .select($"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",
        $"sr",$"ett",$"en",
        get_json_object($"kv", "$.target_id").alias("target_id"),
        get_json_object($"kv", "$.id").alias("id1"),
        get_json_object($"kv", "$.type").alias("type"),
        get_json_object($"kv", "$.add_time").alias("add_time"),
        get_json_object($"kv", "$.userid").alias("userid")
      )
    praiseDF.show(false)
  }
}


//输出结果:
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----------+
|ap |ln    |sv    |os   |g                 |mid|nw |hw      |ar |uid|t            |la   |md        |vn   |ba     |sr |ett          |en     |extend2|loading_time|action|extend1|type|loading_way|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----------+
|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960 |MX |489|1593123253541|5.2  |sumsung-18|1.3.4|Sumsung|I  |1593050051366|loading|       |14          |3     |       |2   |1          |
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8  |1.0.1|Huawei |O  |1593063223807|loading|       |0           |3     |       |1   |1          |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+-------+------------+------+-------+----+-----------+

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|ap |ln    |sv    |os   |g                 |mid|nw |hw      |ar |uid|t            |la   |md        |vn   |ba     |sr |ett          |en |activityId|displayMills|entry|action|contentType|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960 |MX |489|1593123253541|5.2  |sumsung-18|1.3.4|Sumsung|I  |1593108791764|ad |1         |78522       |1    |1     |0          |
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8  |1.0.1|Huawei |O  |1593095105466|ad |1         |1966        |3    |2     |0          |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+

+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
|ap |ln    |sv    |os   |g                 |mid|nw |hw      |ar |uid|t            |la   |md        |vn   |ba     |sr |ett          |en          |ap_time      |action|type|content|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+
|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960 |MX |489|1593123253541|5.2  |sumsung-18|1.3.4|Sumsung|I  |1593111271266|notification|1593097087883|1     |1   |       |
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8  |1.0.1|Huawei |O  |1593051718208|notification|1593095336265|2     |3   |       |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+------------+-------------+------+----+-------+

+---+-----+------+-----+------------------+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
|ap |ln   |sv    |os   |g                 |mid|nw |hw     |ar |uid|t            |la |md        |vn   |ba     |sr |ett          |en               |active_source|
+---+-----+------+-----+------------------+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+
|app|-55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960|MX |489|1593123253541|5.2|sumsung-18|1.3.4|Sumsung|I  |1593066033562|active_background|3            |
+---+-----+------+-----+------------------+---+---+-------+---+---+-------------+---+----------+-----+-------+---+-------------+-----------------+-------------+

+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+-------------+
|ap |ln    |sv    |os   |g                 |mid|nw |hw      |ar |uid|t            |la   |md        |vn   |ba     |sr |ett          |en     |p_comment_id|addtime      |praise_count|other_id|comment_id|reply_count|userid|content      |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+-------------+
|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |640*960 |MX |489|1593123253541|5.2  |sumsung-18|1.3.4|Sumsung|I  |1593135644347|comment|1           |1593097573725|973         |5       |9         |40         |7     |辑赤蹲慰鸽抿肘捎     |
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8  |1.0.1|Huawei |O  |1593100021275|comment|4           |1593098946009|220         |4       |9         |151        |4     |抄应螟皮釉倔掉汉蛋蕾街羡晶|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+-------------+------------+--------+----------+-----------+------+-------------+

+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
|ap |ln    |sv    |os   |g                 |mid|nw |hw      |ar |uid|t            |la   |md      |vn   |ba    |sr |ett          |en    |target_id|id1|type|add_time     |userid|
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+
|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490|3G |640*1136|MX |490|1593121224789|-44.4|Huawei-8|1.0.1|Huawei|O  |1593105344120|praise|9        |7  |1   |1593098545976|8     |
+---+------+------+-----+------------------+---+---+--------+---+---+-------------+-----+--------+-----+------+---+-------------+------+---------+---+----+-------------+------+

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值