JSON数据示例
1593136280858|{“cm”{“ln”:"-55.0",“sv”:“V2.9.6”,“os”:“8.0.4”,“g”:“C6816QZ0@gmail.com”,“mid”:“489”,“nw”:“3G”,“l”:“es”,“vc”:“4”,“hw”:“640*960”,“ar”:“MX”,“uid”:“489”,“t”:“1593123253541”,“la”:“5.2”,“md”:“sumsung-18”,“vn”:“1.3.4”,“ba”:“Sumsung”,“sr”:“I”},“ap”:“app”,“et”:[{“ett”:“1593050051366”,“en”:“loading”,“kv”:{“extend2”:"",“loading_time”:“14”,“action”:“3”,“extend1”:"",“type”:“2”,“type1”:“201”,“loading_way”:“1”}},{“ett”:“1593108791764”,“en”:“ad”,“kv”:{“activityId”:“1”,“displayMills”:“78522”,“entry”:“1”,“action”:“1”,“contentType”:“0”}},{“ett”:“1593111271266”,“en”:“notification”,“kv”:{“ap_time”:“1593097087883”,“action”:“1”,“type”:“1”,“content”:""}},{“ett”:“1593066033562”,“en”:“active_background”,“kv”:{“active_source”:“3”}},{“ett”:“1593135644347”,“en”:“comment”,“kv”:{“p_comment_id”:1,“addtime”:“1593097573725”,“praise_count”:973,“other_id”:5,“comment_id”:9,“reply_count”:40,“userid”:7,“content”:“辑赤蹲慰鸽抿肘捎”}}]}
读取
//从HDFS上读取预存JSON文件
scala> val fileRDD = sc.textFile("hdfs://192.168.**.**:9000/test/op.1og")
拆分
//将读取的JSON的文件按|拆分,即取出开头的数字
scala> val jsonStrRDD = fileRDD.map(x=>x.split('|')).map(x=>(x(0),x(1)))
//将数字存放至最后,并添加字段名“id”
scala> val jsonRDD=jsonStrRDD.map(x=>{var jsonStr=x._2; jsonStr = jsonStr.substring(0,jsonStr.length-1);jsonStr+",\"id\":\""+x._1+"\"}" })
转DataFrema
scala> val jsonDF = jsonRDD.toDF
//jsonDF结构
root
|-- value: string (nullable = true)
导包
import spark.implicits._
import org.apache.spark.sql._
拆分jsonDF
//通过get_json_object()可以获取字段中的字段
//将整个字段拆分成:cm、ap、et、id
scala> val jsonDF2 = jsonDF.select(get_json_object($"value","$.cm").alias("cm"),get_json_object($"value","$.ap").alias("ap"),get_json_object($"value","$.et").alias("et"),get_json_object($"value","$.id").alias("id"))
//jsonDF2结构
root
|-- cm: string (nullable = true)
|-- ap: string (nullable = true)
|-- et: string (nullable = true)
|-- id: string (nullable = true)
拆分jsonDF2(get_json_object)
//同样使用get_json_object拆分
scala> val jsonDF3 = jsonDF2.select($"id",$"ap",get_json_object($"cm","$.ln").alias("ln"),get_json_object($"cm","$.sv").alias("sv"),get_json_object($"cm","$.os").alias("os"),get_json_object($"cm","$.g").alias("g"),get_json_object($"cm","$.mid").alias("mid"),get_json_object($"cm","$.nw").alias("nw"),get_json_object($"cm","$.l").alias("l"),get_json_object($"cm","$.vc").alias("vc"),get_json_object($"cm","$.hw").alias("hw"),get_json_object($"cm","$.ar").alias("ar"),get_json_object($"cm","$.uid").alias("uid"),get_json_object($"cm","$.t").alias("t"),get_json_object($"cm","$.la").alias("la"),get_json_object($"cm","$.md").alias("md"),get_json_object($"cm","$.vn").alias("vn"),get_json_object($"cm","$.ba").alias("ba"),get_json_object($"cm","$.sr").alias("sr"),$"et")
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| et|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[{"ett":"15930500...|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|640*1136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[{"ett":"15930632...|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+--------------------+
//jsonDF3结构
scala> jsonDF3.printSchema
root
|-- id: string (nullable = true)
|-- ap: string (nullable = true)
|-- ln: string (nullable = true)
|-- sv: string (nullable = true)
|-- os: string (nullable = true)
|-- g: string (nullable = true)
|-- mid: string (nullable = true)
|-- nw: string (nullable = true)
|-- l: string (nullable = true)
|-- vc: string (nullable = true)
|-- hw: string (nullable = true)
|-- ar: string (nullable = true)
|-- uid: string (nullable = true)
|-- t: string (nullable = true)
|-- la: string (nullable = true)
|-- md: string (nullable = true)
|-- vn: string (nullable = true)
|-- ba: string (nullable = true)
|-- sr: string (nullable = true)
|-- et: string (nullable = true)
将指定字符串结构化(from_json)
//from_json:将字符串结构化
//将jsonDF3中的“et”结构化,取别名“event”
scala> val jsonDF4=jsonDF3.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",from_json($"et",ArrayType(StructType(StructField("ett",StringType)::StructField("en",StringType)::StructField("kv",StringType)::Nil))).alias("event")).show(false)
//jsonDF4结构
root
|-- id: string (nullable = true)
|-- ap: string (nullable = true)
|-- ln: string (nullable = true)
|-- sv: string (nullable = true)
|-- os: string (nullable = true)
|-- g: string (nullable = true)
|-- mid: string (nullable = true)
|-- nw: string (nullable = true)
|-- l: string (nullable = true)
|-- vc: string (nullable = true)
|-- hw: string (nullable = true)
|-- ar: string (nullable = true)
|-- uid: string (nullable = true)
|-- t: string (nullable = true)
|-- la: string (nullable = true)
|-- md: string (nullable = true)
|-- vn: string (nullable = true)
|-- ba: string (nullable = true)
|-- sr: string (nullable = true)
|-- event: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- ett: string (nullable = true)
| | |-- en: string (nullable = true)
| | |-- kv: string (nullable = true)
将字符串转列(explods)
//explods:行转列
//将“event”行转列
val scala> jsonDF5=jsonDF4.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",explode($"event") .alias("event"))
root
|-- id: string (nullable = true)
|-- ap: string (nullable = true)
|-- ln: string (nullable = true)
|-- sv: string (nullable = true)
|-- os: string (nullable = true)
|-- g: string (nullable = true)
|-- mid: string (nullable = true)
|-- nw: string (nullable = true)
|-- l: string (nullable = true)
|-- vc: string (nullable = true)
|-- hw: string (nullable = true)
|-- ar: string (nullable = true)
|-- uid: string (nullable = true)
|-- t: string (nullable = true)
|-- la: string (nullable = true)
|-- md: string (nullable = true)
|-- vn: string (nullable = true)
|-- ba: string (nullable = true)
|-- sr: string (nullable = true)
|-- event: struct (nullable = true)
| |-- ett: string (nullable = true)
| |-- en: string (nullable = true)
| |-- kv: string (nullable = true)
重新排列
scala> val jsonDF6=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").show(false)
//jsonDF6结构
root
|-- id: string (nullable = true)
|-- ap: string (nullable = true)
|-- ln: string (nullable = true)
|-- sv: string (nullable = true)
|-- os: string (nullable = true)
|-- g: string (nullable = true)
|-- mid: string (nullable = true)
|-- nw: string (nullable = true)
|-- l: string (nullable = true)
|-- vc: string (nullable = true)
|-- hw: string (nullable = true)
|-- ar: string (nullable = true)
|-- uid: string (nullable = true)
|-- t: string (nullable = true)
|-- la: string (nullable = true)
|-- md: string (nullable = true)
|-- vn: string (nullable = true)
|-- ba: string (nullable = true)
|-- sr: string (nullable = true)
|-- ett: string (nullable = true)
|-- en: string (nullable = true)
|-- kv: string (nullable = true)
展示处理后的数据
scala> jsonDF6.show
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-----------------+--------------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| ett| en| kv|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-----------------+--------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593050051366| loading|{"extend2":"","lo...|
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593108791764| ad|{"activityId":"1"...|
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593111271266| notification|{"ap_time":"15930...|
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593066033562|active_background|{"active_source":...|
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640*960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|1593135644347| comment|{"p_comment_id":1...|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-----------------+--------------------+
拆分KV字段
- 由于kv中的字段是根据en字段所决定,所以需要根据不同情况拆分
- 当en值为loadiing时:
scala> val jsonDF6_kv_loading=jsonDF6.filter(x=>x.getAs[String]("en")=="loading").select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.loading_time").alias("loading_time"),get_json_object($"kv","$.action").alias("action"),get_json_object($"kv","$.extend1").alias("extend1"),get_json_object($"kv","$.type").alias("type")).show(false)
//jsonDF6_kv_loading数据展示
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+------+-------+----+
|id |ap |ln |sv |os |g |mid|nw |l |vc |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |loading_time|action|extend1|type|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+------+-------+----+
|1593136280858|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |es |4 |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593050051366|loading|14 |3 | |2 |
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+------------+------+-------+----+
- 当en值为ad时:
scala> val jsonDF6_kv_ad=jsonDF6.filter(x=>x.getAs[String]("en")=="ad").select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.activityId").alias("activityId"),get_json_object($"kv","$.displayMills").alias("displayMills"),get_json_object($"kv","$.entry").alias("entry"),get_json_object($"kv","$.action").alias("action"),get_json_object($"kv","$.contentType").alias("contentType")).show(false)
//jsonDF6_kv_ad数据展示
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|id |ap |ln |sv |os |g |mid|nw |l |vc |hw |ar |uid|t |la |md |vn |ba |sr |ett |en |activityId|displayMills|entry|action|contentType|
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
|1593136280858|app|-55.0 |V2.9.6|8.0.4|C6816QZ0@gmail.com|489|3G |es |4 |640*960 |MX |489|1593123253541|5.2 |sumsung-18|1.3.4|Sumsung|I |1593108791764|ad |1 |78522 |1 |1 |0 |
+-------------+---+------+------+-----+------------------+---+---+---+---+--------+---+---+-------------+-----+----------+-----+-------+---+-------------+---+----------+------------+-----+------+-----------+
- 当en值为comment时
- 当en值为active_background时
- 当en值为notification时