Spark 高级操作之json复杂和嵌套数据结构的操作

下面几个是本文重点要讲的方法
get_json_object()
from_json()
to_json()
explode()

准备阶段
准备一个json标准格式文件op.json,内容如下

1593136280858|{"cm":{"ln":"-55.0","sv":"V2.9.6","os":"8.0.4","g":"C6816QZ0@gmail.com","mid":"489","nw":"3G","l":"es","vc":"4","hw":"640*960","ar":"MX","uid":"489","t":"1593123253541","la":"5.2","md":"sumsung-18","vn":"1.3.4","ba":"Sumsung","sr":"I"},"ap":"app","et":[{"ett":"1593050051366","en":"loading","kv":{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}},{"ett":"1593108791764","en":"ad","kv":{"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}},{"ett":"1593111271266","en":"notification","kv":{"ap_time":"1593097087883","action":"1","type":"1","content":""}},{"ett":"1593066033562","en":"active_background","kv":{"active_source":"3"}},{"ett":"1593135644347","en":"comment","kv":{"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}}]}
1593136280858|{"cm":{"ln":"-114.9","sv":"V2.7.8","os":"8.0.4","g":"NW0S962J@gmail.com","mid":"490","nw":"3G","l":"pt","vc":"8","hw":"640*1136","ar":"MX","uid":"490","t":"1593121224789","la":"-44.4","md":"Huawei-8","vn":"1.0.1","ba":"Huawei","sr":"O"},"ap":"app","et":[{"ett":"1593063223807","en":"loading","kv":{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"}},{"ett":"1593095105466","en":"ad","kv":{"activityId":"1","displayMills":"1966","entry":"3","action":"2","contentType":"0"}},{"ett":"1593051718208","en":"notification","kv":{"ap_time":"1593095336265","action":"2","type":"3","content":""}},{"ett":"1593100021275","en":"comment","kv":{"p_comment_id":4,"addtime":"1593098946009","praise_count":220,"other_id":4,"comment_id":9,"reply_count":151,"userid":4,"content":"抄应螟皮釉倔掉汉蛋蕾街羡晶"}},{"ett":"1593105344120","en":"praise","kv":{"target_id":9,"id":7,"type":1,"add_time":"1593098545976","userid":8}}]}

在通过 | 分隔符进行分割,再map转换为元祖,最后通过字符串拼接,得到jsDF1

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

val jsrdd= sc.textFile("hdfs://192.168.153.10:9000/kb09workspace/op.json").map(_.split('|')).map(x => (x(0), x(1))).map(x => {var str = x._2;str = str.substring(1, str.length);"{\"id\":\"" + x._1 + "\"," + str})
val jsDF1=jsrdd.toDF

jsrdd内容如下,即把他们全部变成标准json格式

{"id":"1593136280858","cm":{"ln":"-55.0","sv":"V2.9.6","os":"8.0.4","g":"C6816QZ0@gmail.com","mid":"489","nw":"3G","l":"es","vc":"4","hw":"640*960","ar":"MX","uid":"489","t":"1593123253541","la":"5.2","md":"sumsung-18","vn":"1.3.4","ba":"Sumsung","sr":"I"},"ap":"app","et":[{"ett":"1593050051366","en":"loading","kv":{"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}},{"ett":"1593108791764","en":"ad","kv":{"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}},{"ett":"1593111271266","en":"notification","kv":{"ap_time":"1593097087883","action":"1","type":"1","content":""}},{"ett":"1593066033562","en":"active_background","kv":{"active_source":"3"}},{"ett":"1593135644347","en":"comment","kv":{"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}}]}
{"id":"1593136280858","cm":{"ln":"-114.9","sv":"V2.7.8","os":"8.0.4","g":"NW0S962J@gmail.com","mid":"490","nw":"3G","l":"pt","vc":"8","hw":"640*1136","ar":"MX","uid":"490","t":"1593121224789","la":"-44.4","md":"Huawei-8","vn":"1.0.1","ba":"Huawei","sr":"O"},"ap":"app","et":[{"ett":"1593063223807","en":"loading","kv":{"extend2":"","loading_time":"0","action":"3","extend1":"","type":"1","type1":"102","loading_way":"1"}},{"ett":"1593095105466","en":"ad","kv":{"activityId":"1","displayMills":"1966","entry":"3","action":"2","contentType":"0"}},{"ett":"1593051718208","en":"notification","kv":{"ap_time":"1593095336265","action":"2","type":"3","content":""}},{"ett":"1593100021275","en":"comment","kv":{"p_comment_id":4,"addtime":"1593098946009","praise_count":220,"other_id":4,"comment_id":9,"reply_count":151,"userid":4,"content":"抄应螟皮釉倔掉汉蛋蕾街羡晶"}},{"ett":"1593105344120","en":"praise","kv":{"target_id":9,"id":7,"type":1,"add_time":"1593098545976","userid":8}}]}

1、使用get_json_object
将每一个:的键值对取出来

val jsDF2=jsDF1.select(
  get_json_object($"value", "$.id").as("id"), 
  get_json_object($"value", "$.cm").as("cm"), 
  get_json_object($"value", "$.ap").as("ap"), 
  get_json_object($"value", "$.et").as("et"))
jsDF2.show

在这里插入图片描述
这一个是将 cm一列里的 : 键值对拿出来

val jsDF3= jsDF2.select($"id", $"ap", $"cm", $"et",
  get_json_object($"cm", "$.ln").as("ln"),
  get_json_object($"cm", "$.sv").as("sv"),
  get_json_object($"cm", "$.os").as("os"),
  get_json_object($"cm", "$.g").as("g"),
  get_json_object($"cm", "$.nw").as("nw"),
  get_json_object($"cm", "$.l").as("l"),
  get_json_object($"cm", "$.hw").as("hw"),
  get_json_object($"cm", "$.ar").as("ar"),
  get_json_object($"cm", "$.uid").as("uid"),
  get_json_object($"cm", "$.t").as("t"),
  get_json_object($"cm", "$.la").as("la"),
  get_json_object($"cm", "$.md").as("md"),
  get_json_object($"cm", "$.vn").as("vn"),
  get_json_object($"cm", "$.ba").as("ba"),
  get_json_object($"cm", "$.sr").as("sr"))

在这里插入图片描述
2、from_json(字段,schema)
将et列中还有 : 键值对拿出来

val jsDF4 =jsDF3.select(
$"id",$"ap",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$”uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
    from_json($"et",ArrayType(StructType(
    StructField("ett", StringType)::
    StructField("en", StringType)::
    StructField("kv", StringType)::Nil ))).as("event")) 

在这里插入图片描述
3、explode 行转列函数
就是把 et 列(内容为数组)的每一个元素 : 键值对 拿出来

val jsDF5=jsDF4.select($"id", $"ap",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",explode($"event").as("event"))
val jsDF6=jsDF5.select($"id", $"ap",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv")

在这里插入图片描述
最后一步根据en值不同进行分表,把kv里的 : 键值对拿出来

    val jsDF7= jsDF6.filter($"en" === "loading").select(
      $"id", $"ap", $"sv", $"os", $"g", $"mid", $"nw", $"l", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr", $"ett", $"en",
      get_json_object($"kv", "$.extend2").as("extend2"),
      get_json_object($"kv", "$.loading_time").as("loading_time"),
      get_json_object($"kv", "$.action").as("action"),
      get_json_object($"kv", "$.extend1").as("extend1"),
      get_json_object($"kv", "$.type").as("type"),
      get_json_object($"kv", "$.type1").as("type1"),
      get_json_object($"kv", "$.loading_way").as("loading_way")
    )

在这里插入图片描述

    val jsDF8= jsDF6.filter($"en" === "ad").select(
      $"id", $"ap", $"sv", $"os", $"g", $"mid", $"nw", $"l", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr", $"ett", $"en",
      get_json_object($"kv", "$.activityId").as("activityId"),
      get_json_object($"kv", "$.displayMills").as("displayMills"),
      get_json_object($"kv", "$.entry").as("entry"),
      get_json_object($"kv", "$.action").as("action"),
      get_json_object($"kv", "$.contentType").as("contentType")
    )

在这里插入图片描述

    val jsDF9= jsDF6.filter($"en" === "notification").select(
      $"id", $"ap", $"sv", $"os", $"g", $"mid", $"nw", $"l", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr", $"ett", $"en",
      get_json_object($"kv", "$.ap_time").as("ap_time"),
      get_json_object($"kv", "$.action").as("action"),
      get_json_object($"kv", "$.type").as("type"),
      get_json_object($"kv", "$.content").as("content")
    )

在这里插入图片描述

    val jsDF10= jsDF6.filter($"en" === "active_background").select(
      $"id", $"ap", $"sv", $"os", $"g", $"mid", $"nw", $"l", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr", $"ett", $"en",
      get_json_object($"kv", "$.active_source").as("active_source")
    )

在这里插入图片描述

    val jsDF11= jsDF6.filter($"en" === "comment").select(
      $"id", $"ap", $"sv", $"os", $"g", $"mid", $"nw", $"l", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr", $"ett", $"en",
      get_json_object($"kv", "$.p_comment_id").as("p_comment_id"),
      get_json_object($"kv", "$.addtime").as("addtime"),
      get_json_object($"kv", "$.praise_count").as("praise_count"),
      get_json_object($"kv", "$.other_id").as("other_id"),
      get_json_object($"kv", "$.comment_id").as("comment_id"),
      get_json_object($"kv", "$.reply_count").as("reply_count"),
      get_json_object($"kv", "$.userid").as("userid"),
      get_json_object($"kv","$.content").as("content")
    )

在这里插入图片描述

    val jsDF12= jsDF6.filter($"en" === "praise").select(
      $"id", $"ap", $"sv", $"os", $"g", $"mid", $"nw", $"l", $"vc", $"hw", $"ar", $"uid", $"t", $"la", $"md", $"vn", $"ba", $"sr", $"ett", $"en",
      get_json_object($"kv", "$.target_id").as("target_id"),
      get_json_object($"kv", "$.id").as("id"),
      get_json_object($"kv", "$.type").as("type"),
      get_json_object($"kv", "$.add_time").as("add_time"),
      get_json_object($"kv", "$.userid").as("userid")
    )

在这里插入图片描述
最后可以将得出来的结果表导入到mysql中或者hive

//导入到hive
val hiveContext = new org.apache.spark.sql.hive.HiveContext(sc)
jsDF2.registerTempTable("jsDF2")
hiveContext.sql("create table json2 as select * from jsDF2")

//导入到mysql
    val url="jdbc:mysql://192.168.153.10:3306/kb09db"
    val prop=new Properties()
    prop.setProperty("user","root")
    prop.setProperty("password","ok")
    prop.setProperty("driver","com.mysql.jdbc.Driver")
    jsDF2.write.mode("overwrite").jdbc(url,"jsDF2",prop) //覆盖
    jsDF2.write.mode("overwrite").jdbc(url,"jsDF2",prop) //追加
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值