Spark: converting a JSON string to a DataFrame, and converting a DataFrame struct back to JSON

package mongo

import org.apache.spark.sql.SparkSession

object InsertTable {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").config("hive.metastore.uris", "thrift://10.200.102.230:9083")
      .enableHiveSupport().getOrCreate()
    import spark.implicits._
    val jsonStr ="""{"t":"tt","k":"kkk","v":"[100]","t1":"bb","animal_interpretation":{"is_large_animal":"ddd","is_mammal":false}}"""
    // If a record's fields conflict with the table structure -- whether at the top
    // level or inside the json->struct -- writing it will fail.
    // If one record is complete and the others are missing fields, the missing fields
    // are filled with null and no error is raised; it is unclear whether this is
    // because schema inference samples the full data set.
    //    val jsonStr2 ="""{"t":"tt","k":"kkk","v":"[100]","t1":"bb","animal_interpretation":{"is_large_animal":"ddd"}}"""
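    // A hedged sketch (not run in the original post): reading both records together
    // should fill jsonStr2's missing "is_mammal" field with null, because schema
    // inference unions the fields seen across the sampled records:
    //   spark.read.json(Seq(jsonStr, jsonStr2).toDS()).show(false)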
    val msgdf = Seq(jsonStr).toDF("msg")
    msgdf.createOrReplaceTempView("msg_table")
    val jsonDF = spark.sql("select msg from msg_table")
    val rdd = jsonDF.rdd.map(_.getString(0))
    rdd.toDS().show(false)
    //+--------------------------------------------------------------------------------------------------------------+
    //|value                                                                                                         |
    //+--------------------------------------------------------------------------------------------------------------+
    //|{"t":"tt","k":"kkk","v":"[100]","t1":"bb","animal_interpretation":{"is_large_animal":"ddd","is_mammal":false}}|
    //+--------------------------------------------------------------------------------------------------------------+
    var jsonDataDF = spark.read.json(rdd.toDS())
    jsonDataDF.show(false)
    jsonDataDF.printSchema()
    //+---------------------+---+---+---+-----+
    //|animal_interpretation|k  |t  |t1 |v    |
    //+---------------------+---+---+---+-----+
    //|[ddd, false]         |kkk|tt |bb |[100]|
    //+---------------------+---+---+---+-----+
    // test: drop the lowercase "k" column and add a literal uppercase "K" instead
    import org.apache.spark.sql.functions._
    jsonDataDF = jsonDataDF.drop("k").withColumn("K", lit("kkkkk"))
    // expand the json struct into top-level columns
    jsonDataDF.select($"animal_interpretation.*").show(false)
    //+---------------+---------+
    //|is_large_animal|is_mammal|
    //+---------------+---------+
    //|            ddd|    false|
    //+---------------+---------+
    jsonDataDF.printSchema()
    //root
    // |-- animal_interpretation: struct (nullable = true)
    // |    |-- is_large_animal: string (nullable = true)
    // |    |-- is_mammal: boolean (nullable = true)
    // |-- t: string (nullable = true)
    // |-- t1: string (nullable = true)
    // |-- v: string (nullable = true)
    // |-- K: string (nullable = false)
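
    // Added sketch (not in the original post): the reverse direction --
    // flatten the whole row back into a single JSON string column.
    jsonDataDF.select(to_json(struct(jsonDataDF.columns.map(col): _*)).as("json_str")).show(false)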

    // save to a hive table; write mode can be "overwrite" or "append"
    //    jsonDataDF.write.mode("append").saveAsTable("ods_db.testsvae")


  }

}

A small utility:

  import org.apache.spark.sql.DataFrame
  import org.apache.spark.sql.functions.{col, lit, struct, to_json}

  /**
    * 1. Compare two DataFrames: columns that the real data-source DataFrame has beyond
    *    the hive table's DataFrame are serialized to a JSON string stored in the ext_col column.
    * 2. Non-primitive columns (array / map / struct) are converted to JSON strings.
    * 3. Columns the hive table has but the data lacks are filled with typed nulls.
    *
    * @param hiveDataFrame DataFrame carrying the hive table's schema
    * @param dataDataFrame DataFrame built from the real data source
    * @return a DataFrame aligned with the hive table's schema
    */
  def compareSchema(hiveDataFrame: DataFrame, dataDataFrame: DataFrame): DataFrame = {
    hiveDataFrame.show(false)
    dataDataFrame.show(false)
    var newDataDataFrame = dataDataFrame
    // Columns present in the data but absent from the hive table are folded
    // into a single JSON string column and then dropped.
    val extraFields = dataDataFrame.schema.fieldNames.toSet -- hiveDataFrame.schema.fieldNames.toSet
    if (extraFields.nonEmpty) {
      newDataDataFrame = newDataDataFrame
        .withColumn(DataDBConstant.cd_ext_col, to_json(struct(extraFields.toSeq.map(col): _*)))
        .drop(extraFields.toList: _*)
    }
    // Serialize non-primitive columns (array / map / struct) to JSON strings.
    newDataDataFrame.schema.foreach(field => {
      val typeName = field.dataType.simpleString
      if (typeName.startsWith("array") || typeName.startsWith("map") || typeName.startsWith("struct")) {
        newDataDataFrame = newDataDataFrame.withColumn(field.name, to_json(col(field.name)))
      }
    })
    // Make sure the extension column always exists, even when no extra fields were found.
    if (!newDataDataFrame.columns.contains(DataDBConstant.cd_ext_col)) {
      newDataDataFrame = newDataDataFrame.withColumn(DataDBConstant.cd_ext_col, lit(null).cast("string"))
    }
    // Columns the hive table has but the data lacks are added as typed nulls.
    val missingFields = hiveDataFrame.schema.fieldNames.toSet -- newDataDataFrame.schema.fieldNames.toSet
    if (missingFields.nonEmpty) {
      hiveDataFrame.select(missingFields.toSeq.map(col): _*).schema.foreach(field => {
        newDataDataFrame = newDataDataFrame.withColumn(field.name, lit(null).cast(field.dataType))
      })
    }
    newDataDataFrame
  }
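
A hedged usage sketch (my own example, not from the original post; it assumes DataDBConstant.cd_ext_col resolves to a column name such as "ext_col"):

  import spark.implicits._
  // Hive-side schema has id and name; the incoming data carries an extra "score" column.
  val hiveDF = Seq((1, "a")).toDF("id", "name")
  val dataDF = Seq((1, "a", 99)).toDF("id", "name", "score")
  val aligned = compareSchema(hiveDF, dataDF)
  aligned.show(false)
  // "score" is folded into ext_col as the JSON string {"score":99},
  // so the result now matches the hive schema plus ext_col.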

 
