Spark: converting a JSON string into a DataFrame, and converting DataFrame structs back into JSON
package mongo

import org.apache.spark.sql.SparkSession

object InsertTable {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").config("hive.metastore.uris", "thrift://10.200.102.230:9083")
      .enableHiveSupport().getOrCreate()
    import spark.implicits._
    val jsonStr = """{"t":"tt","k":"kkk","v":"[100]","t1":"bb","animal_interpretation":{"is_large_animal":"ddd","is_mammal":false}}"""
    // If a single record's fields don't match the table schema (either at the top level or inside the
    // json -> struct field), the write fails.
    // If one record is complete and the others are missing fields, the missing fields are filled with null
    // and no error is thrown; it's unclear whether this relies on sampling the full dataset.
    // val jsonStr2 = """{"t":"tt","k":"kkk","v":"[100]","t1":"bb","animal_interpretation":{"is_large_animal":"ddd"}}"""
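    // Minimal sketch of the null-fill behaviour described above (jsonStrShort is introduced here only for
    // illustration): the second record is missing "is_mammal", and schema inference over both records
    // fills that field with null instead of failing.
    val jsonStrShort = """{"t":"tt","k":"kkk","v":"[100]","t1":"bb","animal_interpretation":{"is_large_animal":"ddd"}}"""
    spark.read.json(Seq(jsonStr, jsonStrShort).toDS()).show(false)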
    val msgdf = Seq(jsonStr).toDF("msg")
    msgdf.createOrReplaceTempView("msg_table")
    val jsonDF = spark.sql("select msg from msg_table")
    val rdd = jsonDF.rdd.map(_.getString(0))
    rdd.toDS().show(false)
    //+--------------------------------------------------------------------------------------------------------------+
    //|value                                                                                                         |
    //+--------------------------------------------------------------------------------------------------------------+
    //|{"t":"tt","k":"kkk","v":"[100]","t1":"bb","animal_interpretation":{"is_large_animal":"ddd","is_mammal":false}}|
    //+--------------------------------------------------------------------------------------------------------------+
    var jsonDataDF = spark.read.json(rdd.toDS())
    jsonDataDF.show(false)
    jsonDataDF.printSchema()
    //+---------------------+---+---+---+-----+
    //|animal_interpretation|k  |t  |t1 |v    |
    //+---------------------+---+---+---+-----+
    //|[ddd, false]         |kkk|tt |bb |[100]|
    //+---------------------+---+---+---+-----+
    //test
    import org.apache.spark.sql.functions._
    jsonDataDF = jsonDataDF.drop("k").withColumn("K", lit("kkkkk"))
    // flatten the json struct into top-level columns
    jsonDataDF.select($"animal_interpretation.*").show(false)
    //+---------------+---------+
    //|is_large_animal|is_mammal|
    //+---------------+---------+
    //|            ddd|    false|
    //+---------------+---------+
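    // a single nested field can also be selected with dot notation (sketch, same DataFrame as above)
    jsonDataDF.select($"animal_interpretation.is_large_animal", $"animal_interpretation.is_mammal").show(false)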
    jsonDataDF.printSchema()
    //root
    // |-- animal_interpretation: struct (nullable = true)
    // |    |-- is_large_animal: string (nullable = true)
    // |    |-- is_mammal: boolean (nullable = true)
    // |-- t: string (nullable = true)
    // |-- t1: string (nullable = true)
    // |-- v: string (nullable = true)
    // |-- K: string (nullable = false)
    // write mode: "overwrite" or "append"
    // jsonDataDF.write.mode("append").saveAsTable("ods_db.testsvae")
  }
}
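For the second half of the title, turning a DataFrame struct column back into a JSON string, to_json is enough. A minimal sketch, assuming the jsonDataDF from the example above is still in scope:

import org.apache.spark.sql.functions.{col, struct, to_json}

// serialize just the struct column into a JSON string column
val structAsJson = jsonDataDF.withColumn("animal_interpretation", to_json(col("animal_interpretation")))
structAsJson.show(false)

// or pack every column of the row into a single JSON string column
val rowAsJson = jsonDataDF.select(to_json(struct(jsonDataDF.columns.map(col(_)): _*)).as("json_str"))
rowAsJson.show(false)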
A small utility

// DataDBConstant.cd_ext_col is a project-specific constant holding the name of the ext_col column.
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, lit, struct, to_json}

/**
 * 1. Compare the two DataFrames: columns that the real source-data DataFrame has but the Hive table
 *    DataFrame lacks are serialized to a JSON string and stored in the ext_col column.
 * 2. Non-primitive columns (array / map / struct) are converted to JSON strings.
 *
 * @param hiveDataFrame DataFrame carrying the Hive table schema
 * @param dataDataFrame DataFrame built from the source data
 * @return the data DataFrame aligned to the Hive table schema
 */
def compareSchema(hiveDataFrame: DataFrame, dataDataFrame: DataFrame): DataFrame = {
  hiveDataFrame.show(false)
  dataDataFrame.show(false)
  var newDataDataFrame = dataDataFrame

  // columns that exist in the source data but not in the Hive table: pack them into ext_col as a JSON string
  val extraColumns = dataDataFrame.schema.fieldNames.toSet -- hiveDataFrame.schema.fieldNames.toSet
  if (extraColumns.nonEmpty) {
    newDataDataFrame = newDataDataFrame
      .withColumn(DataDBConstant.cd_ext_col, to_json(struct(extraColumns.toSeq.map(name => col(name)): _*)))
      .drop(extraColumns.toSeq: _*)
  }

  // non-primitive columns (array / map / struct) are serialized to JSON strings
  newDataDataFrame.schema.foreach { field =>
    val typeName = field.dataType.simpleString
    if (typeName.startsWith("array") || typeName.startsWith("map") || typeName.startsWith("struct")) {
      newDataDataFrame = newDataDataFrame.withColumn(field.name, to_json(col(field.name)))
    }
  }

  // make sure ext_col always exists, even when there were no extra columns
  if (!newDataDataFrame.columns.contains(DataDBConstant.cd_ext_col)) {
    newDataDataFrame = newDataDataFrame.withColumn(DataDBConstant.cd_ext_col, lit(null).cast("string"))
  }

  // columns the Hive table has but the source data lacks: add them as typed null columns
  val missingColumns = hiveDataFrame.schema.fieldNames.toSet -- newDataDataFrame.schema.fieldNames.toSet
  if (missingColumns.nonEmpty) {
    hiveDataFrame.schema
      .filter(field => missingColumns.contains(field.name))
      .foreach { field =>
        newDataDataFrame = newDataDataFrame.withColumn(field.name, lit(null).cast(field.dataType))
      }
  }
  newDataDataFrame
}
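A quick usage sketch of the utility (hypothetical: it reuses jsonDataDF and the table name from the example above, and assumes DataDBConstant.cd_ext_col is defined in the project):

// align the source data with the Hive table's schema, then write it as in the example above
val hiveDF = spark.table("ods_db.testsvae")
val alignedDF = compareSchema(hiveDF, jsonDataDF)
alignedDF.show(false)
// alignedDF.write.mode("append").saveAsTable("ods_db.testsvae")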