1 json 数据集
{"pid":123456,"aid":"abcdefg","country":"MX","places":[{"place":7,"type":"open"},{"place":10,"type":"open"},{"place":13,"type":"open"}]}
2 代码
package dir1
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
object TestShow {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkSession.builder()
.appName(this.getClass.getSimpleName.stripSuffix("$"))
.master("local[2]")
.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
val schema: StructType = new StructType()
.add("aid", StringType, true)
.add("country", StringType, true)
.add("pid", StringType, true)
.add("places", StringType, true)
val frame: DataFrame = spark.read.format("json").schema(schema).load("E:\\workspace\\test\\src\\main\\data\\test.json")
/*frame.printSchema();
frame.show(10,false);*/
/*
root
|-- aid: string (nullable = true)
|-- country: string (nullable = true)
|-- pid: string (nullable = true)
|-- places: string (nullable = true)
+-------+-------+-----+---------------------------------------------------------------------------------+
|aid |country|pid |places |
+-------+-------+-----+---------------------------------------------------------------------------------+
|abcdefg|MX |12456|[{"place":7,"type":"open"},{"place":10,"type":"open"},{"place":13,"type":"open"}]|
+-------+-------+-----+---------------------------------------------------------------------------------+
*/
//导入隐式转换
import spark.implicits._
/* val frame1: DataFrame = frame.select(
regexp_replace($"places","\\[|\\]","").as("value") //去除首尾括号
).select(
regexp_replace($"value","\\}\\,\\{","\\}\\;\\{").as("value") //转换不同json数组中间的,号为;号
).select(
split($"value","\\;").as("value") //按;号切割
).select(
explode($"value").as("value") //炸开
).select(
get_json_object($"value","$.place").as("place") //取出place字段
)*/
val frame1: DataFrame = frame.select(
$"aid",
$"country",
$"pid",
explode(split(regexp_replace(regexp_replace($"places","\\[|\\]",""),"\\}\\,\\{","\\}\\;\\{"),";")).as("col")
).select(
$"aid",
$"country",
$"pid",
get_json_object($"col","$.place").as("place")
)
frame1.printSchema()
frame1.show(10,false);
}
}