数据:
// Sample data: build a two-column DataFrame ("id", "json") from a single (Int, String) tuple.
// Fix: the original was missing the closing parenthesis of Seq(...) before .toDF, so it did not compile.
val eventsFromJSONDF = Seq( (0, """{"device_id": 0, "device_type": "sensor-ipad", "ip": "68.161.225.1", "cca3": "USA", "cn": "United States", "temp": 25, "signal": 23, "battery_level": 8, "c02_level": 917, "timestamp" :1475600496 }""")).toDF("id", "json") // convert to a DataFrame with two columns: "id" and "json"
------------get_json_object方法,直接从json列获取device_id数据
// get_json_object: extract a single field ("$.device_id") directly from the JSON string column.
// Fix: the original select(...) was missing its closing parenthesis, so it did not compile.
val jsDF = eventsFromJSONDF.select($"id" , get_json_object($"json", "$.device_id"))
------------from_json方法:按给定的schema把列中的JSON字符串解析为结构化列(struct列,而不是新的DataFrame),通常用在json字符串里面又嵌套了json的情况
先建表
// Schema for the device JSON payload — one entry per JSON key.
// Built by folding (name, type) pairs into an empty StructType; each .add
// appends a nullable field, exactly as the hand-written .add(...) chain did.
val jsonSchema = Seq(
  "battery_level" -> LongType,
  "c02_level"     -> LongType,
  "cca3"          -> StringType,
  "cn"            -> StringType,
  "device_id"     -> LongType,
  "device_type"   -> StringType,
  "signal"        -> LongType,
  "ip"            -> StringType,
  "temp"          -> LongType,
  "timestamp"     -> TimestampType
).foldLeft(new StructType()) { case (schema, (fieldName, fieldType)) =>
  schema.add(fieldName, fieldType)
}
// from_json: parse the JSON string held in the "device" column into a struct column
// using jsonSchema, exposed under the alias "column1".
val frame:DataFrame=eventsDS.select($"id",from_json($"device",jsonSchema) .as("column1"))// $"id" and $"device" are column selectors: they pick the id and device columns of the DataFrame.
//from_json($"device", jsonSchema) 将 device 列中的JSON字符串按 jsonSchema(即预先定义好的JSON结构)解析为结构化列。后续可以使用explode("column1")方法把数组数据"炸"出来:当一个json里面套了两个json、且两个json都是数据时,就可以将它们分别炸成多行,如下图例子:
该示例数据的 teachers 字段里有两个json对象,这时候对该列使用explode(列名)方法可以将其炸开,分成张老师和王老师两行数据显示
----------------to_json方法,将数据转换成json类型
// to_json: serialize several columns back into a single JSON string column.
val jsonDF2:DataFrame=aaa.select(to_json(struct("id","device_type","ip")).as("jsonstr")) // struct("id","device_type","ip") packs the columns into one struct value; to_json(...) then renders that struct as a JSON string.
练习
数据如下:
{"address":"中国.江苏.徐州","area":4300.21,"classess":[{"classesName":"地下矿山瓦斯检测实验室","num":10}],"level":"211","schoolName":"中国矿业大学","teachers":[{"name":"张院士","year":50},{"name":"王院士","year":60}]}
{"address":"中国.江苏.南京","area":1000.21,"classess":[{"classesName":"园林设计","num":20}],"level":"双一流","schoolName":"南京林业大学","teachers":[{"name":"张院士","year":50},{"name":"王院士","year":60}]}
尝试将其提取出来
答案代码如下
import org.apache.spark.sql.{SparkSession, types}
import org.apache.spark.sql.functions.{explode, from_json, get_json_object}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}

/**
 * Reads school records (one JSON object per line), extracts the top-level
 * fields with get_json_object, parses the nested "classess" and "teachers"
 * JSON arrays with from_json, explodes both arrays, and shows the fully
 * flattened result: one row per (class, teacher) combination.
 */
object fff {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession (local mode, all available cores).
    val spark = SparkSession.builder().appName("jsonchang").master("local[*]").getOrCreate()
    // Implicit conversions: $-column syntax, toDF, etc.
    import spark.implicits._

    // Read the raw JSON lines as plain text.
    val aaa = spark.sparkContext.textFile("D:/File/wechat/op2.log")
    // Single string column "aa", one row per JSON line.
    val bbb = aaa.toDF("aa")

    // Extract the top-level fields from each JSON string.
    // Note: "classess" and "teachers" come back as JSON-array *strings* here;
    // they are parsed into real array columns with from_json below.
    val jsonObject1 = bbb.select(
      get_json_object($"aa", "$.address").as("address"),
      get_json_object($"aa", "$.area").as("area"),
      get_json_object($"aa", "$.classess").as("classess"),
      get_json_object($"aa", "$.level").as("level"),
      get_json_object($"aa", "$.schoolName").as("schoolName"),
      get_json_object($"aa", "$.teachers").as("teachers")
    )

    // Schema of the "teachers" array: [{name, year}, ...]
    val schema: types.ArrayType = ArrayType(
      StructType(Array(
        StructField("name", StringType),
        StructField("year", IntegerType)
      ))
    )
    // Schema of the "classess" array: [{classesName, num}, ...]
    val schema2: types.ArrayType = ArrayType(
      StructType(Array(
        StructField("classesName", StringType),
        StructField("num", IntegerType)
      ))
    )

    // Parse the "classess" JSON string into an array-of-struct column.
    val jsonObject2 = jsonObject1.select(
      $"address", $"area",
      from_json($"classess", schema2).as("classess"),
      $"level", $"schoolName", $"teachers"
    )
    // Explode the classess array: one row per class.
    val jsonObject3 = jsonObject2.select(
      $"address", $"area",
      explode($"classess").as("classess"),
      $"level", $"schoolName", $"teachers"
    )
    // Flatten the class struct (classesName, num) and parse "teachers" the same way.
    val jsonObject4 = jsonObject3.select(
      $"address", $"area", $"classess.*", $"level", $"schoolName",
      from_json($"teachers", schema).as("teachers")
    )
    // Explode the teachers array: one row per (class, teacher) pair.
    val jsonObject5 = jsonObject4.select(
      $"address", $"area", $"classesName", $"num", $"level", $"schoolName",
      explode($"teachers").as("teachers")
    )

    // Display the flattened result. show() returns Unit, so the original
    // `val ddd = jsonObject5.show()` binding was dropped as meaningless.
    jsonObject5.show()

    // Optionally persist to MySQL:
    // MysqlConnect.dataFrameToMysql(jsonObject5, MysqlConnect.table, 1)

    // Release the SparkSession's resources before exiting.
    spark.stop()
  }
}