使用idea进行编程操作
创建文件 data/movie1.txt
{"mid":1,"rate":6,"uid":"u001","ts":15632433243} {"mid":1,"rate":4,"uid":"u002","ts":15632433263} {"mid":1,"rate":5,"uid":"u003","ts":15632403263} {"mid":1,"rate":3,"uid":"u004","ts":15632403963} {"mid":1,"rate":4,"uid":"u004","ts":15632403963} {"mid":2,"rate":5,"uid":"u001","ts":15632433243} {"mid":2,"rate":4,"uid":"u002","ts":15632433263} {"mid":2,"rate":7,"uid":"u005","ts":15632403963} {"mid":2,"rate":6,"uid":"u005","ts":15632403963} {"mid":3,"rate":2,"uid":"u001","ts":15632433243} {"mid":3,"rate":3,"uid":"u005","ts":15632403963} {"mid":3,"rate":8,"uid":"u005","ts":15632403963} {"mid":3,"rate":7,"uid":"u005","ts":15632403963}
1. 导入依赖
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<spark.version>3.2.3</spark.version>
<scala.version>2.12.12</scala.version>
</properties>
<dependencies>
<!-- scala的依赖 -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<!-- spark core 即为spark内核 ,其他高级组件都要依赖spark core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
<!--spark sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>${spark.version}</version>
</dependency>
</dependencies>
2.通过读取JSON格式的的文件获取数据
2.1 获取spark-sql的编程会话
//1 获取spark的会话
val session: SparkSession = SparkSession
.builder()
.master("local[*]")
.appName("sp")
.getOrCreate()
2.2 使用会话对象读取JSON文件
// 加载结构化数据 eg:加载json
val df: DataFrame = session.read.json("data/movie1.txt")
// df DataFrame 是spark-sql的核心编程抽象,
// 其数据结构是 Schema + RDD 【Schema是表数据的表述,即表结构】
df.show() // 展示数据
df.printSchema() //打印表结构
2.3 结果解释
由于JSON文件本身是具备结构的,因此在进行读取的时候,会做字段和字段类型的推断
/**
* df的结构
* root
|-- mid: long (nullable = true)
|-- rate: long (nullable = true)
|-- ts: long (nullable = true)
|-- uid: string (nullable = true)
*/
2.4 使用SQL进行数据处理
//将解析JSON的df 映射成一张临时表
df.createTempView("tb_json")
// 通过sql来操作df
val res: DataFrame = session.sql(
"""
|select *
|from tb_json
|where _corrupt_record is null
|""".stripMargin)
res.show()
2.5 释放资源
session.stop() // 释放资源
2.6 注意事项
当读取的JSON文件中有破损的JSON数据,或者JSON中的数据不一致的情况
例如如下数据 movie1.txt
{"mid":1,"rate":6,"uid":"u001","ts":15632433243,"name":"xxx"} {"mid":1,"rate":4,"uid":"u002","ts":15632433263} {"mid":1,"rate":5,"uid":"u003","ts":15632403263} {"mid":1,"rate":3,"uid":"u004","ts":15632403963} {"mid":1,"rate":4,"uid":"u004","ts":15632403963} {"mid":2,"rate":5,"uid":"u001","ts":15632433243} {"mid":2,"rate":4,"uid":"u002","ts":15632433263} {"mid":2,"rate":5,"uid": {"mid":2,"rate":7,"uid":"u005","ts":15632403963} {"mid":2,"rate":6,"uid":"u005","ts":15632403963} {"mid":3,"rate":2,"uid":"u001","ts":15632433243} {"mid":3,"rate":1,"uid":"u002","ts":15632433263} {"mid":3,"rate":3,"uid":"u005","ts":15632403963} {"mid":3,"rate":8,"uid":"u005","ts":15632403963} {"mid":3,"rate":7,"uid":"u005","ts":15632403963}
按照上述方式进行读取并且展示
可以发现如下结论:
json 如果数据格式不正确 在df的结构中开头会多一列 _corrupt_record 当该字段的值为null时,表示数据格式正确
json 表字段按照JSON所有的key进行解析,没有该字段的数据,将被赋予null值
+--------------------+----+----+----+-----------+----+
| _corrupt_record| mid|name|rate| ts| uid|
+--------------------+----+----+----+-----------+----+
| null| 1| xxx| 6|15632433243|u001|
| null| 1|null| 4|15632433263|u002|
| null| 1|null| 5|15632403263|u003|
| null| 1|null| 3|15632403963|u004|
| null| 1|null| 4|15632403963|u004|
| null| 2|null| 5|15632433243|u001|
| null| 2|null| 4|15632433263|u002|
|{"mid":2,"rate":5...|null|null|null| null|null|
| null| 2|null| 7|15632403963|u005|
| null| 2|null| 6|15632403963|u005|
| null| 3|null| 2|15632433243|u001|
| null| 3|null| 1|15632433263|u002|
| null| 3|null| 3|15632403963|u005|
| null| 3|null| 8|15632403963|u005|
| null| 3|null| 7|15632403963|u005|
+--------------------+----+----+----+-----------+----+
root
|-- _corrupt_record: string (nullable = true)
|-- mid: long (nullable = true)
|-- name: string (nullable = true)
|-- rate: long (nullable = true)
|-- ts: long (nullable = true)
|-- uid: string (nullable = true)
可以根据_corrupt_record 是否为null 进行数据过滤
df.createTempView("tb_json")
val res: DataFrame = session.sql(
"""
|select
| mid,
| rate,
| ts,
| uid
|from tb_json
|where _corrupt_record is null
|""".stripMargin)
res.show()
结果如下:
+---+----+-----------+----+
|mid|rate| ts| uid|
+---+----+-----------+----+
| 1| 6|15632433243|u001|
| 1| 4|15632433263|u002|
| 1| 5|15632403263|u003|
| 1| 3|15632403963|u004|
| 1| 4|15632403963|u004|
| 2| 5|15632433243|u001|
| 2| 4|15632433263|u002|
| 2| 7|15632403963|u005|
| 2| 6|15632403963|u005|
| 3| 2|15632433243|u001|
| 3| 1|15632433263|u002|
| 3| 3|15632403963|u005|
| 3| 8|15632403963|u005|
| 3| 7|15632403963|u005|
+---+----+-----------+----+