package com.shujia.sql
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
/**
 * Demonstrates the common Spark SQL DataSource APIs: reading CSV, JSON,
 * Parquet and ORC files into DataFrames, and writing a DataFrame back out
 * in each of those formats.
 */
object Demo03SourceAPI {

  /**
   * Runs the demo against a local Spark session.
   *
   * Reads the sample student data from the `Spark/data` directory, prints a
   * few rows and schemas, then writes the CSV-sourced DataFrame to
   * `Spark/data/{csv,json,parquet,orc}`, overwriting any existing output.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("Demo03SourceAPI")
      .master("local")
      .getOrCreate()

    // Always release the session, even if one of the reads/writes fails.
    try {
      // ---------- Reading data in different formats ----------

      // 1. Text file, read through the csv source.
      val stuDF: DataFrame = spark
        .read
        .format("csv") // plain-text files are read uniformly via the csv source
        .option("sep", ",") // state the field separator explicitly for text files
        // Attach a structure (column names and types) to the data.
        .schema("id String,name String,age Int,gender String,clazz String")
        .load("Spark/data/students.txt") // path of the file to read

      stuDF.show(5)
      stuDF.printSchema() // print the structure

      // DataFrames are immutable: renaming a column yields a new DataFrame,
      // so the renamed schema has to be printed from the new value.
      val newStuDF: DataFrame = stuDF.withColumnRenamed("id", "new_id")
      newStuDF.printSchema()

      // 2. json
      spark
        .read
        .format("json")
        .load("Spark/data/stu/students.json")
        .show(5)

      // 3. parquet
      spark
        .read
        .format("parquet")
        .load("Spark/data/stu/students.parquet")
        .show(5)

      // 4. orc
      spark
        .read
        .format("orc")
        .load("Spark/data/stu/students.orc")
        .show(5)

      // 5. jdbc (disabled example; supply connection details from
      //    configuration rather than hard-coding credentials in source)
      // spark.read
      //   .format("jdbc")
      //   .option("url", "jdbc:mysql://<host>:3306/<database>?useSSL=false")
      //   .option("dbtable", "student")
      //   .option("user", "<user>")
      //   .option("password", "<password>")
      //   .load()
      //   .show()

      // ---------- Saving data ----------

      // Save modes:
      //   Append        - append to existing data
      //   Overwrite     - replace existing data
      //   ErrorIfExists - fail if the target already exists
      //   Ignore        - do nothing if the target already exists

      // Save as csv, using "|" as the field separator.
      stuDF
        .write
        .format("csv")
        .option("sep", "|")
        .mode(SaveMode.Overwrite)
        .save("Spark/data/csv") // output directory

      // Notes on the formats (from the original author):
      //  - json takes more space because the field names are stored with every record.
      //  - parquet and orc both compress the data (snappy) and also store the schema.
      //  - orc gives the highest compression ratio.
      //  - parquet is well suited to nested data.

      // Save as json.
      stuDF.write.format("json").mode(SaveMode.Overwrite).save("Spark/data/json")

      // Save as parquet.
      stuDF.write.format("parquet").mode(SaveMode.Overwrite).save("Spark/data/parquet")

      // Save as orc.
      stuDF.write.format("orc").mode(SaveMode.Overwrite).save("Spark/data/orc")
    } finally {
      spark.stop()
    }
  }
}
+----------+------+---+------+--------+
| id| name|age|gender| clazz|
+----------+------+---+------+--------+
|1500100001|施笑槐| 22| 女|文科六班|
|1500100002|吕金鹏| 24| 男|文科六班|
|1500100003|单乐蕊| 22| 女|理科六班|
|1500100004|葛德曜| 24| 男|理科三班|
|1500100005|宣谷芹| 22| 女|理科五班|
+----------+------+---+------+--------+
only showing top 5 rows

root
|-- id: string (nullable = true)
|-- name: string (nullable = true)
|-- age: integer (nullable = true)
|-- gender: string (nullable = true)
|-- clazz: string (nullable = true)

root
|-- id: string (nullable = true)
|-- name: string (nullable = true)
|-- age: integer (nullable = true)
|-- gender: string (nullable = true)
|-- clazz: string (nullable = true)
+---+--------+------+----------+------+
|age| clazz|gender| id| name|
+---+--------+------+----------+------+
| 22|文科六班| 女|1500100001|施笑槐|
| 24|文科六班| 男|1500100002|吕金鹏|
| 22|理科六班| 女|1500100003|单乐蕊|
| 24|理科三班| 男|1500100004|葛德曜|
| 22|理科五班| 女|1500100005|宣谷芹|
+---+--------+------+----------+------+
only showing top 5 rows
+----------+------+---+------+--------+
| id| name|age|gender| clazz|
+----------+------+---+------+--------+
|1500100001|施笑槐| 22| 女|文科六班|
|1500100002|吕金鹏| 24| 男|文科六班|
|1500100003|单乐蕊| 22| 女|理科六班|
|1500100004|葛德曜| 24| 男|理科三班|
|1500100005|宣谷芹| 22| 女|理科五班|
+----------+------+---+------+--------+
only showing top 5 rows
+----------+------+---+------+--------+
| id| name|age|gender| clazz|
+----------+------+---+------+--------+
|1500100001|施笑槐| 22| 女|文科六班|
|1500100002|吕金鹏| 24| 男|文科六班|
|1500100003|单乐蕊| 22| 女|理科六班|
|1500100004|葛德曜| 24| 男|理科三班|
|1500100005|宣谷芹| 22| 女|理科五班|
+----------+------+---+------+--------+
only showing top 5 rows