Developing Spark SQL in IDEA - ★★★★★
Environment Setup
Add the Spark SQL, Hive, and Thrift Server dependencies to the project's pom.xml:
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive-thriftserver_2.11</artifactId>
    <version>${spark.version}</version>
</dependency>
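The ${spark.version} placeholder assumes a matching property is defined in the pom.xml; a minimal sketch (the exact version below is only an illustration, pick any Spark 2.x release that matches the _2.11 Scala suffix):
<properties>
    <spark.version>2.4.5</spark.version>
</properties>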
Create/Obtain a DataFrame/DataSet
- As covered earlier, DataFrame and DataSet are both wrappers over RDD that add a Schema and SQL operations
- So to create a DataFrame/DataSet from an RDD, you need to attach a Schema to the RDD; there are three ways to do this:
- 1. Case class
- 2. Manually specified column names and types
- 3. Explicit StructType
Method 1: Add a Schema via a Case Class - ★★★★★
package cn.hanjiaxiaozhi.sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
* Author hanjiaxiaozhi
* Date 2020/7/23 15:18
* Desc Demonstrates creating/obtaining a DataFrame/DataSet by attaching a Schema to an RDD via a case class
*/
object CreateDFDS1 {
case class Person(id:Int,name:String,age:Int)
def main(args: Array[String]): Unit = {
//1. Create the Spark SQL entry point: a SparkSession
val spark: SparkSession = SparkSession.builder().appName("sql").master("local[*]").getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
//2. Prepare the RDD
val fileRDD: RDD[String] = sc.textFile("D:\\data\\spark\\person.txt")
//3. Associate the RDD with the case class
val personRDD: RDD[Person] = fileRDD.map(line => {
val arr: Array[String] = line.split(" ") // split each line
Person(arr(0).toInt, arr(1), arr(2).toInt)
})
//4. Convert the RDD to a DataFrame/DataSet
//Note: RDDs have existed since Spark's first release, while DataFrame arrived in a later version, so the RDD API itself has no toDF method
//To let an RDD call toDF, bring in the implicit conversions!
import spark.implicits._
val personDF: DataFrame = personRDD.toDF
//val personDS: Dataset[Person] = personRDD.toDS
//5. Operate on the DataFrame/DataSet
personDF.show(10, false) // false means no truncation: long values are not shortened with ...
personDF.printSchema()
}
}
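The commented-out toDS line above works the same way; a minimal sketch of the DataSet variant, reusing personRDD from step 3 (personDS is just an illustrative name):
//4. Convert the RDD to a typed DataSet; the case class supplies both the schema and the element type
val personDS: Dataset[Person] = personRDD.toDS()
//5. A DataSet supports the same operations, with type information preserved
personDS.show(10, false)
personDS.printSchema()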
Method 2: Add a Schema with Manually Specified Column Names and Types
package cn.hanjiaxiaozhi.sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* Author hanjiaxiaozhi
* Date 2020/7/23 15:18
* Desc Demonstrates creating/obtaining a DataFrame/DataSet by attaching a Schema to an RDD with manually specified column names and types
*/
object CreateDFDS2 {
def main(args: Array[String]): Unit = {
//1. Create the Spark SQL entry point: a SparkSession
val spark: SparkSession = SparkSession.builder().appName("sql").master("local[*]").getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
//2. Prepare the RDD
val fileRDD: RDD[String] = sc.textFile("D:\\data\\spark\\person.txt")
//3. Attach a Schema to the RDD by manually specifying the types
//tupleRDD carries the column types, but not the column names
val tupleRDD: RDD[(Int, String, Int)] = fileRDD.map(line => {
val arr: Array[String] = line.split(" ")
(arr(0).toInt, arr(1), arr(2).toInt)
})
//4. Supply the column names (completing the Schema) when converting the RDD to a DataFrame/DataSet
import spark.implicits._
val personDF: DataFrame = tupleRDD.toDF("id","name","age")
//5. Operate on the DataFrame/DataSet
personDF.show(10, false) // false means no truncation: long values are not shortened with ...
personDF.printSchema()
}
}
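If the column names are omitted, Spark falls back to the tuple's positional field names; a quick sketch (defaultDF is just an illustrative name):
//Without explicit names the columns come out as _1, _2, _3
val defaultDF: DataFrame = tupleRDD.toDF()
defaultDF.printSchema()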
Method 3: Add a Schema via an Explicit StructType
package cn.hanjiaxiaozhi.sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/**
* Author hanjiaxiaozhi
* Date 2020/7/23 15:18
* Desc Demonstrates creating/obtaining a DataFrame/DataSet by attaching a Schema to an RDD via an explicit StructType
*/
object CreateDFDS3 {
def main(args: Array[String]): Unit = {
//1. Create the Spark SQL entry point: a SparkSession
val spark: SparkSession = SparkSession.builder().appName("sql").master("local[*]").getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
//2. Prepare the RDD
val fileRDD: RDD[String] = sc.textFile("D:\\data\\spark\\person.txt")
//3. Attach a Schema to the RDD via a StructType
//First convert the RDD to an RDD[Row]
val rowRDD: RDD[Row] = fileRDD.map(line => {
val arr: Array[String] = line.split(" ")
Row(arr(0).toInt, arr(1), arr(2).toInt)
})
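//Note: Row is untyped, so the field order and types above must match the StructType defined next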
//4. Then pair the rowRDD with a StructType Schema and convert it to a DataFrame
/*val schema = StructType(
StructField("id", IntegerType, true) ::
StructField("name", StringType, true) ::
StructField("age", IntegerType, true) :: Nil
)*/
val schema = StructType(List(
StructField("id", IntegerType, true),
StructField("name", StringType, true),
StructField("age", IntegerType, true))
)
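//StructField(name, dataType, nullable): the third argument marks whether the column may contain nulls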
val personDF: DataFrame = spark.createDataFrame(rowRDD,schema)
//5. Operate on the DataFrame/DataSet
personDF.show(10, false) // false means no truncation: long values are not shortened with ...
personDF.printSchema()
}
}
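On Spark 2.3 and later, the same schema can also be built from a DDL string, which some find more readable; a sketch (schemaFromDDL and personDF2 are illustrative names; verify fromDDL is available on your Spark version):
//Equivalent schema built from a DDL string (Spark 2.3+)
val schemaFromDDL: StructType = StructType.fromDDL("id INT, name STRING, age INT")
val personDF2: DataFrame = spark.createDataFrame(rowRDD, schemaFromDDL)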
All Kinds of Queries - ★★★★★
package cn.hanjiaxiaozhi.sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* Author hanjiaxiaozhi
* Date 2020/7/23 16:12
* Desc Demonstrates all kinds of Spark SQL queries
*/
object FlowerQueryDemo {
case class Person(id: Int, name: String, age: Int)
def main(args: Array[String]): Unit = {
//Create the Spark SQL entry point: a SparkSession
val spark: SparkSession = SparkSession.builder().appName("sql").master("local[*]").getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
//Prepare the RDD
val fileRDD: RDD[String] = sc.textFile("D:\\data\\spark\\person.txt")
//Associate the RDD with the case class
val personRDD: RDD[Person] = fileRDD.map(line => {
val arr: Array[String] = line.split(" ") // split each line
Person(arr(0).toInt, arr(1), arr(2).toInt)
})
//Convert the RDD to a DataFrame/DataSet
import spark.implicits._
val personDF: DataFrame = personRDD.toDF
personDF.show(10, false) // false means no truncation: long values are not shortened with ...
personDF.printSchema()
/**
* +---+--------+---+
* |id |name |age|
* +---+--------+---+
* |1 |zhangsan|20 |
* |2 |lisi |29 |
* |3 |wangwu |25 |
* |4 |zhaoliu |30 |
* |5 |tianqi |35 |
* |6 |kobe |40 |
* +---+--------+---+
*
* root
* |-- id: integer (nullable = false)
* |-- name: string (nullable = true)
* |-- age: integer (nullable = false)
*/
//================= The DataFrame is ready; now start querying ==============
//TODO 1. DSL-style queries
//1. Select the name and age columns
personDF.select("name", "age").show(false)
//2. Select name, age, and age + 1
personDF.select($"name", $"age", $"age" + 1).show(false)
personDF.select('name, 'age, 'age + 1).show(false)
//Note: both $ and ' turn a string into a Column object, which then supports operators such as + and >
//3. Rows where age > 25
personDF.filter($"age" > 25).show(false)
personDF.filter('age > 25).show(false)
//4. Count of people with age > 25
val count: Long = personDF.filter('age > 25).count()
println("Number of people with age > 25: " + count)
//5. Count of people for each age
personDF.groupBy("age").count().show(false)
//6. Top 2 oldest people
personDF.orderBy($"age".desc).show(2, false) // desc = descending, largest first
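//Equivalently, limit the query itself instead of just the rows displayed:
//personDF.orderBy($"age".desc).limit(2).show(false)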
println("===========================================================")
//TODO 2. SQL-style queries
//0. Register a table name
//personDF.registerTempTable("t_person")//@deprecated("Use createOrReplaceTempView(viewName) instead.", "2.0.0")
//personDF.createOrReplaceGlobalTempView("t_person")//a global temp view must be queried as global_temp.t_person
personDF.createOrReplaceTempView("t_person")//creates/replaces a temporary view
//1. Select the name and age columns
val sql =
"""
|select name,age
|from t_person
|""".stripMargin
spark.sql(sql).show(false)
//2. Select name, age, and age + 1
spark.sql("select name,age,age+1 from t_person").show(false)
//3. Rows where age > 25
spark.sql("select name,age from t_person where age > 25").show(false)
//4. Count of people with age > 25
spark.sql("select count(*) from t_person where age > 25").show(false)
//5. Count of people for each age
spark.sql("select age,count(*) from t_person group by age").show(false)
//6. Top 2 oldest people
spark.sql("select name,age from t_person order by age desc limit 2").show(false)
}
}
Conversions Between RDD, DataFrame, and DataSet
package cn.hanjiaxiaozhi.sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
/**
* Author hanjiaxiaozhi
* Date 2020/7/23 16:41
* Desc Demonstrates converting back and forth between RDD, DataFrame, and DataSet
*/
object TransformDemo {
case class Person(id: Int, name: String, age: Int)
def main(args: Array[String]): Unit = {
//Create the Spark SQL entry point: a SparkSession
val spark: SparkSession = SparkSession.builder().appName("sql").master("local[*]").getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
//Prepare the RDD
val fileRDD: RDD[String] = sc.textFile("D:\\data\\spark\\person.txt")
//Associate the RDD with the case class
val personRDD: RDD[Person] = fileRDD.map(line => {
val arr: Array[String] = line.split(" ") // split each line
Person(arr(0).toInt, arr(1), arr(2).toInt)
})
//Convert the RDD to a DataFrame/DataSet
import spark.implicits._
//TODO 1.rdd-->df
val personDF: DataFrame = personRDD.toDF
personDF.show(10, false) // false means no truncation: long values are not shortened with ...
personDF.printSchema()
//TODO 2.df-->rdd
val rdd: RDD[Row] = personDF.rdd //Note: a DF has no type parameter of its own, so the resulting RDD defaults to Row elements
//TODO 3.rdd-->ds
val personDS: Dataset[Person] = personRDD.toDS()
//TODO 4.ds-->rdd
val rdd1: RDD[Person] = personDS.rdd //Note: a DS carries a type parameter, and the resulting RDD keeps the same element type
//TODO 5.ds-->df
val dataFrame: DataFrame = personDS.toDF()
//TODO 6.df-->ds
val ds: Dataset[Person] = dataFrame.as[Person]
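//Note: as[Person] needs an implicit Encoder (provided by import spark.implicits._),
//and the DataFrame's column names/types must line up with the case class fields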
//From here on, operate on them however you like, e.g.:
//personDS.show()
//personDS.printSchema()
//personDS.select("xxx")
//personDS.createOrReplaceTempView("xxx")
//Note: DataFrame has no type parameter while DataSet does; their usage is nearly identical. The finer differences will come up later; for now just know how to convert between them
}
}
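Under the hood (since Spark 2.0), DataFrame is simply a type alias for Dataset[Row], which is why the two APIs feel almost identical; the alias lives in the org.apache.spark.sql package object, roughly:
package object sql {
  type DataFrame = Dataset[Row]
}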