I. Add the Dependency
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.0.0</version>
</dependency>
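If the project uses sbt instead of Maven, the equivalent dependency line (a sketch of the usual sbt form) would be:

libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.0.0"

The %% operator appends the Scala binary version (here 2.12) to the artifact name automatically.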
II. Set Up the SparkSQL Runtime Environment
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Spark01_SparkSQL_Basic {
    def main(args: Array[String]): Unit = {
        // TODO Create the SparkSQL runtime environment
        val sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkSQL")
        // Create the SparkSession object
        val spark = SparkSession.builder().config(sparkConf).getOrCreate()
        // Converting RDD => DataFrame => Dataset requires importing the implicit
        // conversion rules, otherwise the conversions will not compile.
        // Note: "spark" here is not a package name but the name of the
        // SparkSession object created above.
        import spark.implicits._

        // TODO Execute logic operations

        // TODO Close the environment
        spark.stop()
    }
}
The program runs and exits cleanly, which confirms the environment is set up correctly.
III. Execute Logic Operations
1. Displaying a DataFrame
//DataFrame
val df: DataFrame = spark.read.json("datas/user.json")
df.show()
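Note that spark.read.json expects one JSON object per line (JSON Lines), not a single pretty-printed array. A hypothetical datas/user.json with the username and age fields used in this section might look like:

{"username": "zhangsan", "age": 30}
{"username": "lisi", "age": 40}

The names and ages are placeholders borrowed from the RDD example later in this section; any records with these two fields will do.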
Run result: the records from user.json, rendered as a formatted table.
2. DataFrame => SQL
//DataFrame => SQL
df.createOrReplaceTempView("user")
spark.sql("select avg(age) from user").show
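createOrReplaceTempView registers a view that is visible only in the current SparkSession. If the view needs to be shared across sessions, Spark also provides global temporary views, which live in the reserved global_temp database; a minimal sketch (user_g is an arbitrary view name chosen for this example):

df.createGlobalTempView("user_g")
spark.sql("select avg(age) from global_temp.user_g").show()
spark.newSession().sql("select avg(age) from global_temp.user_g").show()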
3. DataFrame => DSL
df.select("username", "age").show()
// When a DataFrame operation transforms column values, the conversion rules must be imported first: import spark.implicits._
import spark.implicits._
df.select($"age" + 1).show()
// or
df.select('age + 1).show()
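One DSL pitfall worth noting: as soon as a select involves a computed expression, every column in that call must use the $ or single-quote syntax; a bare string column name cannot be mixed with Column expressions in the same select. A sketch of the correct form:

df.select($"username", $"age" + 1).show()
// or, equivalently, with the single-quote syntax
df.select('username, 'age + 1).show()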
4. Displaying a Dataset
// TODO Dataset
val seq = Seq(1,2,3,4)
val ds: Dataset[Int] = seq.toDS
ds.show
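In practice a Dataset is usually built from a case class rather than a bare Int. Using the User case class defined at the end of this section, a minimal sketch (userDS is a name introduced here for illustration):

val userDS: Dataset[User] = Seq(User(1, "zhangsan", 30), User(2, "lisi", 40)).toDS()
userDS.show()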
5. Conversions Among RDD, DataFrame, and Dataset
// TODO RDD <=> DataFrame
val rdd = spark.sparkContext.makeRDD(List((1, "zhangsan", 30), (2, "lisi", 40)))
val df: DataFrame = rdd.toDF("id", "name", "age")
val rowRDD: RDD[Row] = df.rdd
// DataFrame <=> Dataset
val ds: Dataset[User] = df.as[User]
val df1: DataFrame = ds.toDF()
// RDD <=> Dataset
val ds1: Dataset[User] = rdd.map {
    case (id, name, age) => User(id, name, age)
}.toDS()
val userRDD: RDD[User] = ds1.rdd
// Release resources
spark.stop()
    }
}
case class User(id: Int, name: String, age: Int)
Note that the User case class is declared outside the main method on purpose: spark.implicits._ derives the Encoder[User] required by as[User] and toDS(), and that derivation works reliably only for top-level case classes.
Alternatively, the conversions can be written as one round trip:
// ***** RDD => DataFrame => Dataset *****
// RDD
val rdd1: RDD[(Int, String, Int)] = spark.sparkContext.makeRDD(List(
    (1, "zhangxiaoming", 30), (2, "zhouxiaolian", 24), (3, "lixiaohua", 23),
    (4, "wangxiaoli", 26), (5, "chenxiaoyan", 22)))
// DataFrame
val df1: DataFrame = rdd1.toDF("id", "name", "age")
//df1.show()
// Dataset
val ds1: Dataset[User] = df1.as[User]
//ds1.show()
// ***** Dataset => DataFrame => RDD *****
// DataFrame
val df2: DataFrame = ds1.toDF()
// The resulting RDD's element type is Row; its getXXX methods retrieve field
// values, much like processing a JDBC result set, except the index starts at 0.
val rdd2: RDD[Row] = df2.rdd
//rdd2.foreach(a => println(a.getString(1)))
// ***** RDD => Dataset *****
rdd1.map {
    case (id, name, age) => User(id, name, age)
}.toDS()
// ***** Dataset => RDD *****
ds1.rdd
// Release resources
spark.stop()
    }
}
case class User(id: Int, name: String, age: Int)
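To recap, the conversions demonstrated in this section:

RDD => DataFrame: rdd.toDF("id", "name", "age")
RDD => Dataset: rdd.map { case (id, name, age) => User(id, name, age) }.toDS()
DataFrame => Dataset: df.as[User]
DataFrame => RDD: df.rdd (element type Row)
Dataset => DataFrame: ds.toDF()
Dataset => RDD: ds.rdd (element type User)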