好了,废话不多说。上一篇文章已经详细讲解了 Spark SQL 的基本概念,这次通过一个完整的代码示例看一下它的基本用法(代码很简单易懂,如有补充或不足之处,欢迎指教)。
Spark SQL的详细介绍
这是源文件 person.txt 的内容(每行依次为 id、name、age,以单个空格分隔):
1 zhangsan 20
2 lisi 29
3 wangwu 25
4 zhaoliu 30
5 tianqi 35
6 kobe 40
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
case class Person(id:Int,name:String,age:Int) //样例类
object Test {
def main(args: Array[String]): Unit = {
  // 1. Build the SparkSession, specifying the app name and the master address.
  val spark: SparkSession = SparkSession.builder()
    .appName("InferringSchema")
    .master("local[2]")
    .getOrCreate()

  // 2. Get the SparkContext from the SparkSession and reduce log noise.
  val sc: SparkContext = spark.sparkContext
  sc.setLogLevel("WARN")

  // 3. Load the raw text data (one person per line: "id name age").
  val dataRDD: RDD[String] = sc.textFile("person.txt")

  // 4. Split each line on single spaces.
  // NOTE(review): assumes every line is well-formed ("id name age",
  // single-space separated); a blank or malformed line will make the
  // toInt calls below throw at job execution time.
  val lineArrayRDD: RDD[Array[String]] = dataRDD.map(_.split(" "))

  // 5. Map each split line onto the Person case class.
  val personRDD: RDD[Person] = lineArrayRDD.map(x => Person(x(0).toInt, x(1), x(2).toInt))

  // 6. Convert the RDD to a DataFrame; requires the session's implicits.
  import spark.implicits._
  val personDF: DataFrame = personRDD.toDF()
  // personDF.show()

  // Show the schema inferred from the case class.
  personDF.printSchema()
  /* root
     |-- id: integer (nullable = false)
     |-- name: string (nullable = true)
     |-- age: integer (nullable = false) */

  // Number of rows in the DataFrame.
  println(personDF.count()) // 6

  // All column names.
  personDF.columns.foreach(print) // id name age

  // First row.
  println(personDF.head()) // [1,zhangsan,20]

  // All values of the "name" column.
  personDF.select("name").show()
  /* +--------+
     |    name|
     +--------+
     |zhangsan|
     |    lisi|
     |  wangwu|
     | zhaoliu|
     |  tianqi|
     |    kobe|
     +--------+ */

  // Rows whose age is greater than 30.
  personDF.filter($"age" > 30).show()
  /* +---+------+---+
     | id|  name|age|
     +---+------+---+
     |  5|tianqi| 35|
     |  6|  kobe| 40|
     +---+------+---+ */

  // Count of rows whose age is greater than 30.
  println(personDF.filter($"age" > 30).count()) // 2

  // Group by age and count the size of each group.
  // FIX: the original ran this exact query twice (its steps 9 and 10 were
  // a copy-paste duplicate); the redundant second call has been removed.
  personDF.groupBy("age").count().show()
  /* +---+-----+
     |age|count|
     +---+-----+
     | 20|    1|
     | 40|    1|
     | 35|    1|
     | 25|    1|
     | 29|    1|
     | 30|    1|
     +---+-----+ */

  // Register the DataFrame as a temporary view so it can be queried with SQL.
  personDF.createOrReplaceTempView("t_person")

  // Run SQL statements against the registered view.
  spark.sql("select * from t_person").show()
  /* +---+--------+---+
     | id|    name|age|
     +---+--------+---+
     |  1|zhangsan| 20|
     |  2|    lisi| 29|
     |  3|  wangwu| 25|
     |  4| zhaoliu| 30|
     |  5|  tianqi| 35|
     |  6|    kobe| 40|
     +---+--------+---+ */

  spark.sql("select * from t_person where name='zhangsan'").show()
  /* +---+--------+---+
     | id|    name|age|
     +---+--------+---+
     |  1|zhangsan| 20|
     +---+--------+---+ */

  spark.sql("select * from t_person order by age desc").show()
  /* +---+--------+---+
     | id|    name|age|
     +---+--------+---+
     |  6|    kobe| 40|
     |  5|  tianqi| 35|
     |  4| zhaoliu| 30|
     |  2|    lisi| 29|
     |  3|  wangwu| 25|
     |  1|zhangsan| 20|
     +---+--------+---+ */

  // Stop the session (also stops the underlying SparkContext).
  spark.stop()
}