Creating a DataFrame/DataSet
Spark will try to infer the schema of a DataFrame/DataSet from the file's information, but we can also specify the schema manually, in one of the following ways:
Method 1: add a schema by specifying column names
Method 2: specify the schema with a StructType
Method 3: define a case class and let reflection infer the schema
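All three variants below read the same input file. Its actual contents are not shown in the original; a hypothetical person.txt consistent with the parsing code (one record per line, space-separated id, name, and age) might look like this:

1 zhangsan 20
2 lisi 29
3 wangwu 25
4 zhaoliu 30
5 tianqi 35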
Adding a Schema by Specifying Column Names
package cn.itcast.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDFDS {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2. Read the file
    val fileRDD: RDD[String] = sc.textFile("D:\\data\\person.txt")
    val linesRDD: RDD[Array[String]] = fileRDD.map(_.split(" "))
    val rowRDD: RDD[(Int, String, Int)] = linesRDD.map(line => (line(0).toInt, line(1), line(2).toInt))
    // 3. Convert the RDD to a DataFrame
    // Note: RDD has no toDF method of its own; it is added through the implicit conversions in spark.implicits
    import spark.implicits._
    val personDF: DataFrame = rowRDD.toDF("id", "name", "age")
    personDF.show(10)
    personDF.printSchema()
    sc.stop()
    spark.stop()
  }
}
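For reference, with this tuple-based toDF the printed schema should look roughly like the following (column names come from the toDF arguments; the primitive Int columns come out non-nullable):

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)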
Specifying the Schema with StructType (for reference)
package cn.itcast.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object CreateDFDS2 {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2. Read the file
    val fileRDD: RDD[String] = sc.textFile("D:\\data\\person.txt")
    val linesRDD: RDD[Array[String]] = fileRDD.map(_.split(" "))
    val rowRDD: RDD[Row] = linesRDD.map(line => Row(line(0).toInt, line(1), line(2).toInt))
    // 3. Convert the RDD to a DataFrame with an explicit schema
    // Note: import spark.implicits._ is not needed here, since createDataFrame
    // takes the RDD[Row] and schema directly instead of using toDF
    val schema: StructType = StructType(Seq(
      StructField("id", IntegerType, true),   // true = nullable
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)
    ))
    val personDF: DataFrame = spark.createDataFrame(rowRDD, schema)
    personDF.show(10)
    personDF.printSchema()
    sc.stop()
    spark.stop()
  }
}
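As a side note, recent Spark versions (2.2+, if I recall correctly) can also build the same StructType from a DDL string, which saves the field-by-field boilerplate; a minimal sketch under that assumption:

val schemaFromDDL: StructType = StructType.fromDDL("id INT, name STRING, age INT")
val personDF2: DataFrame = spark.createDataFrame(rowRDD, schemaFromDDL)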
Inferring the Schema via Reflection (the one to master)
package cn.itcast.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object CreateDFDS3 {
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2. Read the file
    val fileRDD: RDD[String] = sc.textFile("D:\\data\\person.txt")
    val linesRDD: RDD[Array[String]] = fileRDD.map(_.split(" "))
    val personRDD: RDD[Person] = linesRDD.map(line => Person(line(0).toInt, line(1), line(2).toInt))
    // 3. Convert the RDD to a DataFrame
    import spark.implicits._
    // Note: personRDD's element type is Person, which carries the schema information,
    // so Spark SQL can obtain it via reflection and attach it to the DataFrame
    val personDF: DataFrame = personRDD.toDF
    personDF.show(10)
    personDF.printSchema()
    sc.stop()
    spark.stop()
  }
}
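One caveat: for the reflection-based approach the case class should be defined at the top level or inside an object (as above), not inside main, or the compiler cannot derive the implicit Encoder for it. An equivalent sketch that skips toDF and passes the typed RDD straight to the session (the schema is still inferred from Person via reflection):

val personDF2: DataFrame = spark.createDataFrame(personRDD)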
Assorted Queries (SQL and DSL)
package cn.itcast.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object QueryDemo {
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2. Read the file
    val fileRDD: RDD[String] = sc.textFile("D:\\data\\person.txt")
    val linesRDD: RDD[Array[String]] = fileRDD.map(_.split(" "))
    val personRDD: RDD[Person] = linesRDD.map(line => Person(line(0).toInt, line(1), line(2).toInt))
    // 3. Convert the RDD to a DataFrame (schema inferred from Person via reflection)
    import spark.implicits._
    val personDF: DataFrame = personRDD.toDF
    personDF.show(10)
    personDF.printSchema()

    // ======================= SQL-style queries =======================
    // 0. Register a temporary view
    personDF.createOrReplaceTempView("t_person")
    // 1. Select all rows
    spark.sql("select * from t_person").show()
    // 2. Select age and age + 1
    spark.sql("select age, age + 1 from t_person").show()
    // 3. The two oldest people
    spark.sql("select name, age from t_person order by age desc limit 2").show()
    // 4. Number of people per age
    spark.sql("select age, count(*) from t_person group by age").show()
    // 5. People older than 30
    spark.sql("select * from t_person where age > 30").show()

    // ======================= DSL-style queries =======================
    // 1. Select the name and age columns
    personDF.select("name", "age").show()
    // 2. Select name and age + 1
    personDF.select($"name", $"age" + 1).show()
    // 3. The two oldest people
    personDF.sort($"age".desc).show(2)
    // 4. Number of people per age
    personDF.groupBy("age").count().show()
    // 5. People older than 30
    personDF.filter($"age" > 30).show()

    sc.stop()
    spark.stop()
  }
}
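The DSL queries above also have string-expression equivalents via selectExpr/where, which some people find more readable; a brief sketch against the same personDF (the functions import is shown for completeness):

import org.apache.spark.sql.functions._
personDF.selectExpr("name", "age + 1 as agePlusOne").show()
personDF.where("age > 30").show()
personDF.orderBy(desc("age")).limit(2).show()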
Converting Between RDD, DataFrame, and Dataset
There are quite a few conversions among RDD, DataFrame, and Dataset (6 in total), but in practice we perform only two kinds of operations:
1) operating on the data with RDD operators
2) operating on tables with DSL/SQL
package cn.itcast.sql

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object TransformDemo {
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2. Read the file
    val fileRDD: RDD[String] = sc.textFile("D:\\data\\person.txt")
    val linesRDD: RDD[Array[String]] = fileRDD.map(_.split(" "))
    val personRDD: RDD[Person] = linesRDD.map(line => Person(line(0).toInt, line(1), line(2).toInt))
    // personRDD's element type is Person, which carries the schema,
    // so Spark SQL can obtain it via reflection
    import spark.implicits._

    // ========================= Conversions ======================
    // 1. RDD --> DF
    val personDF: DataFrame = personRDD.toDF
    // 2. DF --> RDD (the element type degrades to the untyped Row)
    val rowRDD: RDD[Row] = personDF.rdd
    // 3. RDD --> DS
    val personDS: Dataset[Person] = personRDD.toDS()
    // 4. DS --> RDD
    val personRDD2: RDD[Person] = personDS.rdd
    // 5. DF --> DS
    val personDS2: Dataset[Person] = personDF.as[Person]
    // 6. DS --> DF
    val personDF2: DataFrame = personDS2.toDF()

    sc.stop()
    spark.stop()
  }
}
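The six conversions are easier to remember once you know that in Spark 2.x a DataFrame is literally a type alias, declared in the org.apache.spark.sql package object:

type DataFrame = Dataset[Row]

So DF --> DS (as[Person]) merely attaches a typed Encoder to the rows, and DS --> DF (toDF()) discards it again.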
WordCount with Spark SQL
SQL Style
package cn.itcast.sql

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object WordCount {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2. Read the file
    val fileDF: DataFrame = spark.read.text("D:\\data\\words.txt")
    val fileDS: Dataset[String] = spark.read.textFile("D:\\data\\words.txt")
    //fileDF.show()
    //fileDS.show()
    // 3. Split each line on spaces and flatten
    //fileDF.flatMap(_.split(" ")) // Wrong: a DataFrame is untyped, so _ is a Row, not a String
    import spark.implicits._
    val wordDS: Dataset[String] = fileDS.flatMap(_.split(" ")) // Correct: a Dataset is typed, so _ is a String
    //wordDS.show()
    /*
    +-----+
    |value|
    +-----+
    |hello|
    |   me|
    |hello|
    |  you|
      ...
    */
    // 4. Run the word count
    wordDS.createOrReplaceTempView("t_word")
    val sql =
      """
        |select value, count(value) as count
        |from t_word
        |group by value
        |order by count desc
      """.stripMargin
    spark.sql(sql).show()
    sc.stop()
    spark.stop()
  }
}
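For reference, spark.read.text yields a DataFrame whose schema starts with a single string column named value, which is why the temp view above can be grouped by value; its printed schema is:

root
 |-- value: string (nullable = true)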
DSL Style
package cn.itcast.sql

import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object WordCount2 {
  def main(args: Array[String]): Unit = {
    // 1. Create the SparkSession
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("SparkSQL").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    sc.setLogLevel("WARN")
    // 2. Read the file
    val fileDF: DataFrame = spark.read.text("D:\\data\\words.txt")
    val fileDS: Dataset[String] = spark.read.textFile("D:\\data\\words.txt")
    //fileDF.show()
    //fileDS.show()
    // 3. Split each line on spaces and flatten
    //fileDF.flatMap(_.split(" ")) // Wrong: a DataFrame is untyped, so _ is a Row, not a String
    import spark.implicits._
    val wordDS: Dataset[String] = fileDS.flatMap(_.split(" ")) // Correct: a Dataset is typed, so _ is a String
    //wordDS.show()
    /*
    +-----+
    |value|
    +-----+
    |hello|
    |   me|
    |hello|
    |  you|
      ...
    */
    // 4. Run the word count
    wordDS.groupBy("value").count().orderBy($"count".desc).show()
    sc.stop()
    spark.stop()
  }
}
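For comparison, the same word count can also be done straight on the untyped fileDF using the built-in split and explode functions, with no Dataset[String] needed; a minimal sketch, assuming the same fileDF as above:

import org.apache.spark.sql.functions._
fileDF.select(explode(split(col("value"), " ")).as("word")) // one row per word
  .groupBy("word")
  .count()
  .orderBy(desc("count"))
  .show()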