简介: 实际项目开发中,常常需要对RDD、DataFrame及Dataset之间相互转换,其中要点就是Schema约束结构信息。
1) RDD转换DataFrame或者Dataset
- 转换DataFrame时,定义Schema信息,两种方式
- 转换为Dataset时,不仅需要Schema信息,还需要RDD数据类型为CaseClass类型
2) Dataset或DataFrame转换为RDD
- 由于Dataset或DataFrame底层就是RDD,所以直接调用rdd函数即可转换
- dataframe.rdd 或者dataset.rdd
3) DataFrame与Dataset之间转换
- 由于DataFrame为Dataset特例,所以Dataset直接调用toDF函数转换为DataFrame
- 当将DataFrame转换为Dataset时,使用函数as[Type],指定CaseClass类型即可。
代码:
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
 * @author liu a fu
 * @date 2021/1/17 0017
 * @version 1.0
 * @DESC Demonstrates the conversions among RDD, DataFrame and Dataset.
 */
// Case class describing one record of person.txt (id, name, age).
// A case class is required so an RDD can be converted to a typed Dataset.
case class PerSon2(id: Int, name: String, age: Int)
object _07SparkSQLataSetChange {
  /**
   * Entry point: demonstrates every pairwise conversion among
   * RDD, DataFrame and Dataset using a small person.txt file.
   */
  def main(args: Array[String]): Unit = {
    // 1 - Environment setup: local SparkSession named after this object.
    val conf: SparkConf = new SparkConf()
      .setAppName(this.getClass.getSimpleName.stripSuffix("$"))
      .setMaster("local[*]")
    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")

    // The implicit conversions (toDF/toDS/as[T] encoders) require this import.
    import spark.implicits._

    // Read the raw text file and map each whitespace-separated line to a PerSon2.
    // NOTE(review): assumes every line has exactly 3 well-formed fields —
    // a malformed line would throw at action time; confirm the input contract.
    val peopleRDD: RDD[String] = spark.sparkContext.textFile("data/input/sql/person.txt")
    val valueRDD: RDD[PerSon2] = peopleRDD
      .map(_.split("\\s+"))
      .map(line => PerSon2(line(0).toInt, line(1), line(2).toInt))

    /**
     * Start the round-trip conversions.
     */
    // 2 - RDD -> DataFrame: the case class supplies the schema, so toDF() needs no column names.
    val peopleDF: DataFrame = valueRDD.toDF()
    peopleDF.printSchema()
    peopleDF.show()
    /**
     * +---+--------+---+
     * | id| name|age|
     * +---+--------+---+
     * | 1|zhangsan| 20|
     * | 2| lisi| 29|
     * ...
     * +---+--------+---+
     */

    // 3 - DataFrame -> RDD: a DataFrame is backed by an RDD[Row], so .rdd suffices.
    peopleDF.rdd.collect().foreach(println(_))
    /**
     * [1,zhangsan,20]
     * [2,lisi,29]
     * ...
     */

    // 4 - DataFrame -> Dataset: as[T] re-attaches the case-class type via its encoder.
    val peopleDS: Dataset[PerSon2] = peopleDF.as[PerSon2]
    peopleDS.show()

    // 5 - Dataset -> RDD: .rdd yields an RDD of the case class (order of foreach
    // output is non-deterministic because partitions print concurrently).
    peopleDS.rdd.foreach(println(_))
    /**
     * PerSon2(1,zhangsan,20)
     * PerSon2(2,lisi,29)
     * ...
     */

    // 6 - RDD -> Dataset: toDS() works directly because valueRDD holds case-class instances.
    val peopleDS2: Dataset[PerSon2] = valueRDD.toDS()
    peopleDS2.show()

    // 7 - Dataset -> DataFrame: a DataFrame is just Dataset[Row], so toDF() is enough.
    val peopleDSDF: DataFrame = peopleDS.toDF()
    peopleDSDF.show()

    // Close the SparkSession. spark.stop() stops the session AND the underlying
    // SparkContext — the previous spark.sparkContext.stop() bypassed session cleanup.
    spark.stop()
  }
}