Introduction: The official Spark documentation describes two ways to convert an RDD into a DataFrame.
- The first uses reflection to infer the schema of an RDD containing objects of a specific type; it is suitable when the schema is known ahead of time.
- The second builds the schema through a programmatic interface and applies it to an existing RDD, creating the DataFrame dynamically; the columns and their types are decided at runtime.
Three ways to write the code:
- Approach 1: add a schema by specifying column names
- Approach 2: specify the schema with a StructType
- Approach 3: define a case class and let reflection infer the schema
(Approach 3 corresponds to the reflection method above; approaches 1 and 2 are variants of the programmatic method.)
Adding a schema by specifying column names:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
/**
* @author liu a fu
* @date 2021/1/17 0017
* @version 1.0
* @DESC A Spark SQL program that converts an RDD into a DataFrame --->> schema added by specifying column names; no case class, no Row
*/
object _05IrisSparkSQL {
def main(args: Array[String]): Unit = {
//1- Set up the environment
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName.stripSuffix("$")).setMaster("local[*]")
val spark: SparkSession = SparkSession
.builder()
.config(conf)
.getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
//Import implicit conversions (required for the toDF call below)
import spark.implicits._
//In Spark SQL each record is abstracted as a Row object
//Read the data
val fileRDD: RDD[String] = sc.textFile("data/input/sql/iris.data")
val valueRDD: RDD[(Double, Double, Double, Double, String)] = fileRDD.map(_.split(",")).map(x => (x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble, x(4))) //analogous to a Java lambda: (params) -> { body }
//2- Attach the schema: with no case class available, we pass the column names directly to toDF
val irisDF: DataFrame = valueRDD.toDF("sepal_length", "sepal_width", "petal_length", "petal_width", "class")
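//Note: calling toDF() with no arguments would fall back to Spark's default tuple column names _1 through _5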
//3- Display the data
irisDF.printSchema()
/**
* root
* |-- sepal_length: double (nullable = false)
* |-- sepal_width: double (nullable = false)
* |-- petal_length: double (nullable = false)
* |-- petal_width: double (nullable = false)
* |-- class: string (nullable = true)
*/
irisDF.show()
spark.stop()
}
}
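For reference, with the standard UCI iris.data file the first rows printed by show() should look roughly like this (truncated; exact values depend on your copy of the dataset):
+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      class|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+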
Specifying the schema directly with StructType and Row:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
/**
* @author liu a fu
* @date 2021/1/17 0017
* @version 1.0
* @DESC A Spark SQL program that converts an RDD into a DataFrame --->> schema specified directly with StructType and Row; uses Row
*/
object _04IrisSparkSQL {
def main(args: Array[String]): Unit = {
//1- Set up the environment
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName.stripSuffix("$")).setMaster("local[*]")
val spark: SparkSession = SparkSession
.builder()
.config(conf)
.getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
//Read the data file
val fileRDD: RDD[String] = sc.textFile("data/input/sql/iris.data")
val valueRDD: RDD[Row] = fileRDD.map(_.split(",")).map(x => Row(x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble, x(4)))
//Attach the schema: with no case class to rely on, the StructType is built programmatically at runtime
//(the two commented-out blocks below are equivalent ways to construct the same schema)
/* val schema: StructType = new StructType()
.add("sepal_length", DataTypes.DoubleType, true)
.add("sepal_width", DataTypes.DoubleType, true)
.add("petal_length", DataTypes.DoubleType, true)
.add("petal_width", DataTypes.DoubleType, true)
.add("classlabel", DataTypes.StringType, true)*/
/* val schema: StructType = StructType(
StructField("sepal_length", DataTypes.DoubleType, true) ::
StructField("sepal_width", DataTypes.DoubleType, true) ::
StructField("petal_length", DataTypes.DoubleType, true) ::
StructField("petal_width", DataTypes.DoubleType, true) ::
StructField("classlabel", DataTypes.StringType, true) :: Nil
)*/
val schema: StructType = StructType(
Array(
StructField("sepal_length", DataTypes.DoubleType, true),
StructField("sepal_width", DataTypes.DoubleType, true),
StructField("petal_length",DataTypes.DoubleType,true),
StructField("petal_width",DataTypes.DoubleType,true),
StructField("classlabel",DataTypes.StringType,true)
)
)
//2- Combine the RDD with the schema
val irisDF: DataFrame = spark.createDataFrame(valueRDD, schema)
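//Note: createDataFrame(rowRDD, schema) does not validate the Row contents eagerly; a mismatch between a Row field and the schema only fails when an action such as show() runs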
//3- Display the data
irisDF.printSchema()
irisDF.show()
spark.stop()
}
}
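Once the DataFrame exists, it can also be queried with SQL. A minimal sketch (the view name iris and the aggregation are my own choices, not part of the original program):
//Register the DataFrame as a temporary view, then query it with SQL
irisDF.createOrReplaceTempView("iris")
spark.sql("SELECT classlabel, COUNT(*) AS cnt FROM iris GROUP BY classlabel").show()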
Inferring the schema via reflection with a case class:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
/**
* @author liu a fu
* @date 2021/1/17 0017
* @version 1.0
* @DESC A Spark SQL program that converts an RDD into a DataFrame --->> schema inferred via reflection with a case class; uses a case class
*
*/
//A case class is comparable to an entity class (POJO) in Java
case class Iris(sepal_length: Double, sepal_width: Double, petal_length: Double, petal_width: Double, classlabel: String)
object _03IrisSparkSQL {
def main(args: Array[String]): Unit = {
//1- Set up the environment
val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName.stripSuffix("$")).setMaster("local[*]")
val spark: SparkSession = SparkSession
.builder()
.config(conf)
.getOrCreate()
val sc: SparkContext = spark.sparkContext
sc.setLogLevel("WARN")
//Import implicit conversions, required for the RDD ---> DataFrame conversion
import spark.implicits._
//2- Read the data
val fileRDD: RDD[String] = sc.textFile("data/input/sql/iris.data")
val splitRDD: RDD[Array[String]] = fileRDD.map(_.split(","))
val irisDF: DataFrame = splitRDD.map(x => Iris(x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble, x(4))).toDF() //column names and types are inferred from the Iris case class via reflection
//3- Display the data
irisDF.printSchema()
/**
* root
* |-- sepal_length: double (nullable = false)
* |-- sepal_width: double (nullable = false)
* |-- petal_length: double (nullable = false)
* |-- petal_width: double (nullable = false)
* |-- classlabel: string (nullable = true)
*
*/
irisDF.show() //show() displays the first 20 rows of the table by default
spark.stop()
}
}
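Because the case class is available, the same pipeline can also produce a strongly typed Dataset instead of a DataFrame. A minimal sketch reusing splitRDD from the program above (toDS also comes from spark.implicits._):
//toDS() yields a Dataset[Iris], so fields can be accessed in a type-safe way
val irisDS = splitRDD.map(x => Iris(x(0).toDouble, x(1).toDouble, x(2).toDouble, x(3).toDouble, x(4))).toDS()
irisDS.filter(_.sepal_length > 5.0).show()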