// Conversions among Spark RDD, Dataset, and DataFrame
package com.spark.sql
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql._
object Rdd2DataFrame {
  // point Spark at a local Hadoop install (needed for winutils on Windows)
  System.setProperty("hadoop.home.dir", "d:/soft/hadoop/hadoop-2.9.2")
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("rdd2dataframe").getOrCreate()
    // expects lines of the form "<name> <age>", separated by a single space
    val lineRdd = spark.sparkContext.textFile("G:\\people.txt")
    // 1. RDD -> DataFrame
    val df = rdd2DataFrame(lineRdd, spark)
    // 2. DataFrame -> RDD
    dataFrame2Rdd(df)
    // 3. RDD -> Dataset
    val ds = rdd2Dataset(lineRdd, spark)
    // 4. Dataset -> RDD
    dataset2Rdd(ds)
    // 5. DataFrame -> Dataset
    val ds1 = dataFrame2Dataset(df)
    // 6. Dataset -> DataFrame
    dataset2DataFrame(ds1)
    spark.close()
  }
  // 1. RDD -> DataFrame
  def rdd2DataFrame(lineRdd: RDD[String], spark: SparkSession): DataFrame = {
    // Approach 1: programmatic schema (build Rows and a StructType at runtime)
    // val personRDD = lineRdd.map(line => Row(line.split(" ")(0), line.split(" ")(1).toInt))
    // val fields = Seq(StructField("name", StringType), StructField("age", IntegerType))
    // val schema = StructType(fields)
    // val personDF = spark.createDataFrame(personRDD, schema)
    //
    // personDF.show()

    // Approach 2: reflection-based type inference (case class Person(name: String, age: Int))
    // (1) Let the case class supply the column names via reflection:
    // val personRdd = lineRdd.map(line => Person(line.split(" ")(0), line.split(" ")(1).toInt))
    // import spark.implicits._
    // val personDF = personRdd.toDF
    // (2) Without a case class: name the columns directly
    val personRDD = lineRdd.map(line => (line.split(" ")(0), line.split(" ")(1).toInt))
    import spark.implicits._
    val personDF = personRDD.toDF("name", "age")
    personDF.show()
    // return the DataFrame
    personDF
  }
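  // A runnable sketch (not in the original) of the "programmatic schema"
  // approach commented out above: build Row objects and a StructType by hand,
  // then pass both to spark.createDataFrame. Column names and types mirror
  // the reflection-based version.
  def rdd2DataFrameWithSchema(lineRdd: RDD[String], spark: SparkSession): DataFrame = {
    val rowRdd = lineRdd.map { line =>
      val fields = line.split(" ")
      Row(fields(0), fields(1).toInt)
    }
    val schema = StructType(Seq(
      StructField("name", StringType),
      StructField("age", IntegerType)
    ))
    spark.createDataFrame(rowRdd, schema)
  }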
  // 2. DataFrame -> RDD
  def dataFrame2Rdd(df: DataFrame): Unit = {
    val rdd = df.rdd // yields RDD[Row]: each element is an untyped Row
    rdd.foreach(println)
  }
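  // A small sketch (not in the original): since df.rdd produces untyped Row
  // objects, fields are read back by name (or position) via Row.getAs:
  def dataFrame2TypedRdd(df: DataFrame): Unit = {
    val rdd = df.rdd.map(row => (row.getAs[String]("name"), row.getAs[Int]("age")))
    rdd.foreach(println)
  }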
  // 3. RDD -> Dataset
  def rdd2Dataset(lineRdd: RDD[String], spark: SparkSession): Dataset[Person] = {
    import spark.implicits._
    // val df = List(1, 2, 3, 4, 5).toDF("num")
    // val ds: Dataset[Int] = List(1, 2, 3, 4, 5).toDS()
    val personRdd = lineRdd.map(line => Person(line.split(" ")(0), line.split(" ")(1).toInt))
    // Kryo serialization:
    // implicit val personEncoder = org.apache.spark.sql.Encoders.kryo[Person]
    // Encoder-based serialization (the approach Spark SQL recommends):
    // implicit val personEncoder: Encoder[Person] = ExpressionEncoder()
    // val personDS = spark.createDataset(personRdd)
    val personDS = personRdd.toDS()
    personDS.filter(p => p.age > 20).show()
    personDS.show()
    personDS
  }
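  // An alternative sketch (not in the original) that passes an explicit
  // encoder instead of relying on the implicits import; Encoders.product
  // derives an Encoder for any case class:
  def rdd2DatasetExplicit(lineRdd: RDD[String], spark: SparkSession): Dataset[Person] = {
    val personRdd = lineRdd.map(line => Person(line.split(" ")(0), line.split(" ")(1).toInt))
    spark.createDataset(personRdd)(Encoders.product[Person])
  }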
  // 4. Dataset -> RDD
  def dataset2Rdd(ds: Dataset[Person]): Unit = {
    val rdd = ds.rdd // yields RDD[Person]: elements keep their case-class type
    rdd.foreach(println)
  }
  // 5. DataFrame -> Dataset
  def dataFrame2Dataset(df: DataFrame): Dataset[Person] = {
    implicit val personEncoder: Encoder[Person] = ExpressionEncoder()
    val personDS = df.as[Person]
    personDS.show()
    personDS
  }
  // 6. Dataset -> DataFrame
  def dataset2DataFrame(ds: Dataset[Person]): Unit = {
    val df = ds.toDF("name", "age") // ds.toDF() would keep the case-class field names
    df.show()
  }
}
// The case class must be defined at the top level (outside the methods that
// use it) so Spark can derive an Encoder for it via reflection.
case class Person(name: String, age: Int)
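// A guess at what G:\people.txt contains, based on how the code parses it
// (one "<name> <age>" pair per line); the actual file is not shown in the
// original, so these rows are illustrative only:
//   Michael 29
//   Andy 30
//   Justin 19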