1、从 RDD[case class] 创建 DataFrame
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
object SparkSQL02 {
  def main(args: Array[String]): Unit = {
    // Local SparkSession named after this class; uses all available cores.
    val spark: SparkSession = SparkSession
      .builder()
      .appName(this.getClass.getName)
      .master("local[*]")
      .getOrCreate()

    // Read the raw comma-separated student file and parse each line into a Stu.
    val lines: RDD[String] =
      spark.sparkContext.textFile("C:\\Users\\LEMMONT\\Desktop\\testdata\\stu")
    val stuRDD: RDD[Stu] = lines.map { line =>
      val fields = line.split(",")
      Stu(fields(0).toInt, fields(1), fields(2).toInt, fields(3), fields(4).toDouble)
    }

    // val df: DataFrame = spark.createDataFrame(stuRDD)
    // toDF needs the session's implicit conversions in scope.
    import spark.implicits._
    val df: DataFrame = stuRDD.toDF

    df.printSchema()
    df.show()
    spark.stop()
  }
}
// Schema for one student record; fields mirror the comma-separated columns
// of the input file: id,name,age,city,score.
case class Stu(id: Int, name: String, age: Int, city: String, score: Double)
2、从 RDD[Tuple] 创建 DataFrame
// Parse each line into a plain 5-field tuple instead of a case class.
val lines: RDD[String] =
  spark.sparkContext.textFile("C:\\Users\\LEMMONT\\Desktop\\testdata\\stu")
val tupleRdd: RDD[(Int, String, Int, String, Double)] = lines.map { line =>
  val fields = line.split(",")
  (fields(0).toInt, fields(1), fields(2).toInt, fields(3), fields(4).toDouble)
}

// Tuples carry no field names, so column names are passed to toDF explicitly.
import spark.implicits._
val df: DataFrame = tupleRdd.toDF("id", "name", "age", "city", "score")
3、从 RDD[Row] 创建 DataFrame
// Wrap each parsed line in a generic Row; Rows are untyped, so the
// column names/types must be supplied separately as a schema.
val lines: RDD[String] =
  spark.sparkContext.textFile("C:\\Users\\LEMMONT\\Desktop\\testdata\\stu")
val rowRDD: RDD[Row] = lines.map { line =>
  val fields = line.split(",")
  Row(fields(0).toInt, fields(1), fields(2).toInt, fields(3), fields(4).toDouble)
}

/* Equivalent construction from a list of StructFields:
val schema: StructType = StructType(
  List(
    StructField("id", IntegerType),
    StructField("name", StringType),
    StructField("age", IntegerType),
    StructField("city", StringType),
    StructField("score", DoubleType)
  )
)*/
// Builder-style schema: one .add per column, types from DataTypes.
val schema: StructType =
  new StructType()
    .add("id", DataTypes.IntegerType)
    .add("name", DataTypes.StringType)
    .add("age", DataTypes.IntegerType)
    .add("city", DataTypes.StringType)
    .add("score", DataTypes.DoubleType)

val df: DataFrame = spark.createDataFrame(rowRDD, schema)
这种方式比较灵活,可以动态地定义列。