1: First approach: a case class plus toDF
A DataFrame = RDD + schema, so the key is connecting the RDD with a schema.
----------------------------------------------------------------------------------------------------------------------------
Sample data (data/user.txt):
1,张飞,21,北京,80.0
2,关羽,23,北京,82.0
3,赵云,20,上海,88.6
4,刘备,26,上海,83.0
5,曹操,30,深圳,90.0
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object CaseClassCreateDataFrame {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data\\user.txt")
    // each line looks like: 5,曹操,30,深圳,90.0
    val userRdd = lines.map(it => {
      val line = it.split(",")
      val id = line(0).toInt
      val name = line(1)
      val age = line(2).toInt
      val province = line(3)
      val score = line(4).toDouble
      // Wrap the fields in Person; without it there are no column names
      // (see the first result below). Person(...) can be called directly
      // because a case class has a companion object with apply; a plain
      // class would need new to create an instance.
      Person(id, name, age, province, score)
    })
    // toDF requires the implicit conversions to be imported; it associates
    // the RDD with a schema and infers the field types automatically. But
    // without a case class there are no column names (id, name, age, ...),
    // only _1 .. _5 as headers, so we pass the fields through a case class.
    // A case class is used because it comes with getters, so we don't have
    // to write them ourselves; with a JavaBean you would need both a no-arg
    // and a full-arg constructor. The getters are how Spark obtains the schema.
    import spark.implicits._
    val df = userRdd.toDF()
    df.printSchema()
    df.show()
  }
}

case class Person(id: Int, name: String, age: Int, province: String, score: Double)
Without Person the column headers are just _1 .. _5:

root
 |-- _1: integer (nullable = false)
 |-- _2: string (nullable = true)
 |-- _3: integer (nullable = false)
 |-- _4: string (nullable = true)
 |-- _5: double (nullable = false)

+---+----+---+----+----+
| _1| _2| _3| _4| _5|
+---+----+---+----+----+
| 1|张飞| 21|北京|80.0|
| 2|关羽| 23|北京|82.0|
| 3|赵云| 20|上海|88.6|
| 4|刘备| 26|上海|83.0|
| 5|曹操| 30|深圳|90.0|
+---+----+---+----+----+
With Person the columns get proper headers:

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = false)
 |-- province: string (nullable = true)
 |-- score: double (nullable = false)

+---+----+---+--------+-----+
| id|name|age|province|score|
+---+----+---+--------+-----+
| 1|张飞| 21| 北京| 80.0|
| 2|关羽| 23| 北京| 82.0|
| 3|赵云| 20| 上海| 88.6|
| 4|刘备| 26| 上海| 83.0|
| 5|曹操| 30| 深圳| 90.0|
+---+----+---+--------+-----+
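The comments in the code above mention the JavaBean route. As a minimal sketch of what bean-based schema inference needs (the class name UserBean and the RDD are illustrative assumptions, not from the original):

import scala.beans.BeanProperty

// Spark reads the schema from the bean's getters, so the class needs
// getters/setters (generated here by @BeanProperty) and a no-arg constructor.
class UserBean(@BeanProperty var id: Int,
               @BeanProperty var name: String) {
  def this() = this(0, null) // required no-arg constructor
}

// assuming userBeanRdd: RDD[UserBean]
// val df = spark.createDataFrame(userBeanRdd, classOf[UserBean])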
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------
2: Second approach: use Row plus a hand-built schema to create the DataFrame
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SparkSession}

object RowCreateDataFrameJoinRDD {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data\\user.txt")
    // each line looks like: 5,曹操,30,深圳,90.0
    val userRdd: RDD[Row] = lines.map(it => {
      val line = it.split(",")
      val f1 = line(0).toInt
      val f2 = line(1)
      val f3 = line(2).toInt
      val f4 = line(3)
      val f5 = line(4).toDouble
      Row(f1, f2, f3, f4, f5)
    })
    // Build the schema by hand: one StructField (name + type) per column
    val schema = StructType(
      List(
        StructField("id", IntegerType),
        StructField("name", StringType),
        StructField("age", IntegerType),
        StructField("province", StringType),
        StructField("score", DoubleType)
      )
    )
    // createDataFrame is what connects the RDD[Row] with the schema
    val df = spark.createDataFrame(userRdd, schema)
    df.printSchema()
    df.show()
    spark.stop()
  }
}
Result:
root
|-- id: integer (nullable = true)
|-- name: string (nullable = true)
|-- age: integer (nullable = true)
|-- province: string (nullable = true)
|-- score: double (nullable = true)
+---+----+---+--------+-----+
| id|name|age|province|score|
+---+----+---+--------+-----+
| 1|张飞| 21| 北京| 80.0|
| 2|关羽| 23| 北京| 82.0|
| 3|赵云| 20| 上海| 88.6|
| 4|刘备| 26| 上海| 83.0|
| 5|曹操| 30| 深圳| 90.0|
+---+----+---+--------+-----+
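Note a difference from the case-class result: every column here is nullable = true, because StructField defaults nullable to true when you build the schema by hand. A sketch of declaring it explicitly (same columns as above):

// StructField(name, dataType, nullable) -- the third argument defaults
// to true; set it per column to match the case-class schema exactly
val strictSchema = StructType(List(
  StructField("id", IntegerType, nullable = false),
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = false),
  StructField("province", StringType, nullable = true),
  StructField("score", DoubleType, nullable = false)
))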
--------------------------------------------------------------------------------------------------------------------------------------------------------------
3: Third approach: tuples plus toDF with explicit column names
package cn._51doit.spark.day10

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
 * Create a DataFrame by associating a schema with an RDD of tuples
 */
object CreateDataFrameDemo5 {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    // DataFrame = RDD + schema; a DataFrame can have one or more columns
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data/user.txt")
    // user.txt columns: id,name,age,province,score -- keep name, age, score
    val tupleRdd: RDD[(String, Int, Double)] = lines.map(line => {
      val fields = line.split(",")
      val name = fields(1)
      val age = fields(2).toInt
      val score = fields(4).toDouble
      (name, age, score)
    })
    import spark.implicits._
    // toDF on a tuple RDD takes the column names explicitly;
    // this call is what associates the RDD with its schema
    val df: DataFrame = tupleRdd.toDF("name", "age", "score")
    df.printSchema()
    df.orderBy($"score".asc).show()
    spark.stop()
  }
}
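A small variant of the same API: calling toDF() with no arguments falls back to _1/_2/_3 headers, and Dataset.toDF(colNames: String*) can rename the columns afterwards (sketch, reusing the tupleRdd above):

// equivalent: infer _1/_2/_3 first, then rename all columns in one call
val renamed: DataFrame = tupleRdd.toDF().toDF("name", "age", "score")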