1. Creating a DataFrame from a case class
Sample data (user.txt):

laozhao,18,9999.99
laoduan,30,99.99
xuance,28,99.99
yeqing,25,99.0
dezhi,24,99.9
libai,88,50.0
banzang,29,50.6
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object DataFrameCreateDemo01 {
  def main(args: Array[String]): Unit = {
    // Create a SparkSession (an enhanced wrapper around SparkContext)
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("src/main/scala/data/user.txt")
    val userRdd: RDD[User] = lines.map(e => {
      val split = e.split(",")
      val name = split(0)
      val age = split(1).toInt
      val fv = split(2).toDouble
      User(name, age, fv)
    })
    // Import the implicit conversions
    import spark.implicits._
    // Convert the RDD into a DataFrame (a special Dataset of Rows)
    val df1: DataFrame = userRdd.toDF()
    // The two API styles
    // ① SQL style
    // Register the DataFrame as a temporary view
    df1.createTempView("t_fv")
    // Write the SQL statement
    val df2: DataFrame = spark.sql("SELECT name, age, fv FROM t_fv ORDER BY fv DESC, age ASC")
    // df2.show()
    // ② DSL style (domain-specific language)
    val df3: Dataset[Row] = df1.select("name", "age", "fv").orderBy($"fv".desc, $"age".asc)
    df3.printSchema()
    df3.show()
  }
}

case class User(name: String, age: Int, fv: Double)
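
Because User is a case class, the same RDD can also be turned into a strongly typed Dataset[User] instead of an untyped DataFrame. The lines below are a minimal sketch, assuming the spark session, the import spark.implicits._, and the userRdd from the example above:

    // Minimal sketch: toDS() keeps the User type, so the result is a Dataset[User] rather than a Dataset[Row]
    val ds: Dataset[User] = userRdd.toDS()
    // The same ordering expressed on the typed Dataset
    ds.orderBy($"fv".desc, $"age".asc).show()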
2. Creating a DataFrame from a plain Scala class
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import scala.beans.BeanProperty

// Create a DataFrame by associating an RDD with a plain Scala class
object DataFrameCreateDemo02 {
  def main(args: Array[String]): Unit = {
    // Create a SparkSession
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    val lines: RDD[String] = sc.textFile("src/main/scala/data/user.txt")
    val userRdd: RDD[Person] = lines.map(e => {
      val split = e.split(",")
      new Person(split(0), split(1).toInt, split(2).toDouble)
    })
    // Convert the RDD into a DataFrame; the schema is inferred from the bean-style getters
    val df1: DataFrame = spark.createDataFrame(userRdd, classOf[Person])
    import spark.implicits._
    // The two API styles
    // ① SQL style
    // Register the DataFrame as a temporary view
    df1.createTempView("t_fv")
    // Write the SQL statement
    val df2: DataFrame = spark.sql("SELECT name, age, fv FROM t_fv ORDER BY fv DESC, age ASC")
    // df2.show()
    // ② DSL style (domain-specific language)
    val df3: Dataset[Row] = df1.select("name", "age", "fv").orderBy($"fv".desc, $"age".asc)
    df3.printSchema()
    df3.show()
  }
}

class Person(
  @BeanProperty val name: String,
  @BeanProperty val age: Int,
  @BeanProperty val fv: Double
)
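
Here createDataFrame(userRdd, classOf[Person]) relies on the getters generated by @BeanProperty to discover the columns. If the records are already available in a local collection, a DataFrame can also be built without defining a class at all. This is a minimal sketch, assuming the same spark session and import spark.implicits._ as above:

    // Minimal sketch: build a DataFrame from a local Seq of tuples and name the columns explicitly
    val df: DataFrame = Seq(
      ("laozhao", 18, 9999.99),
      ("laoduan", 30, 99.99)
    ).toDF("name", "age", "fv")
    df.show()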
3. Creating a DataFrame with StructType
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

// Convert the RDD's records into Rows and attach a schema
object DataFrameCreateDemo03 {
  def main(args: Array[String]): Unit = {
    // Create a SparkSession
    val spark: SparkSession = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    val lines: RDD[String] = sc.textFile("src/main/scala/data/user.txt")
    // A Row by itself carries neither field names nor field types
    val rdd1: RDD[Row] = lines.map(e => {
      val split = e.split(",")
      Row(split(0), split(1).toInt, split(2).toDouble)
    })
    // Define the schema (field name, field type, nullability)
    val schema: StructType = StructType(
      Array(
        StructField("name", StringType),
        StructField("age", IntegerType),
        StructField("fv", DoubleType)
      )
    )
    // Associate the Row RDD with the StructType schema
    val df1: DataFrame = spark.createDataFrame(rdd1, schema)
    import spark.implicits._
    // The two API styles
    // ① SQL style
    // Register the DataFrame as a temporary view
    df1.createTempView("t_fv")
    // Write the SQL statement
    val df2: DataFrame = spark.sql("SELECT name, age, fv FROM t_fv ORDER BY fv DESC, age ASC")
    // df2.show()
    // ② DSL style (domain-specific language)
    val df3: Dataset[Row] = df1.select("name", "age", "fv").orderBy($"fv".desc, $"age".asc)
    df3.printSchema()
    df3.show()
  }
}
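
For a delimited file like user.txt, the manual Row mapping can also be skipped by letting the CSV reader apply the schema directly. The lines below are a minimal sketch, assuming the same spark session and file path as above, and a Spark version (2.3+) where schema() also accepts a DDL-style string; the column names and types mirror the StructType defined earlier:

    // Minimal sketch: let the CSV reader apply the schema instead of building Rows by hand
    val df: DataFrame = spark.read
      .schema("name STRING, age INT, fv DOUBLE")
      .csv("src/main/scala/data/user.txt")
    df.show()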