创建 DataFrame/DataSet 的三种方法
下面给出三种方法的完整示例代码。
**第1种:通过 StructType 指定 Schema**
package spark_sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
object Spark_Sql_Schema {
  def main(args: Array[String]): Unit = {
    // Build the SparkSession first; the SparkContext is obtained from it.
    val session: SparkSession = SparkSession.builder().master("local[*]").appName("zhiDingSchema").getOrCreate()
    val context: SparkContext = session.sparkContext
    context.setLogLevel("WARN")

    // Load the raw text file and split every line on single spaces.
    val rawLines: RDD[String] = context.textFile("D:\\大数据\\学期文档\\spark\\资料\\tt.txt")
    val tokens: RDD[Array[String]] = rawLines.map(_.split(" "))

    // createDataFrame with an explicit schema requires an RDD[Row].
    val rows: RDD[Row] = tokens.map { fields =>
      Row(fields(0).toInt, fields(1), fields(2).toInt)
    }

    // Describe the table layout; every column is marked nullable.
    val tableSchema: StructType = StructType(
      List(
        StructField("id", IntegerType, nullable = true),
        StructField("name", StringType, nullable = true),
        StructField("age", IntegerType, nullable = true)
      )
    )

    // Assemble the DataFrame from the rows plus the schema.
    val df: DataFrame = session.createDataFrame(rows, tableSchema)

    // Inspect the data and the resulting structure.
    df.show()
    df.printSchema()

    // To run SQL queries, first register the DataFrame as a temp view:
    // df.createOrReplaceTempView("person")

    // Shut down the SparkContext and the SparkSession.
    context.stop()
    session.stop()
  }
}
**第2种:指定列名添加 Schema(toDF)**
package spark_sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
object SparkSqlZhiDinSchema {
  def main(args: Array[String]): Unit = {
    // Build the SparkSession first; the SparkContext is obtained from it.
    val session: SparkSession = SparkSession.builder().master("local[*]").appName("zhiDingSchema").getOrCreate()
    val context: SparkContext = session.sparkContext
    context.setLogLevel("WARN")

    // Load the raw text file, split each line, and shape it into tuples.
    val rawLines: RDD[String] = context.textFile("D:\\大数据\\学期文档\\spark\\资料\\tt.txt")
    val tokens: RDD[Array[String]] = rawLines.map(_.split(" "))
    val tuples: RDD[(Int, String, Int)] = tokens.map(fields => (fields(0).toInt, fields(1), fields(2).toInt))

    // The implicit conversions provide .toDF on tuple RDDs.
    import session.implicits._
    val df: DataFrame = tuples.toDF("id", "name", "age")

    // Inspect the data and the resulting structure.
    df.show()
    df.printSchema()

    // To run SQL queries:
    // df.createOrReplaceTempView("tt")
    // session.sql("select * from tt").show()

    // Note: with this tuple-based form, the DSL $"col" syntax is not available.

    // Shut down the SparkContext and the SparkSession.
    context.stop()
    session.stop()
  }
}
**第3种:编写样例类,利用反射机制推断 Schema**
package spark_sql
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object SparkSqlFanShe {
  // The case class drives schema inference via reflection.
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    // Build the SparkSession first; the SparkContext is obtained from it.
    val session: SparkSession = SparkSession.builder().master("local[*]").appName("zhiDingSchema").getOrCreate()
    val context: SparkContext = session.sparkContext
    context.setLogLevel("WARN")

    // Load the raw text file and split every line on single spaces.
    val rawLines: RDD[String] = context.textFile("D:\\大数据\\学期文档\\spark\\资料\\tt.txt")
    val tokens: RDD[Array[String]] = rawLines.map(_.split(" "))

    // Map every token array into a Person instance.
    val people: RDD[Person] = tokens.map(fields => Person(fields(0).toInt, fields(1), fields(2).toInt))

    // The implicit conversions provide .toDF / .toDS on RDD[Person].
    import session.implicits._
    val df: DataFrame = people.toDF()

    // Inspect the data and the inferred structure.
    df.show()
    df.printSchema()

    // Register the DataFrame as a temp view for SQL queries.
    df.createOrReplaceTempView("person")
    // SQL style:
    // session.sql("select * from person").show()

    // DSL style is also available (the $ syntax works thanks to the implicits).
    df.select("id", "name").filter($"id" > 3).show()

    /**
     * Notes: conversions among RDD / DataFrame / Dataset.
     */
    // RDD -> DF / DS (requires the implicits import above).
    people.toDF()
    val peopleDS: Dataset[Person] = people.toDS()
    // DF -> RDD / DS.
    df.rdd
    val typedDS: Dataset[Person] = df.as[Person]
    // DS -> RDD / DF.
    peopleDS.rdd
    peopleDS.toDF()

    /**
     * Summary:
     * 1. to RDD:        call .rdd
     * 2. to DataFrame:  call toDF()
     * 3. to Dataset:
     *    3.1 RDD -> DS via toDS()
     *    3.2 DF  -> DS via as[Person]
     */

    // Shut down the SparkContext and the SparkSession.
    context.stop()
    session.stop()
  }
}