- This post only covers how to create a DS (Dataset) and a DF (DataFrame); for the functions available on them, see: click here
How to create a Dataset
- Method 1: from a Seq (when there are multiple columns, use a case class to convert the elements to a concrete type)
import org.apache.spark.sql.SparkSession

object DatasetDemo {
  //Define the case class (outside main, so an encoder can be derived for it)
  case class student(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("ds").getOrCreate()
    //Import the implicit conversions (the code will not compile without them)
    import spark.implicits._
    //Pass a Seq to createDataset to build the Dataset
    val ds = spark.createDataset(Seq(student("zs", 1), student("lz", 2)))
    //Display the data
    ds.show()
  }
}
- Method 2: from a List (same as above; just replace the Seq with a List, as sketched below)
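A minimal sketch of the List variant (same SparkSession, case class, and implicits as Method 1; only the collection type changes):
//Pass a List instead of a Seq; everything else is identical
val ds = spark.createDataset(List(student("zs", 1), student("lz", 2)))
ds.show()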
- Method 3: RDD to Dataset
import org.apache.spark.sql.SparkSession

object DatasetDemo {
  //Define the case class
  case class student(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("ds").getOrCreate()
    //Import the implicit conversions (the code will not compile without them)
    import spark.implicits._
    //Create an RDD, map each tuple to a student, then convert it with toDS
    val ds = spark.sparkContext.parallelize(Array(("zs", 1), ("ls", 2))).map(x => student(x._1, x._2)).toDS()
    //Display the data
    ds.show()
  }
}
- Method 4: load a CSV file
import org.apache.spark.sql.SparkSession

object DatasetDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("ds").getOrCreate()
    //Load the CSV file with the read API; this returns a DataFrame (Dataset[Row])
    val ds = spark.read.csv("in/users.csv")
    ds.show()
  }
}
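Note that with no options, spark.read.csv names the columns _c0, _c1, ... and reads every field as a string. A hedged sketch of the commonly used options (header and inferSchema are standard Spark CSV options; that users.csv has a header row is an assumption):
//Treat the first line as column names and let Spark infer the column types
val df = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("in/users.csv")
df.printSchema()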
Working in the spark-shell
//createDataset() accepts a Seq, an Array, or an RDD as its argument
//Example 1
scala> spark.createDataset(1 to 3)
//The return type is Dataset[Int]
res24: org.apache.spark.sql.Dataset[Int] = [value: int]
/*
+-----+
|value|
+-----+
|    1|
|    2|
|    3|
+-----+
*/
//Example 2
scala> spark.createDataset(List(("a",1),("b",2),("c",3)))
//The return type is Dataset[(String, Int)]
res4: org.apache.spark.sql.Dataset[(String, Int)] = [_1: string, _2: int]
/*
+---+---+
| _1| _2|
+---+---+
|  a|  1|
|  b|  2|
|  c|  3|
+---+---+
*/
//Example 3
//Create an RDD first
scala> sc.parallelize(List(("a",1,1),("b",2,2),("c",3,3)))
res7: org.apache.spark.rdd.RDD[(String, Int, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:25
//Pass the RDD to createDataset
scala> spark.createDataset(res7)
//The return type is Dataset[(String, Int, Int)]
res8: org.apache.spark.sql.Dataset[(String, Int, Int)] = [_1: string, _2: int ... 1 more field]
/*
+---+---+---+
| _1| _2| _3|
+---+---+---+
|  a|  1|  1|
|  b|  2|  2|
|  c|  3|  3|
+---+---+---+
*/
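The Array case mentioned above looks the same as the Seq case, since an Array is implicitly widened to a Seq. A hedged sketch (the res number will differ in your session):
scala> spark.createDataset(Array("a","b","c"))
res9: org.apache.spark.sql.Dataset[String] = [value: string]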
- Seq to Dataset
//Define the case class
scala> case class student(name:String,age:Int)
defined class student
//Create the Seq
scala> Seq(student("zs",20),student("ls",22))
res0: Seq[student] = List(student(zs,20), student(ls,22))
//Convert the Seq to a Dataset with toDS
scala> res0.toDS
res1: org.apache.spark.sql.Dataset[student] = [name: string, age: int]
//Display the resulting Dataset
scala> res1.show
+----+---+
|name|age|
+----+---+
|  zs| 20|
|  ls| 22|
+----+---+
- RDD to Dataset
//Define the case class
scala> case class student(name:String,age:Int)
defined class student
//Create the RDD
scala> sc.parallelize(List(("zs",20),("ls",22)))
res3: org.apache.spark.rdd.RDD[(String, Int)] = ParallelCollectionRDD[0] at parallelize at <console>:25
//Map each tuple to a student, then convert to a Dataset with toDS
scala> res3.map(x=>student(x._1,x._2)).toDS
res5: org.apache.spark.sql.Dataset[student] = [name: string, age: int]
//Display the resulting Dataset
scala> res5.show
+----+---+
|name|age|
+----+---+
|  zs| 20|
|  ls| 22|
+----+---+
Using DataFrames
- A DataFrame is a Dataset whose element type is Row, i.e. Dataset[Row]
- It resembles a two-dimensional table in a traditional database
- It adds a schema (structural information about the data) on top of the RDD
- The DataFrame schema supports nested data types
- It provides a richer set of SQL-like operations in its API
RDD vs. DataFrame
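To make the contrast concrete, here is a hedged sketch of the same query written against an RDD and against a DataFrame (reusing the SparkSession from the earlier examples):
//RDD: the structure lives in your head, so you address fields by tuple position
val rdd = spark.sparkContext.parallelize(List(("zs", 20), ("ls", 22)))
rdd.filter(_._2 > 21).map(_._1).collect()
//DataFrame: the structure lives in the schema, so you address fields by column name
import spark.implicits._
val df = rdd.toDF("name", "age")
df.filter($"age" > 21).select("name").show()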
Creating a DataFrame
- JSON file -> DataFrame
/* Convert a JSON file into a DataFrame.
 * people.json contains:
 * {"name":"Michael"}
 * {"name":"Andy", "age":30}
 * {"name":"Justin", "age":19}
 */
import org.apache.spark.sql.DataFrame
val df: DataFrame = spark.read.json("file:///home/hadoop/data/people.json")
//Display the DataFrame's contents with show
df.show
/*
+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+
*/
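Because a DataFrame carries a schema, it can also be queried with plain SQL. A minimal sketch (the view name people is an arbitrary choice):
//Register the DataFrame as a temporary view, then query it through spark.sql
df.createOrReplaceTempView("people")
spark.sql("SELECT name, age FROM people WHERE age > 20").show()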
- RDD -> DataFrame
import org.apache.spark.sql.SparkSession

object Dataset2 {
  //Define the case class
  case class student(name: String, age: Int)

  def main(args: Array[String]): Unit = {
    //Get a SparkSession instance
    val spark = SparkSession.builder().master("local").appName("ds").getOrCreate()
    //Import the implicit conversions
    import spark.implicits._
    //Split each string with map, build a student, then call toDF to get the DataFrame
    val df = spark.sparkContext.parallelize(List(("zs,20"), ("ls,24"))).map(_.split(",")).map(x => student(x(0), x(1).toInt)).toDF()
    df.show()
    /*
    +----+---+
    |name|age|
    +----+---+
    |  zs| 20|
    |  ls| 24|
    +----+---+
    */
  }
}
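When you only need column names and not a typed case class, toDF also accepts the names directly. A hedged sketch of this shortcut:
//An RDD of tuples plus explicit column names; no case class required
val df2 = spark.sparkContext.parallelize(List(("zs", 20), ("ls", 24))).toDF("name", "age")
df2.show()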
- (RDD[Row], Schema) -> DataFrame
Why pass a schema: the data in an RDD carries no structure of its own, so you can supply a schema when creating the DataFrame to describe the structure of the RDD's rows.
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object Dataset3 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("ds").getOrCreate()
    //Column names for the DataFrame we are about to build
    val schemaString = "name age"
    //Split schemaString and build a StructField (column name, type, nullable) per column;
    //the result is a StructType holding one StructField for each column
    val schema = StructType(schemaString.split(" ").map(x => StructField(x, StringType, true)))
    //Create the RDD and map every record into a Row, giving an RDD[Row]
    val rdd: RDD[Row] = spark.sparkContext.parallelize(List(("zs,20"), ("ls,24"))).map(_.split(",")).map(x => Row(x(0), x(1)))
    val frame = spark.createDataFrame(rdd, schema)
    frame.show()
    /*
    +----+---+
    |name|age|
    +----+---+
    |  zs| 20|
    |  ls| 24|
    +----+---+
    */
  }
}
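Both columns above are declared StringType, so age ends up stored as a string. A hedged sketch of a mixed-type schema (IntegerType comes from the same org.apache.spark.sql.types package):
import org.apache.spark.sql.types.IntegerType
//Declare age as an integer; the Row values must then match the declared types
val typedSchema = StructType(Seq(
  StructField("name", StringType, true),
  StructField("age", IntegerType, true)
))
val typedRdd = spark.sparkContext.parallelize(List(("zs,20"), ("ls,24"))).map(_.split(",")).map(x => Row(x(0), x(1).toInt))
spark.createDataFrame(typedRdd, typedSchema).printSchema()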