Dataset的常用函数
/**
 * Walk-through of commonly used Dataset operations: `select`, `filter`/`where`,
 * `withColumn` and `drop`.
 *
 * Every step prints its result; the block comment that follows a step shows the
 * output that step is expected to produce.
 */
object DataSet {
  // Imported locally so the example is self-contained; harmless if the file
  // already imports SparkSession elsewhere.
  import org.apache.spark.sql.SparkSession

  /**
   * Row type used by the examples.
   *
   * `age` is a boxed `java.lang.Integer`, which is why the schema printed below
   * reports the column as nullable.
   */
  case class student(name: String, age: Integer, grade: String)

  /**
   * Builds a local SparkSession, runs each Dataset example in turn and prints
   * the results to stdout.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").appName("ds").getOrCreate()

    // Always release the session, even if one of the examples throws.
    try {
      // Brings the encoders (needed by toDS) and the $"col" syntax into scope;
      // without this import the code below does not compile.
      import spark.implicits._

      // Build a Dataset from an in-memory collection.
      val ds1 = spark.sparkContext
        .parallelize(Seq(("zs", 20, "13"), ("ls", 23, "15")))
        .map { case (name, age, grade) => student(name, age, grade) }
        .toDS()

      // Show the table structure.
      ds1.printSchema()
      /*root
       |-- name: string (nullable = true)
       |-- age: integer (nullable = true)
       |-- grade: string (nullable = true)*/

      // ---- Column selection ----
      println("____________________select_________________________")
      // Show only the name column.
      ds1.select("name").show()
      /*
      +----+
      |name|
      +----+
      |  zs|
      |  ls|
      +----+*/
      // Show only the first row of the name column.
      ds1.select("name").show(1)
      /*
      +----+
      |name|
      +----+
      |  zs|
      +----+*/

      // ---- Filtering ----
      println("____________________filter/where_________________________")
      // Keep the rows whose age is greater than 20 (SQL expression string).
      ds1.filter("age>20").show()
      /*
      +----+---+-----+
      |name|age|grade|
      +----+---+-----+
      |  ls| 23|   15|
      +----+---+-----+*/

      // Keep the rows whose age is at most 20 and whose grade is not null.
      // filter accepts Column expressions built with the $ syntax.
      // isNotNull is required here: `$"grade" != null` is Scala's own reference
      // comparison, which evaluates to the Boolean `true` on the driver, so the
      // null check would never become part of the Spark predicate.
      ds1.filter($"age" <= 20 && $"grade".isNotNull).show()
      /*
      +----+---+-----+
      |name|age|grade|
      +----+---+-----+
      |  zs| 20|   13|
      +----+---+-----+
      */

      // Keep the rows whose age is greater than 20 and whose grade equals "15".
      // Here each row is passed to the function as a `student` object.
      // `==` is used instead of `.equals` so a null grade cannot throw.
      ds1.filter(row => row.age > 20 && row.grade == "15").show()
      /*
      +----+---+-----+
      |name|age|grade|
      +----+---+-----+
      |  ls| 23|   15|
      +----+---+-----+
      */

      // where is equivalent to filter for SQL strings and Column expressions.
      ds1.where("age>20").show()
      ds1.where($"age" <= 20 && $"grade".isNotNull).show()
      // The next line is wrong: unlike filter, where cannot take a function
      // over the typed row object.
      //ds1.where(x=>x.age>20 && x.grade.equals("15")).show()

      // ---- Adding, replacing and dropping columns ----
      println("____________________withColumn_________________________")
      // Add a column called name1 holding the same data as name.
      val dswc2 = ds1.withColumn("name1", ds1.col("name"))
      dswc2.printSchema()
      /* root
       |-- name: string (nullable = true)
       |-- age: integer (nullable = true)
       |-- grade: string (nullable = true)
       |-- name1: string (nullable = true)*/

      // Rename name to Name. As the schema below shows, withColumn replaces the
      // existing column instead of adding one.
      // NOTE(review): this presumably relies on Spark's default case-insensitive
      // column resolution; withColumnRenamed("name", "Name") states the intent
      // explicitly - confirm before reusing this pattern.
      val dswc3 = ds1.withColumn("Name", ds1.col("name"))
      dswc3.printSchema()
      /*root
       |-- Name: string (nullable = true)
       |-- age: integer (nullable = true)
       |-- grade: string (nullable = true)*/

      // Cast grade to int under the name Grade. As the schema below shows, the
      // result replaces the existing grade column rather than adding a new one.
      val dswc4 = ds1.withColumn("Grade", ds1.col("grade").cast("int"))
      dswc4.printSchema()
      /*root
       |-- name: string (nullable = true)
       |-- age: integer (nullable = true)
       |-- Grade: integer (nullable = true)*/

      // Change the data type of the grade column to int, keeping its name.
      val dswc5 = ds1.withColumn("grade", ds1.col("grade").cast("int"))
      dswc5.printSchema()
      /*root
       |-- name: string (nullable = true)
       |-- age: integer (nullable = true)
       |-- grade: integer (nullable = true)*/

      // Add 5 to every value in the age column.
      val dswc6 = ds1.withColumn("age", ds1.col("age") + 5)
      dswc6.printSchema()
      /*
      root
       |-- name: string (nullable = true)
       |-- age: integer (nullable = true)
       |-- grade: string (nullable = true)
      */
      dswc6.show()
      /*
      +----+---+-----+
      |name|age|grade|
      +----+---+-----+
      |  zs| 25|   13|
      |  ls| 28|   15|
      +----+---+-----+*/

      // Drop the age column.
      val dswc7 = ds1.drop("age")
      dswc7.printSchema()
      /*root
       |-- name: string (nullable = true)
       |-- grade: string (nullable = true)*/
    } finally {
      spark.stop()
    }
  }
}