Spark Source Code Reading Notes, Dataset (Part 2): Actions, basic functions, and transformations on Dataset

package Dataset


import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
/**
  * Created by legotime
  */
object dataSetOperation {
  case class Person(name: String, age: Long)
  val sparkSession = SparkSession.builder().appName("data set example")
    .master("local").getOrCreate()
  import sparkSession.implicits._
  val rdd = sparkSession.sparkContext.textFile("hdfs://master:9000/src/main/resources/people.txt")
  val dataSet = rdd.map(_.split(",")).map(p =>Person(p(0),p(1).trim.toLong)).toDS()
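  //Hedged sketch, not from the original post: if the HDFS path above is unreachable, the same
  //sample Dataset can be built from an in-memory Seq. `localDataSet` is an illustrative name
  //and is not used by the methods below, which all read the class-level `dataSet`.
  val localDataSet: Dataset[Person] = sparkSession.createDataset(Seq(
    Person("Michael", 29), Person("Andy", 30), Person("Justin", 19)))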

  //---------------------------------------------------------------Actions--------------------------------------
  def dataSet_collect() = {
    //Returns an array that contains all rows in this Dataset.
    dataSet.collect().foreach(println)
    /**
      * Person(Michael,29)
      * Person(Andy,30)
      * Person(Justin,19)
      */
  }
  def dataSet_collectAsList()={
    //Returns a Java list that contains all rows in this Dataset.
    println(dataSet.collectAsList)
    /**
      * [Person(Michael,29), Person(Andy,30), Person(Justin,19)]
      */
  }
  def dataSet_count() = {
    //Returns the number of rows in the Dataset.
    println(dataSet.count())

    /**
      * 3
      */
  }
  def dataSet_describe()={
    //Computes statistics for numeric columns, including count, mean, stddev, min, and max
    dataSet.describe("name","age").show
    /** This function computes basic statistics for the Dataset's columns:
      * +-------+-------+-----------------+
      * |summary|   name|              age|
      * +-------+-------+-----------------+
      * |  count|      3|                3|
      * |   mean|   null|             26.0|
      * | stddev|   null|6.082762530298219|
      * |    min|   Andy|               19|
      * |    max|Michael|               30|
      * +-------+-------+-----------------+
      */
  }
  def dataSet_first()={
    //Returns the first row.
    println(dataSet.first())

    /**
      * Person(Michael,29)
      */
  }
  def dataSet_foreachPartition()={
    //Applies a function f to each partition of this Dataset.
    dataSet.foreachPartition{ part =>
      println(part.toList)
    }

    /**
      * List(Person(Michael,29), Person(Andy,30), Person(Justin,19))
      */
  }

  def dataSet_head()={
    //Returns the first n rows.
    dataSet.head(2).foreach(println)
    /**
      * Person(Michael,29)
      * Person(Andy,30)
      */
  }
  def dataSet_reduce()={
    //(Scala-specific) Reduces the elements of this Dataset using the specified binary function.
    val data: Dataset[String] = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    println(data.reduce(_+_))

    /**
      * Michael, 29Andy, 30Justin, 19
      */

  }
  def dataSet_show() ={
    println("----------默认全部打印---------------")
    dataSet.show()
    println("----------only showing top 2 rows---------------")
    dataSet.show(2)
    println("---------Displays the top 20 rows of Dataset in a tabular form.------------")
    dataSet.show(true)
  }
  def dataSet_toLocalIterator()={
    //Returns an iterator that contains all rows in this Dataset.
    val tmp = dataSet.toLocalIterator()
    while (tmp.hasNext){
      println(tmp.next())
    }

    /**
      * Person(Michael,29)
      * Person(Andy,30)
      * Person(Justin,19)
      */
  }
  //---------------------------------------------------------------Basic Dataset functions---------------------
  def dataSet_as()={
    //Returns a new Dataset where each record has been mapped on to the specified type
    val data: DataFrame = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt")
    val typed: Dataset[String] = data.as[String]
    typed.show()

    /**
      * +-----------+
      * |      value|
      * +-----------+
      * |Michael, 29|
      * |   Andy, 30|
      * | Justin, 19|
      * +-----------+
      */
  }
  def dataSet_cache()={
    dataSet.cache()

    /**
      * Caches the Dataset (for Datasets the default storage level is MEMORY_AND_DISK).
      */
  }
  def dataSet_columns() ={
    //Returns all column names as an array.
    dataSet.columns.foreach(println)
    /**
      * name
      * age
      */
  }
  def dataSet_createOrReplaceTempView()={
    //Creates a temporary view using the given name.
    dataSet.createOrReplaceTempView("myPerson")
    val dataFrame = sparkSession.sql ("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.show()

    /**
      * +------+---+
        |  name|age|
        +------+---+
        |Justin| 19|
        +------+---+
      */
  }
  def dataSet_createTempView() = {
    //Creates a temporary view using the given name; unlike createOrReplaceTempView, it fails if the view already exists.
    dataSet.createTempView("myPerson")
    val dataFrame = sparkSession.sql ("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.map(teenager => "Name: " + teenager(0)).show()

    /**
      * +------------+
        |       value|
        +------------+
        |Name: Justin|
        +------------+
      */
  }
  def dataSet_dtypes()={
    //Returns all column names and their data types as an array.
    dataSet.dtypes.foreach(println)
    /**
      * (name,StringType)
        (age,LongType)
      */
  }
  def dataSet_explain()={
    //Prints the physical plan to the console for debugging purposes.
    //dataSet.explain()
    /**
      * == Physical Plan ==
        Scan ExistingRDD[name#2,age#3L]
      */
    //Prints the plans (logical and physical) to the console for debugging purposes.
    dataSet.explain(true)

    /**
      * == Parsed Logical Plan ==
        LogicalRDD [name#2, age#3L]

        == Analyzed Logical Plan ==
        name: string, age: bigint
        LogicalRDD [name#2, age#3L]

        == Optimized Logical Plan ==
        LogicalRDD [name#2, age#3L]

        == Physical Plan ==
        Scan ExistingRDD[name#2,age#3L]
      */
  }
  def dataSet_inputFiles()={
    //Returns a best-effort snapshot of the files that compose this Dataset.
    println(dataSet.inputFiles.toList)
    //List() -- empty here because the Dataset was built from an RDD, not from a file-based source
  }
  def dataSet_isLocal() ={
    //Returns true if the collect and take methods can be run locally (without any Spark executors).
    dataSet.isLocal
    //false
  }
  def dataSet_isStreaming() ={
    //Returns true if this Dataset contains one or more sources that continuously return data as it arrives.
    dataSet.isStreaming
    //false
  }

  def dataSet_javaRDD()={
    //Returns the content of the Dataset as a JavaRDD of Ts.
    println(dataSet.toJavaRDD)
    //MapPartitionsRDD[7] at toJavaRDD at dataSetOperation.scala:222
  }
  def dataSet_persist()={
    //Persist this Dataset with the given storage level.
    dataSet.persist()
    /**
      * The default storage level is MEMORY_AND_DISK.
      */
  }
  def dataSet_printSchema()={
    //Prints the schema to the console in a nice tree format.
    dataSet.printSchema()

    /**
      * root
       |-- name: string (nullable = true)
       |-- age: long (nullable = false)
      */
  }
  def dataSet_rdd()={
    //Represents the content of the Dataset as an RDD of T.
    println(dataSet.rdd)

    /**Returns the content as an RDD:
      *MapPartitionsRDD[7] at rdd at dataSetOperation.scala:243
      */
  }
  def dataSet_schema()={
    //Returns the schema of this Dataset.
    println(dataSet.schema)
    /**
      * StructType(StructField(name,StringType,true), StructField(age,LongType,false))
      */
  }
  def dataSet_toDF()={
    //Converts this strongly typed collection of data to a generic DataFrame.
    dataSet.toDF().show()
    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
    //Converts this strongly typed collection of data to a generic DataFrame with columns renamed.
    dataSet.toDF("man","ID").show()//the number of new names must match the original column count

    /**
      * +-------+---+
        |    man| ID|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */

  }
  def dataSet_toJavaRDD()={
    //Returns the content of the Dataset as a JavaRDD of Ts.
    println(dataSet.toJavaRDD)
    //MapPartitionsRDD[7] at toJavaRDD at dataSetOperation.scala:285
  }
  def dataSet_unpersist()={
    //Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk.
    dataSet.unpersist(true)
    //dataSet.unpersist()
  }
  def dataSet_write()={
    //Interface for saving the content of the non-streaming Dataset out into external storage.
    dataSet.write

    /**
      * Experimental.
      */
  }
  def dataSet_writeStream()={
    //Interface for saving the content of the streaming Dataset out into external storage.
    dataSet.writeStream

    /**
      * Experimental.
      */
  }
  def dataSet_registerTempTable()={
    //Registers this Dataset as a temporary table using the given name.
    // The lifetime of this temporary table is tied to the SparkSession that was used to create this Dataset.
    dataSet.registerTempTable("myPerson")
    val dataFrame = sparkSession.sql ("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.map(teenager => "Name: " + teenager(0)).show()

    /**Deprecated: superseded by createTempView/createOrReplaceTempView and will be removed in later versions.
      * +------------+
        |       value|
        +------------+
        |Name: Justin|
        +------------+
      */
  }
  //---------------------------------------------------------------Typed transformations----------------------
  def dataSet_AS()={
    //Returns a new Dataset with an alias set.
    val tmpDS: Dataset[Person] = dataSet.as("oldDataSet")
  }
  def dataSet_alias()={
    /**Internally this simply delegates to as:
      * def alias(alias: Symbol): Dataset[T] = as(alias)
      * def alias(alias: String): Dataset[T] = as(alias)
      *
      * [name: string, age: bigint]
      */
  }
  def dataSet_coalesce()={
    //Internally: Repartition(numPartitions, shuffle = false, logicalPlan)
    //Resets the number of partitions of the Dataset, like RDD.coalesce.
    //Because it does not shuffle, coalesce can only reduce the partition count; with this tiny
    //single-partition input, asking for 2 partitions is a no-op (see the sketch after this method).
    def myfunc(index: Int, iter: Iterator[(Person)]) : Iterator[String] = {
      iter.toList.map(x => "[partID:" +  index + ", val: " + x + "]").iterator
    }
    dataSet.coalesce(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)

    /**
      * [partID:0, val: Person(Michael,29)]
        [partID:0, val: Person(Andy,30)]
        [partID:0, val: Person(Justin,19)]
      */
    println(dataSet.coalesce(2).toJavaRDD.rdd.partitions.length)

    /**
      * 1
      */
  }
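  //Hedged sketch, not from the original post: coalesce(shuffle = false) can only reduce the
  //partition count, while repartition shuffles and can increase it. The printed counts assume
  //the same small single-file input as above.
  def dataSet_coalesce_vs_repartition() = {
    println(dataSet.rdd.getNumPartitions)                // e.g. 1 for a small single file
    println(dataSet.coalesce(2).rdd.getNumPartitions)    // still 1: coalesce cannot grow partitions
    println(dataSet.repartition(2).rdd.getNumPartitions) // 2: repartition performs a shuffle
  }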
  def dataSet_distinct()={
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100)))
    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
    unionedDS.show()

    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        | Michael| 29|
        |    Andy| 30|
        |  Justin| 19|
        |legotime|100|
        |legotime|100|
        +--------+---+
      */
    unionedDS.distinct().show()

    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        |    Andy| 30|
        |legotime|100|
        | Michael| 29|
        |  Justin| 19|
        +--------+---+
      */
    // distinct performs a shuffle internally to deduplicate across partitions
  }
  def dataSet_dropDuplicates()={
    //def dropDuplicates(): Dataset[T]
    //def distinct(): Dataset[T]
    //def dropDuplicates(colNames: Array[String]): Dataset[T]

    //Returns a new Dataset with duplicate rows removed, considering only the subset of columns.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
    unionedDS.show()

    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        | Michael| 29|
        |    Andy| 30|
        |  Justin| 19|
        |legotime|100|
        |    lego| 19|
        |legotime|100|
        |    lego| 19|
        +--------+---+
      */
    unionedDS.dropDuplicates().show()

    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        |    Andy| 30|
        |    lego| 19|
        |legotime|100|
        | Michael| 29|
        |  Justin| 19|
        +--------+---+
      */

    unionedDS.dropDuplicates("name").show()
    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        | Michael| 29|
        |    Andy| 30|
        |    lego| 19|
        |legotime|100|
        |  Justin| 19|
        +--------+---+
      */

    unionedDS.dropDuplicates("age").show()

    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        | Michael| 29|
        |  Justin| 19|
        |legotime|100|
        |    Andy| 30|
        +--------+---+
      */
    unionedDS.dropDuplicates(Array("name","age")).show()

    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        |    Andy| 30|
        |    lego| 19|
        |legotime|100|
        | Michael| 29|
        |  Justin| 19|
        +--------+---+
      */
  }
  def dataSet_except()={
    //Returns a new Dataset containing rows in this Dataset but not in another Dataset. This is equivalent to EXCEPT in SQL.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy",30),Person("lego",19)))
    dataSet.except(tmpDataSet).show()

    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        | Justin| 19|
        +-------+---+
      */
  }
  def dataSet_filter()={
    //def filter(func: (T) ⇒ Boolean): Dataset[T]
    //def filter(conditionExpr: String): Dataset[T]
    //def filter(condition: Column): Dataset[T]

    dataSet.filter($"age" > 20).show()
    dataSet.filter("age > 20").show()

    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        +-------+---+
      */
  }
  def dataSet_flatMap()={
    //def flatMap[U](func: (T) ⇒ TraversableOnce[U])(implicit arg0: Encoder[U]): Dataset[U]
    /**The version below fails with a serialization/encoder error; revisit in later Spark versions:
      * dataSet.flatMap{ P =>
          P.toString
        }.show()
      */

    val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    val words = tmpDataSet.flatMap(line => line.split(","))
    words.show()

    /**
      * +-------+
        |  value|
        +-------+
        |Michael|
        |     29|
        |   Andy|
        |     30|
        | Justin|
        |     19|
        +-------+
      */
    //Whenever you want to work on the individual values inside the rows, flatMap is the way in; after flatMap many follow-up operations become straightforward, for example:
    words.map((word) =>(word,1)).show()

    /**
      * +-------+---+
        |     _1| _2|
        +-------+---+
        |Michael|  1|
        |     29|  1|
        |   Andy|  1|
        |     30|  1|
        | Justin|  1|
        |     19|  1|
        +-------+---+
      */
    //And further:
    words.map((word) =>(word,1)).groupByKey(value => value).count().show()

    /**
      * +-----------+--------+
        |        key|count(1)|
        +-----------+--------+
        |   [Andy,1]|       1|
        |[Michael,1]|       1|
        |    [ 29,1]|       1|
        | [Justin,1]|       1|
        |    [ 30,1]|       1|
        |    [ 19,1]|       1|
        +-----------+--------+
      */

    /**
      * Experimental.
      */
  }
  def dataSet_groupByKey()={
    val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    val words = tmpDataSet.flatMap(line => line.split(","))
    words.groupByKey(_.toLowerCase).count().show()

    /**
      * +-------+--------+
        |  value|count(1)|
        +-------+--------+
        |     29|       1|
        |   andy|       1|
        |michael|       1|
        | justin|       1|
        |     19|       1|
        |     30|       1|
        +-------+--------+
      */
    /**
      * Note: groupByKey is among the more expensive operations in Spark; prefer other
      * operators where possible (an untyped alternative is sketched after this method).
      */
  }
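  //Hedged sketch, not from the original post: the same word count through the untyped API.
  //A Dataset[String] exposes a single column named "value", so groupBy("value") reaches the
  //relational aggregation path instead of groupByKey.
  def dataSet_groupByKey_untyped() = {
    val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    val words = tmpDataSet.flatMap(line => line.split(","))
    words.groupBy("value").count().show()
  }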
  def dataSet_intersect()={
    //Returns a new Dataset containing rows only in both this Dataset and another Dataset. This is equivalent to INTERSECT in SQL (set intersection).
    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy",30),Person("lego",19)))
    dataSet.show()
    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
    dataSet.intersect(tmpDataSet).show()

    /**
      * +----+---+
        |name|age|
        +----+---+
        |Andy| 30|
        +----+---+
      */
  }
  def dataSet_joinWith()={
    /**
      * Experimental.
      */
    //Using inner equi-join to join this Dataset returning a Tuple2 for each pair where condition evaluates to true.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy",30),Person("lego",19)))
    dataSet.joinWith(tmpDataSet,tmpDataSet("name") ===  dataSet("name")).show()

    /**
      * +---------+---------+
        |       _1|       _2|
        +---------+---------+
        |[Andy,30]|[Andy,30]|
        +---------+---------+
      */
    dataSet.joinWith(tmpDataSet,tmpDataSet("age") ===  dataSet("age")).show()

    /**
      * +-----------+---------+
        |         _1|       _2|
        +-----------+---------+
        |  [Andy,30]|[Andy,30]|
        |[Justin,19]|[lego,19]|
        +-----------+---------+
      */
    dataSet.joinWith(tmpDataSet,tmpDataSet("age") ===  19).show()

    /**
      * +------------+---------+
        |          _1|       _2|
        +------------+---------+
        |[Michael,29]|[lego,19]|
        |   [Andy,30]|[lego,19]|
        | [Justin,19]|[lego,19]|
        +------------+---------+
      */
  }
  def dataSet_limit()={
    /**
      * Returns a new Dataset by taking the first n rows.
      * The difference between this function and head is that head is an action and returns an array
      * (by triggering query execution) while limit returns a new Dataset.
      */
    dataSet.show()
    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
    dataSet.limit(2).show()

    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        +-------+---+
      */

  }
  def dataSet_map()={
    //Returns a new Dataset that contains the result of applying func to each element.
    dataSet.map(person => person.age).show()
    /**
      * +-----+
        |value|
        +-----+
        |   29|
        |   30|
        |   19|
        +-----+
      */
  }
  def dataSet_mapPartitions()={
    //Pairs each element with its successor within a partition; T is a plain type parameter
    //(it is bound to Person only at the call site below). Empty partitions yield no pairs.
    def myfunc[T](iter: Iterator[T]): Iterator[(T, T)] = {
      if (!iter.hasNext) Iterator.empty
      else {
        var res = List[(T, T)]()
        var pre = iter.next
        while (iter.hasNext) {
          val cur = iter.next
          res .::= (pre, cur)
          pre = cur
        }
        res.iterator
      }
    }

    dataSet.mapPartitions(myfunc).show()


    /**
      * +------------+-----------+
        |          _1|         _2|
        +------------+-----------+
        |   [Andy,30]|[Justin,19]|
        |[Michael,29]|  [Andy,30]|
        +------------+-----------+
      */
    /**
      * Experimental.
      */
  }
  def dataSet_orderBy()={
    dataSet.show()
    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
    dataSet.orderBy($"age").show()
    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        | Justin| 19|
        |Michael| 29|
        |   Andy| 30|
        +-------+---+
      */
  }
  def dataSet_randomSplit()={
    dataSet.randomSplit(Array(0.6,0.4),0L).foreach{ds =>
      ds.show()
    }

    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |   Andy| 30|
        |Michael| 29|
        +-------+---+
      */
    /**
      * +------+---+
        |  name|age|
        +------+---+
        |Justin| 19|
        +------+---+
      */
  }
  def dataSet_randomSplitAsList()={
    //Returns a Java list that contains randomly split Dataset with the provided weights.
    println(dataSet.randomSplitAsList(Array(0.6,0.4),0L).size())
    //2
  }
  def dataSet_repartition()={
    def myfunc(index: Int, iter: Iterator[(Person)]) : Iterator[String] = {
      iter.toList.map(x => "[partID:" +  index + ", val: " + x + "]").iterator
    }
    dataSet.repartition(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)

    /**
      * [partID:0, val: Person(Michael,29)]
        [partID:0, val: Person(Justin,19)]
        [partID:1, val: Person(Andy,30)]
      */
    dataSet.repartition($"name").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)

    /**
      * [partID:71, val: Person(Michael,29)]
        [partID:164, val: Person(Andy,30)]
        [partID:169, val: Person(Justin,19)]
      */
    dataSet.repartition(2,$"name").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)

    /**
      * [partID:0, val: Person(Andy,30)]
        [partID:1, val: Person(Michael,29)]
        [partID:1, val: Person(Justin,19)]
      */
  }
  def dataSet_sample()={
    //Returns a new Dataset by sampling a fraction of rows, using a random seed.
    dataSet.sample(withReplacement = true,0.6,0L).show()

    /**
      * +----+---+
        |name|age|
        +----+---+
        |Andy| 30|
        +----+---+
      */
  }
  def dataSet_select()={
    dataSet.select($"name").show()
    /**
      * +-------+
        |   name|
        +-------+
        |Michael|
        |   Andy|
        | Justin|
        +-------+
      */
  }
  def dataSet_sort()={
    dataSet.show()
    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
    dataSet.sort($"name",$"age".desc).show()
    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |   Andy| 30|
        | Justin| 19|
        |Michael| 29|
        +-------+---+
      */
  }
  def dataSet_sortWithinPartitions()={
    def myfunc(index: Int, iter: Iterator[(Person)]) : Iterator[String] = {
      iter.toList.map(x => "[partID:" +  index + ", val: " + x + "]").iterator
    }
    dataSet.repartition(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)

    /**
      * [partID:0, val: Person(Michael,29)]
        [partID:0, val: Person(Justin,19)]
        [partID:1, val: Person(Andy,30)]
      */
    dataSet.repartition(2).sortWithinPartitions($"age").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)

    /**
      * [partID:0, val: Person(Justin,19)]
        [partID:0, val: Person(Michael,29)]
        [partID:1, val: Person(Andy,30)]
      */
  }
  def dataSet_transform()={
    //Concise syntax for chaining custom transformations.
    dataSet.show()
    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
    dataSet.transform{ p =>p.sort($"age".desc)}.show()

    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |   Andy| 30|
        |Michael| 29|
        | Justin| 19|
        +-------+---+
      */

  }
  def dataSet_union()={
    //Returns a new Dataset containing union of rows in this Dataset and another Dataset. This is equivalent to UNION ALL in SQL.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
    unionedDS.show()

    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        | Michael| 29|
        |    Andy| 30|
        |  Justin| 19|
        |legotime|100|
        |    lego| 19|
        |legotime|100|
        |    lego| 19|
        +--------+---+
      */
  }
  def dataSet_where()={
    dataSet.where($"age">20).show()
    dataSet.where("age > 20").show()
    dataSet.filter($"age">20).show()
    dataSet.filter("age >20").show()

    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        +-------+---+
      */
  }
  def dataSet_unionAll()={
    /**
      * Annotation: @deprecated
      * Deprecated (since version 2.0.0): use union() instead
      */
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
    val unionedDS = dataSet.unionAll(tmpDataSet).union(tmpDataSet)
    unionedDS.show()

    /**
      * +--------+---+
        |    name|age|
        +--------+---+
        | Michael| 29|
        |    Andy| 30|
        |  Justin| 19|
        |legotime|100|
        |    lego| 19|
        |legotime|100|
        |    lego| 19|
        +--------+---+
      */
  }
  //---------------------------------------------------------------Untyped transformations---------------------

  def dataSet_agg()={
    // import org.apache.spark.sql.functions._
    dataSet.groupBy($"age",$"name").agg(max($"name"), avg($"age")).show()
    /**
      * +---+-------+---------+--------+
        |age|   name|max(name)|avg(age)|
        +---+-------+---------+--------+
        | 29|Michael|  Michael|    29.0|
        | 30|   Andy|     Andy|    30.0|
        | 19| Justin|   Justin|    19.0|
        +---+-------+---------+--------+
      */
    dataSet.groupBy().agg(max($"name"), avg($"age")).show()
    dataSet.agg(max($"name"), avg($"age")).show()
    // dataSet.agg(...) is a shorthand for dataSet.groupBy().agg(...)
    /**
      * +---------+--------+
        |max(name)|avg(age)|
        +---------+--------+
        |  Michael|    26.0|
        +---------+--------+
      */
  }
  def dataSet_apply()={
    //Selects a column based on the column name and returns it as a Column. Note that the column name can also refer to a nested column such as a.b.
    println(dataSet.apply("age"))
    //age
  }
  def dataSet_col()={
    //Selects column based on the column name and return it as a Column.
    dataSet.select(col("age")).show()
    /**
      * +---+
        |age|
        +---+
        | 29|
        | 30|
        | 19|
        +---+
      */
  }
  def dataSet_cube()={
    //Create a multi-dimensional cube for the current Dataset using the specified columns, so we can run aggregation on them.
    dataSet.cube("age","name").agg(max($"age")).show()
    /**
      * +----+-------+--------+
        | age|   name|max(age)|
        +----+-------+--------+
        |null|Michael|      29|
        |null|   null|      30|
        |  29|Michael|      29|
        |  19|   null|      19|
        |  30|   Andy|      30|
        |  30|   null|      30|
        |null|   Andy|      30|
        |  19| Justin|      19|
        |  29|   null|      29|
        |null| Justin|      19|
        +----+-------+--------+
      */

  }
  def dataSet_drop()={
    //Returns a new Dataset with columns dropped. This is a no-op if schema doesn't contain column name(s).
    dataSet.drop("age").show()
    //Returns a new Dataset with a column dropped. This version of drop accepts a Column rather than a name.
    // This is a no-op if the Dataset doesn't have a column with an equivalent expression.
    dataSet.drop(col = col("age")).show()

    /**
      * +-------+
        |   name|
        +-------+
        |Michael|
        |   Andy|
        | Justin|
        +-------+
      */
  }
  def dataSet_groupBy()={
    dataSet.groupBy(col("age")).agg{Map(
      "age"->"avg",
      "name"->"max"
    )}.show()
    dataSet.groupBy($"age").agg{Map(
      "age"->"avg",
      "name"->"max"
    )}.show()

    /**
      * +---+--------+---------+
        |age|avg(age)|max(name)|
        +---+--------+---------+
        | 29|    29.0|  Michael|
        | 19|    19.0|   Justin|
        | 30|    30.0|     Andy|
        +---+--------+---------+
      */
  }
  def dataSet_join()={
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
    dataSet.join(tmpDataSet).show()

    /**
      * +-------+---+--------+---+
        |   name|age|    name|age|
        +-------+---+--------+---+
        |Michael| 29|legotime|100|
        |Michael| 29|    lego| 19|
        |   Andy| 30|legotime|100|
        |   Andy| 30|    lego| 19|
        | Justin| 19|legotime|100|
        | Justin| 19|    lego| 19|
        +-------+---+--------+---+
      */
    dataSet.join(tmpDataSet,"age").show()

    /**
      * +---+------+----+
        |age|  name|name|
        +---+------+----+
        | 19|Justin|lego|
        +---+------+----+
      */
    dataSet.join(tmpDataSet,Seq("age","name")).show()

    /**
      * +---+----+
        |age|name|
        +---+----+
        +---+----+
      */
  }
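  //Hedged sketch, not from the original post: join also accepts an explicit join expression
  //plus a join type string (e.g. "inner", "left_outer"); qualifying the columns through each
  //Dataset avoids ambiguity between the two "age" columns.
  def dataSet_join_withType() = {
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime",100),Person("lego",19)))
    dataSet.join(tmpDataSet, dataSet("age") === tmpDataSet("age"), "left_outer").show()
  }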
  def dataSet_na()={
    //Returns a DataFrameNaFunctions for working with missing data.
    dataSet.na.drop("all").show()

    /**
      * +-------+---+
        |   name|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
  }
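  //Hedged sketch, not from the original post: besides drop, DataFrameNaFunctions offers fill,
  //which replaces nulls in the named columns with the given values (a no-op here because the
  //sample data contains no nulls).
  def dataSet_na_fill() = {
    dataSet.na.fill(Map("age" -> 0L, "name" -> "unknown")).show()
  }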
  def dataSet_rollup()={
    //Create a multi-dimensional rollup for the current Dataset using the specified columns, so we can run aggregation on them
    dataSet.rollup("age", "name").avg().show()
    /**
      * +----+-------+--------+
        | age|   name|avg(age)|
        +----+-------+--------+
        |null|   null|    26.0|
        |  29|Michael|    29.0|
        |  19|   null|    19.0|
        |  30|   Andy|    30.0|
        |  30|   null|    30.0|
        |  19| Justin|    19.0|
        |  29|   null|    29.0|
        +----+-------+--------+
      */
  }
  def dataSet_select_2()={
    dataSet.select("age","name","age").show()

    /**
      * +---+-------+---+
        |age|   name|age|
        +---+-------+---+
        | 29|Michael| 29|
        | 30|   Andy| 30|
        | 19| Justin| 19|
        +---+-------+---+
      */
  }
  def dataSet_selectExpr()={
    //Selects a set of SQL expressions. This is a variant of select that accepts SQL expressions.
    dataSet.selectExpr("name","age+1","name as NAME","age as AGE").show()
    dataSet.select(expr("name"),expr("age+1"), expr("name as NAME"), expr("age as AGE"))

    /**
      * +-------+---------+-------+---+
        |   name|(age + 1)|   NAME|AGE|
        +-------+---------+-------+---+
        |Michael|       30|Michael| 29|
        |   Andy|       31|   Andy| 30|
        | Justin|       20| Justin| 19|
        +-------+---------+-------+---+
      */
  }

  def dataSet_stat()={
    //Returns a DataFrameStatFunctions object for working with statistic functions.
    //Note: sampleBy expects the fractions map to be keyed by values of the stratification
    //column; "age" and "name" are not values of the age column, so the result below is empty
    //(a corrected usage is sketched after this method).
    dataSet.stat.sampleBy("age",Map("age"->0.5,"name"->0.5),0L).show()

    /**
      * +----+---+
        |name|age|
        +----+---+
        +----+---+
      */
  }
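  //Hedged sketch, not from the original post: sampleBy keyed by actual age values. A fraction
  //of 1.0 keeps the whole stratum; ages missing from the map default to 0.0.
  def dataSet_stat_sampleBy() = {
    dataSet.stat.sampleBy("age", Map(19L -> 1.0, 29L -> 1.0), 0L).show()
  }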
  def dataSet_withColumn()={
    //Returns a new Dataset by adding a column or replacing the existing column that has the same name.
    dataSet.withColumn("NAME",col("name")).show()

    /**
      * +-------+---+
        |   NAME|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
  }
  def dataSet_withColumnRenamed()={
    dataSet.withColumnRenamed("name","newName").show()

    /**
      * +-------+---+
        |newName|age|
        +-------+---+
        |Michael| 29|
        |   Andy| 30|
        | Justin| 19|
        +-------+---+
      */
  }
  def dataSet_explode()={
    /**
      * Annotations: @deprecated
      * Deprecated (since version 2.0.0): use flatMap() or select() with functions.explode() instead
      */
    //explode itself is not called here; a sketch of the suggested replacement follows this method.
  }
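  //Hedged sketch, not from the original post, showing the replacement the deprecation note
  //suggests: functions.split produces an array column and functions.explode turns each array
  //element into its own row (mirroring the flatMap word-split example above).
  def dataSet_explode_replacement() = {
    val lines = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt")
    lines.select(explode(split($"value", ",")).as("token")).show()
  }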

  def main(args: Array[String]) {
    //dataSet_collect()
    //dataSet_collectAsList
    //dataSet_count
    //dataSet_describe
    //dataSet_first
    //dataSet_foreachPartition
    //dataSet_head
    //dataSet_reduce
    //dataSet_show
    //dataSet_toLocalIterator

    //dataSet_as
    //dataSet_cache
    //dataSet_columns
    //dataSet_createOrReplaceTempView
    //dataSet_createTempView
    //dataSet_dtypes
    //dataSet_explain
    //dataSet_inputFiles
    //println(dataSet_isLocal)
    //dataSet_isStreaming
    //dataSet_javaRDD
    //dataSet_persist
    //dataSet_printSchema
    //dataSet_rdd
    //dataSet_schema
    //dataSet_toDF
    //dataSet_toJavaRDD
    //dataSet_unpersist
    //dataSet_write
    //dataSet_writeStream
    //dataSet_registerTempTable

    //dataSet_alias
    //dataSet_coalesce
    //dataSet_distinct
    //dataSet_dropDuplicates
    //dataSet_except
    //dataSet_filter
    //dataSet_flatMap
    //dataSet_groupByKey
    //dataSet_intersect
    //dataSet_joinWith
    //dataSet_limit
    //dataSet_map
    dataSet_mapPartitions
    //dataSet_orderBy
    //dataSet_randomSplit
    //dataSet_randomSplitAsList
    //dataSet_repartition
    //dataSet_sample
    //dataSet_select
    //dataSet_sort
    //dataSet_sortWithinPartitions
    //dataSet_transform
    //dataSet_union
    //dataSet_where
    //dataSet_unionAll

    //dataSet_agg
    //dataSet_apply
    //dataSet_col
    //dataSet_cube
    //dataSet_drop
    //dataSet_groupBy
    //dataSet_join
    //dataSet_na
    //dataSet_rollup
    //dataSet_select_2
    //dataSet_selectExpr
    //dataSet_stat
    //dataSet_withColumn
    //dataSet_withColumnRenamed
    //dataSet_explode

  }
}