package Dataset

import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

/**
 * Created by legotime
 */
object dataSetOperation {
  case class Person(name: String, age: Long)

  val sparkSession = SparkSession.builder().appName("data set example")
    .master("local").getOrCreate()
  import sparkSession.implicits._

  val rdd = sparkSession.sparkContext.textFile("hdfs://master:9000/src/main/resources/people.txt")
  val dataSet = rdd.map(_.split(",")).map(p => Person(p(0), p(1).trim.toLong)).toDS()

  //--------------------------------------------------------------- Actions --------------------------------------

  def dataSet_collect() = {
    // Returns an array that contains all rows of this Dataset.
    dataSet.collect().foreach(println)
    /**
     * Person(Michael,29)
     * Person(Andy,30)
     * Person(Justin,19)
     */
  }

  def dataSet_collectAsList() = {
    // Returns a Java list that contains all rows of this Dataset.
    println(dataSet.collectAsList)
    /**
     * [Person(Michael,29), Person(Andy,30), Person(Justin,19)]
     */
  }

  def dataSet_count() = {
    // Returns the number of rows in the Dataset.
    println(dataSet.count())
    /**
     * 3
     */
  }

  def dataSet_describe() = {
    // Computes statistics for numeric columns, including count, mean, stddev, min, and max.
    dataSet.describe("name", "age").show
    /** Computes basic statistics for the given columns:
     * +-------+-------+-----------------+
     * |summary|   name|              age|
     * +-------+-------+-----------------+
     * |  count|      3|                3|
     * |   mean|   null|             26.0|
     * | stddev|   null|6.082762530298219|
     * |    min|   Andy|               19|
     * |    max|Michael|               30|
     * +-------+-------+-----------------+
     */
  }

  def dataSet_first() = {
    // Returns the first row.
    println(dataSet.first())
    /**
     * Person(Michael,29)
     */
  }

  def dataSet_foreachPartition() = {
    // Applies a function f to each partition of this Dataset.
    dataSet.foreachPartition { part =>
      println(part.toList)
    }
    /**
     * List(Person(Michael,29), Person(Andy,30), Person(Justin,19))
     */
  }

  def dataSet_head() = {
    // Returns the first n rows.
    dataSet.head(2).foreach(println)
    /**
     * Person(Michael,29)
     * Person(Andy,30)
     */
  }

  def dataSet_reduce() = {
    // (Scala-specific) Reduces the elements of this Dataset using the specified binary function.
    val data: Dataset[String] = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    println(data.reduce(_ + _))
    /**
     * Michael, 29Andy, 30Justin, 19
     */
  }

  def dataSet_show() = {
    println("---------- show() prints all rows by default (small Dataset) ---------------")
    dataSet.show()
    println("---------- only showing top 2 rows ---------------")
    dataSet.show(2)
    println("--------- displays the top 20 rows of the Dataset in a tabular form ------------")
    dataSet.show(true)
  }

  def dataSet_toLocalIterator() = {
    // Returns an iterator that contains all rows of this Dataset.
    val tmp = dataSet.toLocalIterator()
    while (tmp.hasNext) {
      println(tmp.next())
    }
    /**
     * Person(Michael,29)
     * Person(Andy,30)
     * Person(Justin,19)
     */
  }

  //--------------------------------------------------------------- Basic Dataset functions ---------------------

  def dataSet_as() = {
    // Returns a new Dataset where each record has been mapped onto the specified type.
    val data: DataFrame = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt")
    data.as[String]
    data.show()
    /**
     * +-----------+
     * |      value|
     * +-----------+
     * |Michael, 29|
     * |   Andy, 30|
     * | Justin, 19|
     * +-----------+
     */
  }

  def dataSet_cache() = {
    dataSet.cache()
    /**
     * Caches the Dataset.
     */
  }

  def dataSet_columns() = {
    // Returns all column names as an array.
    dataSet.columns.foreach(println)
    /**
     * name
     * age
     */
  }

  def dataSet_createOrReplaceTempView() = {
    // Creates a temporary view using the given name.
    dataSet.createOrReplaceTempView("myPerson")
    val dataFrame = sparkSession.sql("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.show()
    /**
     * +------+---+
     * |  name|age|
     * +------+---+
     * |Justin| 19|
     * +------+---+
     */
  }

  def dataSet_createTempView() = {
    // Creates a temporary view using the given name.
    dataSet.createOrReplaceTempView("myPerson")
    val dataFrame = sparkSession.sql("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.map(teenager => "Name: " + teenager(0)).show()
    /**
     * +------------+
     * |       value|
     * +------------+
     * |Name: Justin|
     * +------------+
     */
  }

  def dataSet_dtypes() = {
    // Returns all column names and their data types as an array.
    dataSet.dtypes.foreach(println)
    /**
     * (name,StringType)
     * (age,LongType)
     */
  }

  def dataSet_explain() = {
    // Prints the physical plan to the console for debugging purposes.
    //dataSet.explain()
    /**
     * == Physical Plan ==
     * Scan ExistingRDD[name#2,age#3L]
     */
    // Prints the plans (logical and physical) to the console for debugging purposes.
    dataSet.explain(true)
    /**
     * == Parsed Logical Plan ==
     * LogicalRDD [name#2, age#3L]
     *
     * == Analyzed Logical Plan ==
     * name: string, age: bigint
     * LogicalRDD [name#2, age#3L]
     *
     * == Optimized Logical Plan ==
     * LogicalRDD [name#2, age#3L]
     *
     * == Physical Plan ==
     * Scan ExistingRDD[name#2,age#3L]
     */
  }

  def dataSet_inputFiles() = {
    println(dataSet.inputFiles.toList)
    //List()
  }

  def dataSet_isLocal() = {
    // Returns true if the collect and take methods can be run locally (without any Spark executors).
    dataSet.isLocal
    //false
  }

  def dataSet_isStreaming() = {
    dataSet.isStreaming
  }

  def dataSet_javaRDD() = {
    // Returns the content of the Dataset as a JavaRDD of Ts.
    println(dataSet.toJavaRDD)
    //MapPartitionsRDD[7] at toJavaRDD at dataSetOperation.scala:222
  }

  def dataSet_persist() = {
    // Persist this Dataset with the given storage level.
    dataSet.persist()
    /**
     * The default storage level is MEMORY_AND_DISK.
     */
  }

  def dataSet_printSchema() = {
    // Prints the schema to the console in a nice tree format.
    dataSet.printSchema()
    /**
     * root
     *  |-- name: string (nullable = true)
     *  |-- age: long (nullable = false)
     */
  }

  def dataSet_rdd() = {
    // Represents the content of the Dataset as an RDD of T.
    println(dataSet.rdd)
    /** Returns the content in RDD form:
     * MapPartitionsRDD[7] at rdd at dataSetOperation.scala:243
     */
  }

  def dataSet_schema() = {
    // Returns the schema of this Dataset.
    println(dataSet.schema)
    /**
     * StructType(StructField(name,StringType,true), StructField(age,LongType,false))
     */
  }

  def dataSet_toDF() = {
    // Converts this strongly typed collection of data to a generic DataFrame.
    dataSet.toDF().show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
    // Converts this strongly typed collection of data to a generic DataFrame with columns renamed.
    dataSet.toDF("man", "ID").show() // the number of new column names must match the original
    /**
     * +-------+---+
     * |    man| ID|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
  }

  def dataSet_toJavaRDD() = {
    // Returns the content of the Dataset as a JavaRDD of Ts.
    println(dataSet.toJavaRDD)
    //MapPartitionsRDD[7] at toJavaRDD at dataSetOperation.scala:285
  }

  def dataSet_unpersist() = {
    // Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk.
    dataSet.unpersist(true)
    //dataSet.unpersist()
  }

  def dataSet_write() = {
    // Interface for saving the content of the non-streaming Dataset out into external storage.
    dataSet.write
    /**
     * Experimental (at the time of writing).
     */
  }

  def dataSet_writeStream() = {
    // Interface for saving the content of the streaming Dataset out into external storage.
    dataSet.writeStream
    /**
     * Experimental (at the time of writing).
     */
  }

  def dataSet_registerTempTable() = {
    // Registers this Dataset as a temporary table using the given name.
    // The lifetime of this temporary table is tied to the SparkSession that was used to create this Dataset.
    dataSet.registerTempTable("myPerson")
    val dataFrame = sparkSession.sql("SELECT name, age FROM myPerson WHERE age BETWEEN 13 AND 19")
    dataFrame.map(teenager => "Name: " + teenager(0)).show()
    /** Deprecated: superseded by createTempView and slated for removal in later versions.
     * +------------+
     * |       value|
     * +------------+
     * |Name: Justin|
     * +------------+
     */
  }

  //--------------------------------------------------------------- Typed transformations ----------------------

  def dataSet_AS() = {
    val tmpDS: Dataset[Person] = dataSet.as("oldDataSet")
  }

  def dataSet_alias() = {
    /** alias simply delegates to as:
     * def alias(alias: Symbol): Dataset[T] = as(alias)
     * def alias(alias: String): Dataset[T] = as(alias)
     *
     * [name: string, age: bigint]
     */
  }

  def dataSet_coalesce() = {
    // Internally: Repartition(numPartitions, shuffle = false, logicalPlan).
    // Resets the number of partitions of the Dataset, just like coalesce on an RDD.
    // Because coalesce does not shuffle, it can only reduce the number of partitions;
    // with this tiny input everything sits in a single partition, so coalesce(2) has no effect.
    def myfunc(index: Int, iter: Iterator[(Person)]): Iterator[String] = {
      iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
    }
    dataSet.coalesce(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
     * [partID:0, val: Person(Michael,29)]
     * [partID:0, val: Person(Andy,30)]
     * [partID:0, val: Person(Justin,19)]
     */
    println(dataSet.coalesce(2).toJavaRDD.rdd.partitions.length)
    /**
     * 1
     */
  }

  def dataSet_distinct() = {
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime", 100)))
    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
    unionedDS.show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * | Michael| 29|
     * |    Andy| 30|
     * |  Justin| 19|
     * |legotime|100|
     * |legotime|100|
     * +--------+---+
     */
    unionedDS.distinct().show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * |    Andy| 30|
     * |legotime|100|
     * | Michael| 29|
     * |  Justin| 19|
     * +--------+---+
     */
    // distinct shuffles (and sorts) the data internally.
  }

  def dataSet_dropDuplicates() = {
    //def dropDuplicates(): Dataset[T]
    //def distinct(): Dataset[T]
    //def dropDuplicates(colNames: Array[String]): Dataset[T]
    // Returns a new Dataset with duplicate rows removed, considering only the subset of columns.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime", 100), Person("lego", 19)))
    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
    unionedDS.show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * | Michael| 29|
     * |    Andy| 30|
     * |  Justin| 19|
     * |legotime|100|
     * |    lego| 19|
     * |legotime|100|
     * |    lego| 19|
     * +--------+---+
     */
    unionedDS.dropDuplicates().show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * |    Andy| 30|
     * |    lego| 19|
     * |legotime|100|
     * | Michael| 29|
     * |  Justin| 19|
     * +--------+---+
     */
    unionedDS.dropDuplicates("name").show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * | Michael| 29|
     * |    Andy| 30|
     * |    lego| 19|
     * |legotime|100|
     * |  Justin| 19|
     * +--------+---+
     */
    unionedDS.dropDuplicates("age").show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * | Michael| 29|
     * |  Justin| 19|
     * |legotime|100|
     * |    Andy| 30|
     * +--------+---+
     */
    unionedDS.dropDuplicates(Array("name", "age")).show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * |    Andy| 30|
     * |    lego| 19|
     * |legotime|100|
     * | Michael| 29|
     * |  Justin| 19|
     * +--------+---+
     */
  }

  def dataSet_except() = {
    // Returns a new Dataset containing rows in this Dataset but not in another Dataset. This is equivalent to EXCEPT in SQL.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy", 30), Person("lego", 19)))
    dataSet.except(tmpDataSet).show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * | Justin| 19|
     * +-------+---+
     */
  }

  def dataSet_filter() = {
    //def filter(func: (T) => Boolean): Dataset[T]
    //def filter(conditionExpr: String): Dataset[T]
    //def filter(condition: Column): Dataset[T]
    dataSet.filter($"age" > 20).show()
    dataSet.filter("age > 20").show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * +-------+---+
     */
  }

  def dataSet_flatMap() = {
    //def flatMap[U](func: (T) => TraversableOnce[U])(implicit arg0: Encoder[U]): Dataset[U]
    /** Fails with a serialization error on the Person Dataset; hopefully addressed in a later release:
     * dataSet.flatMap{ P => P.toString }.show()
     */
    val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    val words = tmpDataSet.flatMap(line => line.split(","))
    words.show()
    /**
     * +-------+
     * |  value|
     * +-------+
     * |Michael|
     * |     29|
     * |   Andy|
     * |     30|
     * | Justin|
     * |     19|
     * +-------+
     */
    // Whenever you need to work on the individual values inside each row, flatMap is the tool;
    // once the data is flattened, many follow-up operations become possible, for example:
    words.map((word) => (word, 1)).show()
    /**
     * +-------+---+
     * |     _1| _2|
     * +-------+---+
     * |Michael|  1|
     * |     29|  1|
     * |   Andy|  1|
     * |     30|  1|
     * | Justin|  1|
     * |     19|  1|
     * +-------+---+
     */
    // Or:
    words.map((word) => (word, 1)).groupByKey(value => value).count().show()
    /**
     * +-----------+--------+
     * |        key|count(1)|
     * +-----------+--------+
     * |   [Andy,1]|       1|
     * |[Michael,1]|       1|
     * |   [ 29,1]|       1|
     * | [Justin,1]|       1|
     * |   [ 30,1]|       1|
     * |   [ 19,1]|       1|
     * +-----------+--------+
     */
    /**
     * Experimental (at the time of writing).
     */
  }

  def dataSet_groupByKey() = {
    val tmpDataSet = sparkSession.read.text("hdfs://master:9000/src/main/resources/people.txt").as[String]
    val words = tmpDataSet.flatMap(line => line.split(","))
    words.groupByKey(_.toLowerCase).count().show()
    /**
     * +-------+--------+
     * |  value|count(1)|
     * +-------+--------+
     * |     29|       1|
     * |   andy|       1|
     * |michael|       1|
     * | justin|       1|
     * |     19|       1|
     * |     30|       1|
     * +-------+--------+
     */
    /**
     * Note: groupByKey is one of the least efficient operators in Spark; prefer other operators where possible.
     */
  }

  def dataSet_intersect() = {
    // Returns a new Dataset containing rows only in both this Dataset and another Dataset. This is equivalent to INTERSECT in SQL.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy", 30), Person("lego", 19)))
    dataSet.show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
    dataSet.intersect(tmpDataSet).show()
    /**
     * +----+---+
     * |name|age|
     * +----+---+
     * |Andy| 30|
     * +----+---+
     */
  }

  def dataSet_joinWith() = {
    /**
     * Experimental (at the time of writing).
     */
    // Uses an inner equi-join, returning a Tuple2 for each pair where the condition evaluates to true.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("Andy", 30), Person("lego", 19)))
    dataSet.joinWith(tmpDataSet, tmpDataSet("name") === dataSet("name")).show()
    /**
     * +---------+---------+
     * |       _1|       _2|
     * +---------+---------+
     * |[Andy,30]|[Andy,30]|
     * +---------+---------+
     */
    dataSet.joinWith(tmpDataSet, tmpDataSet("age") === dataSet("age")).show()
    /**
     * +-----------+---------+
     * |         _1|       _2|
     * +-----------+---------+
     * |  [Andy,30]|[Andy,30]|
     * |[Justin,19]|[lego,19]|
     * +-----------+---------+
     */
    dataSet.joinWith(tmpDataSet, tmpDataSet("age") === 19).show()
    /**
     * +------------+---------+
     * |          _1|       _2|
     * +------------+---------+
     * |[Michael,29]|[lego,19]|
     * |   [Andy,30]|[lego,19]|
     * | [Justin,19]|[lego,19]|
     * +------------+---------+
     */
  }

  def dataSet_limit() = {
    /**
     * Returns a new Dataset by taking the first n rows.
     * The difference between this function and head is that head is an action and returns an array
     * (by triggering query execution) while limit returns a new Dataset.
     */
    dataSet.show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
    dataSet.limit(2).show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * +-------+---+
     */
  }

  def dataSet_map() = {
    // Returns a new Dataset that contains the result of applying func to each element.
    dataSet.map(person => person.age).show()
    /**
     * +-----+
     * |value|
     * +-----+
     * |   29|
     * |   30|
     * |   19|
     * +-----+
     */
  }

  def dataSet_mapPartitions() = {
    // Pairs up consecutive elements within each partition.
    def myfunc[Person](iter: Iterator[Person]): Iterator[(Person, Person)] = {
      var res = List[(Person, Person)]()
      var pre = iter.next
      while (iter.hasNext) {
        val cur = iter.next
        res .::= (pre, cur)
        pre = cur
      }
      res.iterator
    }
    dataSet.mapPartitions(myfunc).show()
    /**
     * +------------+-----------+
     * |          _1|         _2|
     * +------------+-----------+
     * |   [Andy,30]|[Justin,19]|
     * |[Michael,29]|  [Andy,30]|
     * +------------+-----------+
     */
    /**
     * Experimental (at the time of writing).
     */
  }

  def dataSet_orderBy() = {
    dataSet.show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
    dataSet.orderBy($"age").show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * | Justin| 19|
     * |Michael| 29|
     * |   Andy| 30|
     * +-------+---+
     */
  }

  def dataSet_randomSplit() = {
    dataSet.randomSplit(Array(0.6, 0.4), 0L).foreach { ds =>
      ds.show()
    }
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |   Andy| 30|
     * |Michael| 29|
     * +-------+---+
     */
    /**
     * +------+---+
     * |  name|age|
     * +------+---+
     * |Justin| 19|
     * +------+---+
     */
  }

  def dataSet_randomSplitAsList() = {
    // Returns a Java list that contains randomly split Datasets with the provided weights.
    println(dataSet.randomSplitAsList(Array(0.6, 0.4), 0L).size())
    //2
  }

  def dataSet_repartition() = {
    def myfunc(index: Int, iter: Iterator[(Person)]): Iterator[String] = {
      iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
    }
    dataSet.repartition(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
     * [partID:0, val: Person(Michael,29)]
     * [partID:0, val: Person(Justin,19)]
     * [partID:1, val: Person(Andy,30)]
     */
    dataSet.repartition($"name").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
     * [partID:71, val: Person(Michael,29)]
     * [partID:164, val: Person(Andy,30)]
     * [partID:169, val: Person(Justin,19)]
     */
    dataSet.repartition(2, $"name").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
     * [partID:0, val: Person(Andy,30)]
     * [partID:1, val: Person(Michael,29)]
     * [partID:1, val: Person(Justin,19)]
     */
  }

  def dataSet_sample() = {
    // Returns a new Dataset by sampling a fraction of rows, using a random seed.
    dataSet.sample(withReplacement = true, 0.6, 0L).show()
    /**
     * +----+---+
     * |name|age|
     * +----+---+
     * |Andy| 30|
     * +----+---+
     */
  }

  def dataSet_select() = {
    dataSet.select($"name").show()
    /**
     * +-------+
     * |   name|
     * +-------+
     * |Michael|
     * |   Andy|
     * | Justin|
     * +-------+
     */
  }

  def dataSet_sort() = {
    dataSet.show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
    dataSet.sort($"name", $"age".desc).show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |   Andy| 30|
     * | Justin| 19|
     * |Michael| 29|
     * +-------+---+
     */
  }

  def dataSet_sortWithinPartitions() = {
    def myfunc(index: Int, iter: Iterator[(Person)]): Iterator[String] = {
      iter.toList.map(x => "[partID:" + index + ", val: " + x + "]").iterator
    }
    dataSet.repartition(2).toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
     * [partID:0, val: Person(Michael,29)]
     * [partID:0, val: Person(Justin,19)]
     * [partID:1, val: Person(Andy,30)]
     */
    dataSet.repartition(2).sortWithinPartitions($"age").toJavaRDD.rdd.mapPartitionsWithIndex(myfunc).collect().foreach(println)
    /**
     * [partID:0, val: Person(Justin,19)]
     * [partID:0, val: Person(Michael,29)]
     * [partID:1, val: Person(Andy,30)]
     */
  }

  def dataSet_transform() = {
    // Concise syntax for chaining custom transformations.
    dataSet.show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
    dataSet.transform { p => p.sort($"age".desc) }.show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |   Andy| 30|
     * |Michael| 29|
     * | Justin| 19|
     * +-------+---+
     */
  }

  def dataSet_union() = {
    // Returns a new Dataset containing union of rows in this Dataset and another Dataset. This is equivalent to UNION ALL in SQL.
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime", 100), Person("lego", 19)))
    val unionedDS = dataSet.union(tmpDataSet).union(tmpDataSet)
    unionedDS.show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * | Michael| 29|
     * |    Andy| 30|
     * |  Justin| 19|
     * |legotime|100|
     * |    lego| 19|
     * |legotime|100|
     * |    lego| 19|
     * +--------+---+
     */
  }

  def dataSet_where() = {
    dataSet.where($"age" > 20).show()
    dataSet.where("age > 20").show()
    dataSet.filter($"age" > 20).show()
    dataSet.filter("age > 20").show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * +-------+---+
     */
  }

  def dataSet_unionAll() = {
    /**
     * Annotation @deprecated (Since version 2.0.0) use union() instead.
     */
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime", 100), Person("lego", 19)))
    val unionedDS = dataSet.unionAll(tmpDataSet).union(tmpDataSet)
    unionedDS.show()
    /**
     * +--------+---+
     * |    name|age|
     * +--------+---+
     * | Michael| 29|
     * |    Andy| 30|
     * |  Justin| 19|
     * |legotime|100|
     * |    lego| 19|
     * |legotime|100|
     * |    lego| 19|
     * +--------+---+
     */
  }

  //--------------------------------------------------------------- Untyped transformations ---------------------

  def dataSet_agg() = {
    // import org.apache.spark.sql.functions._
    dataSet.groupBy($"age", $"name").agg(max($"name"), avg($"age")).show()
    /**
     * +---+-------+---------+--------+
     * |age|   name|max(name)|avg(age)|
     * +---+-------+---------+--------+
     * | 29|Michael|  Michael|    29.0|
     * | 30|   Andy|     Andy|    30.0|
     * | 19| Justin|   Justin|    19.0|
     * +---+-------+---------+--------+
     */
    dataSet.groupBy().agg(max($"name"), avg($"age")).show()
    dataSet.agg(max($"name"), avg($"age")).show()
    // dataSet.agg(...) is a shorthand for dataSet.groupBy().agg(...)
    /**
     * +---------+--------+
     * |max(name)|avg(age)|
     * +---------+--------+
     * |  Michael|    26.0|
     * +---------+--------+
     */
  }

  def dataSet_apply() = {
    // Selects a column based on the column name and returns it as a Column.
    // Note that the column name can also reference a nested column like a.b.
    println(dataSet.apply("age"))
    //age
  }

  def dataSet_col() = {
    // Selects a column based on the column name and returns it as a Column.
    dataSet.select(col("age")).show()
    /**
     * +---+
     * |age|
     * +---+
     * | 29|
     * | 30|
     * | 19|
     * +---+
     */
  }

  def dataSet_cube() = {
    // Create a multi-dimensional cube for the current Dataset using the specified columns, so we can run aggregation on them.
    dataSet.cube("age", "name").agg(max($"age")).show()
    /**
     * +----+-------+--------+
     * | age|   name|max(age)|
     * +----+-------+--------+
     * |null|Michael|      29|
     * |null|   null|      30|
     * |  29|Michael|      29|
     * |  19|   null|      19|
     * |  30|   Andy|      30|
     * |  30|   null|      30|
     * |null|   Andy|      30|
     * |  19| Justin|      19|
     * |  29|   null|      29|
     * |null| Justin|      19|
     * +----+-------+--------+
     */
  }

  def dataSet_drop() = {
    // Returns a new Dataset with columns dropped. This is a no-op if the schema doesn't contain the column name(s).
    dataSet.drop("age").show()
    // Returns a new Dataset with a column dropped. This version of drop accepts a Column rather than a name.
    // This is a no-op if the Dataset doesn't have a column with an equivalent expression.
    dataSet.drop(col = col("age")).show()
    /**
     * +-------+
     * |   name|
     * +-------+
     * |Michael|
     * |   Andy|
     * | Justin|
     * +-------+
     */
  }

  def dataSet_groupBy() = {
    dataSet.groupBy(col("age")).agg {
      Map(
        "age" -> "avg",
        "name" -> "max"
      )
    }.show()
    dataSet.groupBy($"age").agg {
      Map(
        "age" -> "avg",
        "name" -> "max"
      )
    }.show()
    /**
     * +---+--------+---------+
     * |age|avg(age)|max(name)|
     * +---+--------+---------+
     * | 29|    29.0|  Michael|
     * | 19|    19.0|   Justin|
     * | 30|    30.0|     Andy|
     * +---+--------+---------+
     */
  }

  def dataSet_join() = {
    val tmpDataSet = sparkSession.createDataset(Seq(Person("legotime", 100), Person("lego", 19)))
    dataSet.join(tmpDataSet).show()
    /**
     * +-------+---+--------+---+
     * |   name|age|    name|age|
     * +-------+---+--------+---+
     * |Michael| 29|legotime|100|
     * |Michael| 29|    lego| 19|
     * |   Andy| 30|legotime|100|
     * |   Andy| 30|    lego| 19|
     * | Justin| 19|legotime|100|
     * | Justin| 19|    lego| 19|
     * +-------+---+--------+---+
     */
    dataSet.join(tmpDataSet, "age").show()
    /**
     * +---+------+----+
     * |age|  name|name|
     * +---+------+----+
     * | 19|Justin|lego|
     * +---+------+----+
     */
    dataSet.join(tmpDataSet, Seq("age", "name")).show()
    /**
     * +---+----+
     * |age|name|
     * +---+----+
     * +---+----+
     */
  }

  def dataSet_na() = {
    // Returns a DataFrameNaFunctions for working with missing data.
    dataSet.na.drop("all").show()
    /**
     * +-------+---+
     * |   name|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
  }

  def dataSet_rollup() = {
    // Create a multi-dimensional rollup for the current Dataset using the specified columns, so we can run aggregation on them.
    dataSet.rollup("age", "name").avg().show()
    /**
     * +----+-------+--------+
     * | age|   name|avg(age)|
     * +----+-------+--------+
     * |null|   null|    26.0|
     * |  29|Michael|    29.0|
     * |  19|   null|    19.0|
     * |  30|   Andy|    30.0|
     * |  30|   null|    30.0|
     * |  19| Justin|    19.0|
     * |  29|   null|    29.0|
     * +----+-------+--------+
     */
  }

  def dataSet_select_2() = {
    dataSet.select("age", "name", "age").show()
    /**
     * +---+-------+---+
     * |age|   name|age|
     * +---+-------+---+
     * | 29|Michael| 29|
     * | 30|   Andy| 30|
     * | 19| Justin| 19|
     * +---+-------+---+
     */
  }

  def dataSet_selectExpr() = {
    // Selects a set of SQL expressions. This is a variant of select that accepts SQL expressions.
    dataSet.selectExpr("name", "age+1", "name as NAME", "age as AGE").show()
    dataSet.select(expr("name"), expr("age+1"), expr("name as NAME"), expr("age as AGE")).show()
    /**
     * +-------+---------+-------+---+
     * |   name|(age + 1)|   NAME|AGE|
     * +-------+---------+-------+---+
     * |Michael|       30|Michael| 29|
     * |   Andy|       31|   Andy| 30|
     * | Justin|       20| Justin| 19|
     * +-------+---------+-------+---+
     */
  }

  def dataSet_stat() = {
    // Returns a DataFrameStatFunctions for statistic functions support.
    // Note: the keys of the fractions map must be values of the "age" column; since no row has
    // an age equal to "age" or "name", the sampled result here is empty.
    dataSet.stat.sampleBy("age", Map("age" -> 0.5, "name" -> 0.5), 0L).show()
    /**
     * +----+---+
     * |name|age|
     * +----+---+
     * +----+---+
     */
  }

  def dataSet_withColumn() = {
    // Returns a new Dataset by adding a column or replacing the existing column that has the same name.
    dataSet.withColumn("NAME", col("name")).show()
    /**
     * +-------+---+
     * |   NAME|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
  }

  def dataSet_withColumnRenamed() = {
    dataSet.withColumnRenamed("name", "newName").show()
    /**
     * +-------+---+
     * |newName|age|
     * +-------+---+
     * |Michael| 29|
     * |   Andy| 30|
     * | Justin| 19|
     * +-------+---+
     */
  }

  def dataSet_explode() = {
    /**
     * Annotation @deprecated (Since version 2.0.0) use flatMap() or select() with functions.explode() instead.
     */
    dataSet.explain()
  }

  def main(args: Array[String]) {
    //dataSet_collect()
    //dataSet_collectAsList
    //dataSet_count
    //dataSet_describe
    //dataSet_first
    //dataSet_foreachPartition
    //dataSet_head
    //dataSet_reduce
    //dataSet_show
    //dataSet_toLocalIterator
    //dataSet_as
    //dataSet_cache
    //dataSet_columns
    //dataSet_createOrReplaceTempView
    //dataSet_createTempView
    //dataSet_dtypes
    //dataSet_explain
    //dataSet_inputFiles
    //println(dataSet_isLocal)
    //dataSet_isStreaming
    //dataSet_javaRDD
    //dataSet_persist
    //dataSet_printSchema
    //dataSet_rdd
    //dataSet_schema
    //dataSet_toDF
    //dataSet_toJavaRDD
    //dataSet_unpersist
    //dataSet_write
    //dataSet_writeStream
    //dataSet_registerTempTable
    //dataSet_alias
    //dataSet_coalesce
    //dataSet_distinct
    //dataSet_dropDuplicates
    //dataSet_except
    //dataSet_filter
    //dataSet_flatMap
    //dataSet_groupByKey
    //dataSet_intersect
    //dataSet_joinWith
    //dataSet_limit
    //dataSet_map
    dataSet_mapPartitions
    //dataSet_orderBy
    //dataSet_randomSplit
    //dataSet_randomSplitAsList
    //dataSet_repartition
    //dataSet_sample
    //dataSet_select
    //dataSet_sort
    //dataSet_sortWithinPartitions
    //dataSet_transform
    //dataSet_union
    //dataSet_where
    //dataSet_unionAll
    //dataSet_agg
    //dataSet_apply
    //dataSet_col
    //dataSet_cube
    //dataSet_drop
    //dataSet_groupBy
    //dataSet_join
    //dataSet_na
    //dataSet_rollup
    //dataSet_select_2
    //dataSet_selectExpr
    //dataSet_stat
    //dataSet_withColumn
    //dataSet_withColumnRenamed
    //dataSet_explode
  }
}
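A note on running the examples: the listing above reads people.txt from hdfs://master:9000/src/main/resources/people.txt; judging from the outputs shown, its contents are the standard Spark sample rows Michael, 29 / Andy, 30 / Justin, 19. If no HDFS cluster is available, a minimal sketch of the same setup built entirely in memory might look like the following (the object name and the inline data here are illustrative assumptions, not part of the original code):

import org.apache.spark.sql.{Dataset, SparkSession}

object dataSetLocalDemo {
  case class Person(name: String, age: Long)

  def main(args: Array[String]): Unit = {
    // Local SparkSession; no HDFS required.
    val spark = SparkSession.builder()
      .appName("data set example (local)")
      .master("local")
      .getOrCreate()
    import spark.implicits._

    // The same three rows that people.txt provides in the examples above.
    val dataSet: Dataset[Person] = Seq(
      Person("Michael", 29),
      Person("Andy", 30),
      Person("Justin", 19)
    ).toDS()

    dataSet.show()                      // tabular view of the three rows
    dataSet.filter($"age" > 20).show()  // same result as dataSet_filter above
    println(dataSet.count())            // 3

    spark.stop()
  }
}

Any of the dataSet_* methods in the listing can be exercised this way by pointing them at an in-memory Dataset instead of the HDFS path.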
Spark source code reading notes, Dataset (Part 2): Actions, basic functions, and transformations on Dataset