I. Transformation Operations
### --- select * from tab where ... group by ... having ... order by ...
### --- 1. The DataFrame/Dataset transformations cover RDD-like operations, persistence, and the SQL-style operators below (a rough mapping sketch follows this list):
~~~ caching and checkpoint
~~~ select
~~~ where
~~~ group by / aggregation
~~~ order by
~~~ join
~~~ set operations
~~~ null-value operations (functions)
~~~ functions
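~~~ # As a rough illustration of how the SQL skeleton above maps onto the DataFrame API, here is a minimal sketch (it assumes a DataFrame df1 with DEPTNO and SAL columns, spark.implicits._ in scope, and import org.apache.spark.sql.functions.avg):
df1.where($"sal" > 1000)                 // where sal > 1000
  .groupBy("deptno")                     // group by deptno
  .agg(avg("sal").as("avg_sal"))         // select deptno, avg(sal) as avg_sal
  .where($"avg_sal" > 2000)              // having avg(sal) > 2000
  .orderBy($"avg_sal".desc)              // order by avg_sal desc
  .show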
### --- 2. RDD-like operations
map, filter, flatMap, mapPartitions, sample, randomSplit, limit, distinct, dropDuplicates, describe
scala> df1.map(row=>row.getAs[Int](0)).show
+-----+
|value|
+-----+
| 7369|
| 7499|
| 7521|
| 7566|
| 7654|
| 7698|
| 7782|
| 7788|
| 7839|
| 7844|
| 7876|
| 7900|
| 7902|
| 7934|
+-----+
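~~~ # The other RDD-like operators listed above follow the same pattern; a minimal sketch against the same df1 (spark.implicits._ is assumed to be in scope, as it is by default in spark-shell):
df1.filter($"sal" > 2000).show                                    // filter: keep rows with SAL > 2000
df1.flatMap(row => row.getAs[String]("ENAME").split("")).show     // flatMap: split each name into single characters
df1.mapPartitions(iter => iter.map(_.getAs[Int](0))).show         // mapPartitions: transform one partition at a time
df1.sample(false, 0.5).show                                       // sample: roughly 50% of the rows, without replacement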
~~~ # randomSplit (similar to the RDD version: splits the DF/DS into multiple parts by the given weights; weights that do not sum to 1 are normalized, so the split sizes are approximate)
scala> val df2 = df1.randomSplit(Array(0.5, 0.6, 0.7))
df2: Array[org.apache.spark.sql.Dataset[org.apache.spark.sql.Row]] = Array([EMPNO: int, ENAME: string ... 6 more fields], [EMPNO: int, ENAME: string ... 6 more fields], [EMPNO: int, ENAME: string ... 6 more fields])
scala> df2(0).count
res76: Long = 2
scala> df2(1).count
res77: Long = 4
scala> df2(2).count
res78: Long = 8
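~~~ # For reproducible splits, randomSplit also accepts a seed; a minimal sketch:
val Array(train, test) = df1.randomSplit(Array(0.7, 0.3), seed = 42L)   // same split on every run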
~~~ # limit: take 10 rows to produce a new Dataset
scala> val df2 = df1.limit(10)
df2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [EMPNO: int, ENAME: string ... 6 more fields]
~~~ # distinct: remove duplicate rows
scala> val df2 = df1.union(df1)
df2: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [EMPNO: int, ENAME: string ... 6 more fields]
scala> df2.distinct.count
res79: Long = 14
~~~ # dropDuplicates: remove duplicates considering the given columns (with no arguments it behaves like distinct)
scala> df2.dropDuplicates.show
+-----+------+---------+----+-------------------+----+----+------+
|EMPNO| ENAME| JOB| MGR| HIREDATE| SAL|COMM|DEPTNO|
+-----+------+---------+----+-------------------+----+----+------+
| 7499| ALLEN| SALESMAN|7698|2002-01-02 22:12:13|1600| 300| 30|
| 7521| WARD| SALESMAN|7698|2003-01-02 22:12:13|1250| 500| 30|
| 7369| SMITH| CLERK|7902|2001-01-02 22:12:13| 800|null| 20|
| 7934|MILLER| CLERK|7782|2012-11-02 22:12:13|1300|null| 10|
| 7900| JAMES| CLERK|7698|2011-06-02 22:12:13| 950|null| 30|
| 7782| CLARK| MANAGER|7839|2006-03-02 22:12:13|2450|null| 10|
| 7566| JONES| MANAGER|7839|2004-01-02 22:12:13|2975|null| 20|
| 7698| BLAKE| MANAGER|7839|2005-04-02 22:12:13|2850|null| 30|
| 7654|MARTIN| SALESMAN|7698|2005-01-02 22:12:13|1250|1400| 30|
| 7876| ADAMS| CLERK|7788|2010-05-02 22:12:13|1100|null| 20|
| 7902| FORD| ANALYST|7566|2011-07-02 22:12:13|3000|null| 20|
| 7839| KING|PRESIDENT|null|2006-03-02 22:12:13|5000|null| 10|
| 7788| SCOTT| ANALYST|7566|2007-03-02 22:12:13|3000|null| 20|
| 7844|TURNER| SALESMAN|7698|2009-07-02 22:12:13|1500| 0| 30|
+-----+------+---------+----+-------------------+----+----+------+
scala> df2.dropDuplicates("mgr", "deptno").show
+-----+------+---------+----+-------------------+----+----+------+
|EMPNO| ENAME| JOB| MGR| HIREDATE| SAL|COMM|DEPTNO|
+-----+------+---------+----+-------------------+----+----+------+
| 7698| BLAKE| MANAGER|7839|2005-04-02 22:12:13|2850|null| 30|
| 7499| ALLEN| SALESMAN|7698|2002-01-02 22:12:13|1600| 300| 30|
| 7782| CLARK| MANAGER|7839|2006-03-02 22:12:13|2450|null| 10|
| 7876| ADAMS| CLERK|7788|2010-05-02 22:12:13|1100|null| 20|
| 7839| KING|PRESIDENT|null|2006-03-02 22:12:13|5000|null| 10|
| 7934|MILLER| CLERK|7782|2012-11-02 22:12:13|1300|null| 10|
| 7369| SMITH| CLERK|7902|2001-01-02 22:12:13| 800|null| 20|
| 7788| SCOTT| ANALYST|7566|2007-03-02 22:12:13|3000|null| 20|
| 7566| JONES| MANAGER|7839|2004-01-02 22:12:13|2975|null| 20|
+-----+------+---------+----+-------------------+----+----+------+
scala> df2.dropDuplicates("mgr").show
+-----+------+---------+----+-------------------+----+----+------+
|EMPNO| ENAME| JOB| MGR| HIREDATE| SAL|COMM|DEPTNO|
+-----+------+---------+----+-------------------+----+----+------+
| 7876| ADAMS| CLERK|7788|2010-05-02 22:12:13|1100|null| 20|
| 7369| SMITH| CLERK|7902|2001-01-02 22:12:13| 800|null| 20|
| 7839| KING|PRESIDENT|null|2006-03-02 22:12:13|5000|null| 10|
| 7788| SCOTT| ANALYST|7566|2007-03-02 22:12:13|3000|null| 20|
| 7499| ALLEN| SALESMAN|7698|2002-01-02 22:12:13|1600| 300| 30|
| 7566| JONES| MANAGER|7839|2004-01-02 22:12:13|2975|null| 20|
| 7934|MILLER| CLERK|7782|2012-11-02 22:12:13|1300|null| 10|
+-----+------+---------+----+-------------------+----+----+------+
scala> df2.dropDuplicates("deptno").show
+-----+-----+--------+----+-------------------+----+----+------+
|EMPNO|ENAME| JOB| MGR| HIREDATE| SAL|COMM|DEPTNO|
+-----+-----+--------+----+-------------------+----+----+------+
| 7369|SMITH| CLERK|7902|2001-01-02 22:12:13| 800|null| 20|
| 7782|CLARK| MANAGER|7839|2006-03-02 22:12:13|2450|null| 10|
| 7499|ALLEN|SALESMAN|7698|2002-01-02 22:12:13|1600| 300| 30|
+-----+-----+--------+----+-------------------+----+----+------+
~~~ # Return statistics for all columns (count, mean, stddev, min, max)
scala> ds1.describe().show
+-------+----+----------------+------------------+
|summary|name| age| height|
+-------+----+----------------+------------------+
| count| 3| 3| 3|
| mean|null| 18.0|164.33333333333334|
| stddev|null|9.16515138991168|20.008331597945226|
| min|Andy| 10| 144|
| max| Tom| 28| 184|
+-------+----+----------------+------------------+
~~~ # Return statistics for the specified columns (here "*" selects all columns)
scala> ds1.describe("*").show
+-------+----+----------------+------------------+
|summary|name| age| height|
+-------+----+----------------+------------------+
| count| 3| 3| 3|
| mean|null| 18.0|164.33333333333334|
| stddev|null|9.16515138991168|20.008331597945226|
| min|Andy| 10| 144|
| max| Tom| 28| 184|
+-------+----+----------------+------------------+
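~~~ # describe also accepts an explicit list of column names; a minimal sketch against the same ds1:
ds1.describe("age", "height").show   // statistics for the age and height columns only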
### --- 3. Storage-related operations
~~~ cacheTable, persist, checkpoint, unpersist, cache
~~~ Note: the default storage level of a Dataset is MEMORY_AND_DISK
scala> import org.apache.spark.storage.StorageLevel
import org.apache.spark.storage.StorageLevel
scala> spark.sparkContext.setCheckpointDir("hdfs://hadoop01:9000/checkpoint")
scala> df1.show()
+-----+------+---------+----+-------------------+----+----+------+
|EMPNO| ENAME| JOB| MGR| HIREDATE| SAL|COMM|DEPTNO|
+-----+------+---------+----+-------------------+----+----+------+
| 7369| SMITH| CLERK|7902|2001-01-02 22:12:13| 800|null| 20|
| 7499| ALLEN| SALESMAN|7698|2002-01-02 22:12:13|1600| 300| 30|
| 7521| WARD| SALESMAN|7698|2003-01-02 22:12:13|1250| 500| 30|
| 7566| JONES| MANAGER|7839|2004-01-02 22:12:13|2975|null| 20|
| 7654|MARTIN| SALESMAN|7698|2005-01-02 22:12:13|1250|1400| 30|
| 7698| BLAKE| MANAGER|7839|2005-04-02 22:12:13|2850|null| 30|
| 7782| CLARK| MANAGER|7839|2006-03-02 22:12:13|2450|null| 10|
| 7788| SCOTT| ANALYST|7566|2007-03-02 22:12:13|3000|null| 20|
| 7839| KING|PRESIDENT|null|2006-03-02 22:12:13|5000|null| 10|
| 7844|TURNER| SALESMAN|7698|2009-07-02 22:12:13|1500| 0| 30|
| 7876| ADAMS| CLERK|7788|2010-05-02 22:12:13|1100|null| 20|
| 7900| JAMES| CLERK|7698|2011-06-02 22:12:13| 950|null| 30|
| 7902| FORD| ANALYST|7566|2011-07-02 22:12:13|3000|null| 20|
| 7934|MILLER| CLERK|7782|2012-11-02 22:12:13|1300|null| 10|
+-----+------+---------+----+-------------------+----+----+------+
scala> df1.checkpoint()
res36: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [EMPNO: int, ENAME: string ... 6 more fields]
scala> df1.cache()
res37: df1.type = [EMPNO: int, ENAME: string ... 6 more fields]
scala> df1.persist(StorageLevel.MEMORY_ONLY)
21/10/20 15:45:46 WARN CacheManager: Asked to cache already cached data.
res38: df1.type = [EMPNO: int, ENAME: string ... 6 more fields]
scala> df1.count()
res39: Long = 14
scala> df1.unpersist(true)
res40: df1.type = [EMPNO: int, ENAME: string ... 6 more fields]
scala> df1.createOrReplaceTempView("t1")
scala> spark.catalog.cacheTable("t1")
scala> spark.catalog.uncacheTable("t1")
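~~~ # Whether a registered table is currently cached can be checked through the Catalog API; a minimal sketch:
spark.catalog.isCached("t1")   // false at this point, since t1 was just uncached
df1.storageLevel               // the Dataset's current storage level (NONE when not persisted)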
### --- 4. select-related operations
~~~ Multiple ways to refer to a column; select; selectExpr
~~~ drop, withColumn, withColumnRenamed, cast (built-in function)
~~~ # Ways to refer to a column: "colName", $"colName", 'colName, col("colName"), ds("colName")
~~~ # Note: do not mix the notations; import spark.implicits._ where needed; not every notation is valid in every context
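~~~ # A minimal sketch of the imports these notations rely on (the SparkSession is assumed to be named spark, as in spark-shell):
import org.apache.spark.sql.functions.{col, expr}   // enables col("...") and expr("...")
import spark.implicits._                             // enables $"colName" and 'colName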
scala> df1.select($"ename", $"hiredate", $"sal").show
+------+-------------------+----+
| ename| hiredate| sal|
+------+-------------------+----+
| SMITH|2001-01-02 22:12:13| 800|
| ALLEN|2002-01-02 22:12:13|1600|
| WARD|2003-01-02 22:12:13|1250|
| JONES|2004-01-02 22:12:13|2975|
|MARTIN|2005-01-02 22:12:13|1250|
| BLAKE|2005-04-02 22:12:13|2850|
| CLARK|2006-03-02 22:12:13|2450|
| SCOTT|2007-03-02 22:12:13|3000|
| KING|2006-03-02 22:12:13|5000|
|TURNER|2009-07-02 22:12:13|1500|
| ADAMS|2010-05-02 22:12:13|1100|
| JAMES|2011-06-02 22:12:13| 950|
| FORD|2011-07-02 22:12:13|3000|
|MILLER|2012-11-02 22:12:13|1300|
+------+-------------------+----+
scala> df1.select("ename", "hiredate", "sal").show
+------+-------------------+----+
| ename| hiredate| sal|
+------+-------------------+----+
| SMITH|2001-01-02 22:12:13| 800|
| ALLEN|2002-01-02 22:12:13|1600|
| WARD|2003-01-02 22:12:13|1250|
| JONES|2004-01-02 22:12:13|2975|
|MARTIN|2005-01-02 22:12:13|1250|
| BLAKE|2005-04-02 22:12:13|2850|
| CLARK|2006-03-02 22:12:13|2450|
| SCOTT|2007-03-02 22:12:13|3000|
| KING|2006-03-02 22:12:13|5000|
|TURNER|2009-07-02 22:12:13|1500|
| ADAMS|2010-05-02 22:12:13|1100|
| JAMES|2011-06-02 22:12:13| 950|
| FORD|2011-07-02 22:12:13|3000|
|MILLER|2012-11-02 22:12:13|1300|
+------+-------------------+----+
scala> df1.select('ename, 'hiredate, 'sal).show
+------+-------------------+----+
| ename| hiredate| sal|
+------+-------------------+----+
| SMITH|2001-01-02 22:12:13| 800|
| ALLEN|2002-01-02 22:12:13|1600|
| WARD|2003-01-02 22:12:13|1250|
| JONES|2004-01-02 22:12:13|2975|
|MARTIN|2005-01-02 22:12:13|1250|
| BLAKE|2005-04-02 22:12:13|2850|
| CLARK|2006-03-02 22:12:13|2450|
| SCOTT|2007-03-02 22:12:13|3000|
| KING|2006-03-02 22:12:13|5000|
|TURNER|2009-07-02 22:12:13|1500|
| ADAMS|2010-05-02 22:12:13|1100|
| JAMES|2011-06-02 22:12:13| 950|
| FORD|2011-07-02 22:12:13|3000|
|MILLER|2012-11-02 22:12:13|1300|
+------+-------------------+----+
scala> df1.select(col("ename"), col("hiredate"), col("sal")).show
+------+-------------------+----+
| ename| hiredate| sal|
+------+-------------------+----+
| SMITH|2001-01-02 22:12:13| 800|
| ALLEN|2002-01-02 22:12:13|1600|
| WARD|2003-01-02 22:12:13|1250|
| JONES|2004-01-02 22:12:13|2975|
|MARTIN|2005-01-02 22:12:13|1250|
| BLAKE|2005-04-02 22:12:13|2850|
| CLARK|2006-03-02 22:12:13|2450|
| SCOTT|2007-03-02 22:12:13|3000|
| KING|2006-03-02 22:12:13|5000|
|TURNER|2009-07-02 22:12:13|1500|
| ADAMS|2010-05-02 22:12:13|1100|
| JAMES|2011-06-02 22:12:13| 950|
| FORD|2011-07-02 22:12:13|3000|
|MILLER|2012-11-02 22:12:13|1300|
+------+-------------------+----+
scala> df1.select(df1("ename"), df1("hiredate"), df1("sal")).show
+------+-------------------+----+
| ename| hiredate| sal|
+------+-------------------+----+
| SMITH|2001-01-02 22:12:13| 800|
| ALLEN|2002-01-02 22:12:13|1600|
| WARD|2003-01-02 22:12:13|1250|
| JONES|2004-01-02 22:12:13|2975|
|MARTIN|2005-01-02 22:12:13|1250|
| BLAKE|2005-04-02 22:12:13|2850|
| CLARK|2006-03-02 22:12:13|2450|
| SCOTT|2007-03-02 22:12:13|3000|
| KING|2006-03-02 22:12:13|5000|
|TURNER|2009-07-02 22:12:13|1500|
| ADAMS|2010-05-02 22:12:13|1100|
| JAMES|2011-06-02 22:12:13| 950|
| FORD|2011-07-02 22:12:13|3000|
|MILLER|2012-11-02 22:12:13|1300|
+------+-------------------+----+
~~~ # The following forms do not work ("sal"+100 concatenates into the nonexistent column name "sal100", and "sal+100" is looked up as a literal column name); the other column notations do:
scala> df1.select("ename", "hiredate", "sal"+100).show
scala> df1.select("ename", "hiredate", "sal+100").show
~~~ # These forms are valid:
scala> df1.select($"ename", $"hiredate", $"sal"+100).show
+------+-------------------+-----------+
| ename| hiredate|(sal + 100)|
+------+-------------------+-----------+
| SMITH|2001-01-02 22:12:13| 900|
| ALLEN|2002-01-02 22:12:13| 1700|
| WARD|2003-01-02 22:12:13| 1350|
| JONES|2004-01-02 22:12:13| 3075|
|MARTIN|2005-01-02 22:12:13| 1350|
| BLAKE|2005-04-02 22:12:13| 2950|
| CLARK|2006-03-02 22:12:13| 2550|
| SCOTT|2007-03-02 22:12:13| 3100|
| KING|2006-03-02 22:12:13| 5100|
|TURNER|2009-07-02 22:12:13| 1600|
| ADAMS|2010-05-02 22:12:13| 1200|
| JAMES|2011-06-02 22:12:13| 1050|
| FORD|2011-07-02 22:12:13| 3100|
|MILLER|2012-11-02 22:12:13| 1400|
+------+-------------------+-----------+
scala> df1.select('ename, 'hiredate, 'sal+100).show
+------+-------------------+-----------+
| ename| hiredate|(sal + 100)|
+------+-------------------+-----------+
| SMITH|2001-01-02 22:12:13| 900|
| ALLEN|2002-01-02 22:12:13| 1700|
| WARD|2003-01-02 22:12:13| 1350|
| JONES|2004-01-02 22:12:13| 3075|
|MARTIN|2005-01-02 22:12:13| 1350|
| BLAKE|2005-04-02 22:12:13| 2950|
| CLARK|2006-03-02 22:12:13| 2550|
| SCOTT|2007-03-02 22:12:13| 3100|
| KING|2006-03-02 22:12:13| 5100|
|TURNER|2009-07-02 22:12:13| 1600|
| ADAMS|2010-05-02 22:12:13| 1200|
| JAMES|2011-06-02 22:12:13| 1050|
| FORD|2011-07-02 22:12:13| 3100|
|MILLER|2012-11-02 22:12:13| 1400|
+------+-------------------+-----------+
~~~ # expr() expressions can also be used (the argument to expr is a quoted SQL expression string)
scala> df1.select(expr("comm+100"), expr("sal+100"), expr("ename")).show
+------------+-----------+------+
|(comm + 100)|(sal + 100)| ename|
+------------+-----------+------+
| null| 900| SMITH|
| 400| 1700| ALLEN|
| 600| 1350| WARD|
| null| 3075| JONES|
| 1500| 1350|MARTIN|
| null| 2950| BLAKE|
| null| 2550| CLARK|
| null| 3100| SCOTT|
| null| 5100| KING|
| 100| 1600|TURNER|
| null| 1200| ADAMS|
| null| 1050| JAMES|
| null| 3100| FORD|
| null| 1400|MILLER|
+------------+-----------+------+
scala> df1.selectExpr("ename as name").show
+------+
| name|
+------+
| SMITH|
| ALLEN|
| WARD|
| JONES|
|MARTIN|
| BLAKE|
| CLARK|
| SCOTT|
| KING|
|TURNER|
| ADAMS|
| JAMES|
| FORD|
|MILLER|
+------+
scala> df1.selectExpr("power(sal, 2)", "sal").show
+---------------------------------------------+----+
|POWER(CAST(sal AS DOUBLE), CAST(2 AS DOUBLE))| sal|
+---------------------------------------------+----+
| 640000.0| 800|
| 2560000.0|1600|
| 1562500.0|1250|
| 8850625.0|2975|
| 1562500.0|1250|
| 8122500.0|2850|
| 6002500.0|2450|
| 9000000.0|3000|
| 2.5E7|5000|
| 2250000.0|1500|
| 1210000.0|1100|
| 902500.0| 950|
| 9000000.0|3000|
| 1690000.0|1300|
+---------------------------------------------+----+
scala> df1.selectExpr("round(sal, -3) as newsal", "sal", "ename").show
+------+----+------+
|newsal| sal| ename|
+------+----+------+
| 1000| 800| SMITH|
| 2000|1600| ALLEN|
| 1000|1250| WARD|
| 3000|2975| JONES|
| 1000|1250|MARTIN|
| 3000|2850| BLAKE|
| 2000|2450| CLARK|
| 3000|3000| SCOTT|
| 5000|5000| KING|
| 2000|1500|TURNER|
| 1000|1100| ADAMS|
| 1000| 950| JAMES|
| 3000|3000| FORD|
| 1000|1300|MILLER|
+------+----+------+
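~~~ # cast (listed above among the built-in functions) can be used inside selectExpr the same way; a minimal sketch, not part of the transcript:
df1.selectExpr("cast(sal as double) as sal_d", "cast(hiredate as date) as hire_day").show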
~~~ # drop, withColumn, withColumnRenamed, cast
~~~ # drop: remove one or more columns, producing a new DataFrame
scala> df1.drop("mgr")
res42: org.apache.spark.sql.DataFrame = [EMPNO: int, ENAME: string ... 5 more fields]
scala> df1.drop("empno", "mgr")
res43: org.apache.spark.sql.DataFrame = [ENAME: string, JOB: string ... 4 more fields]
~~~ # withColumn: modify a column's values
scala> val df2 = df1.withColumn("sal", $"sal"+1000)
df2: org.apache.spark.sql.DataFrame = [EMPNO: int, ENAME: string ... 6 more fields]
scala> df2.show
+-----+------+---------+----+-------------------+----+----+------+
|EMPNO| ENAME| JOB| MGR| HIREDATE| sal|COMM|DEPTNO|
+-----+------+---------+----+-------------------+----+----+------+
| 7369| SMITH| CLERK|7902|2001-01-02 22:12:13|1800|null| 20|
| 7499| ALLEN| SALESMAN|7698|2002-01-02 22:12:13|2600| 300| 30|
| 7521| WARD| SALESMAN|7698|2003-01-02 22:12:13|2250| 500| 30|
| 7566| JONES| MANAGER|7839|2004-01-02 22:12:13|3975|null| 20|
| 7654|MARTIN| SALESMAN|7698|2005-01-02 22:12:13|2250|1400| 30|
| 7698| BLAKE| MANAGER|7839|2005-04-02 22:12:13|3850|null| 30|
| 7782| CLARK| MANAGER|7839|2006-03-02 22:12:13|3450|null| 10|
| 7788| SCOTT| ANALYST|7566|2007-03-02 22:12:13|4000|null| 20|
| 7839| KING|PRESIDENT|null|2006-03-02 22:12:13|6000|null| 10|
| 784