常用方法
show
scala> emp.show()
+----+------+-----+------+----------+---------+----+------+
|comm|deptno|empno| ename| hiredate| job| mgr| sal|
+----+------+-----+------+----------+---------+----+------+
|null| 20| 7369| SMITH|1980-12-17| CLERK|7902| 800.0|
| 300| 30| 7499| ALLEN|1981-02-20| SALESMAN|7698|1600.5|
| 500| 30| 7521| WARD|1981-02-22| SALESMAN|7698|1250.0|
|null| 20| 7566| JONES|1981-04-02| MANAGER|7839|2975.0|
|1400| 30| 7654|MARTIN|1981-09-28| SALESMAN|7698|1250.0|
|null| 30| 7698| BLAKE|1981-05-01| MANAGER|7839|2850.0|
|null| 10| 7782| CLARK|1981-06-09| MANAGER|7839|2450.0|
|null| 20| 7788| SCOTT|1987-04-19| ANALYST|7566|3000.0|
|null| 10| 7839| KING|1981-11-17|PRESIDENT|null|5000.0|
| 0| 30| 7844|TURNER|1981-09-08| SALESMAN|7698|1500.0|
|null| 20| 7876| ADAMS|1987-05-23| CLERK|7788|1100.0|
|null| 30| 7900| JAMES|1981-12-03| CLERK|7698| 950.0|
|null| 20| 7902| FORD|1981-12-02| ANALYST|7566|3000.0|
|null| 10| 7934|MILLER|1982-01-23| CLERK|7369|1300.0|
+----+------+-----+------+----------+---------+----+------+
scala> emp.show(3)
+----+------+-----+-----+----------+--------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+--------+----+------+
|null| 20| 7369|SMITH|1980-12-17| CLERK|7902| 800.0|
| 300| 30| 7499|ALLEN|1981-02-20|SALESMAN|7698|1600.5|
| 500| 30| 7521| WARD|1981-02-22|SALESMAN|7698|1250.0|
+----+------+-----+-----+----------+--------+----+------+
only showing top 3 rows
collect
scala> emp.collect
res8: Array[org.apache.spark.sql.Row] = Array([null,20,7369,SMITH,1980-12-17,CLERK,7902,800.0], [300,30,7499,ALLEN,1981-02-20,SALESMAN,7698,1600.5], [500,30,7521,WARD,1981-02-22,SALESMAN,7698,1250.0], [null,20,7566,JONES,1981-04-02,MANAGER,7839,2975.0], [1400,30,7654,MARTIN,1981-09-28,SALESMAN,7698,1250.0], [null,30,7698,BLAKE,1981-05-01,MANAGER,7839,2850.0], [null,10,7782,CLARK,1981-06-09,MANAGER,7839,2450.0], [null,20,7788,SCOTT,1987-04-19,ANALYST,7566,3000.0], [null,10,7839,KING,1981-11-17,PRESIDENT,null,5000.0], [0,30,7844,TURNER,1981-09-08,SALESMAN,7698,1500.0], [null,20,7876,ADAMS,1987-05-23,CLERK,7788,1100.0], [null,30,7900,JAMES,1981-12-03,CLERK,7698,950.0], [null,20,7902,FORD,1981-12-02,ANALYST,7566,3000.0], [null,10,7934,MILLER,1982-01-23,CLERK,7369,1300.0])
collectAsList
scala> emp.collectAsList
res9: java.util.List[org.apache.spark.sql.Row] = [[null,20,7369,SMITH,1980-12-17,CLERK,7902,800.0], [300,30,7499,ALLEN,1981-02-20,SALESMAN,7698,1600.5], [500,30,7521,WARD,1981-02-22,SALESMAN,7698,1250.0], [null,20,7566,JONES,1981-04-02,MANAGER,7839,2975.0], [1400,30,7654,MARTIN,1981-09-28,SALESMAN,7698,1250.0], [null,30,7698,BLAKE,1981-05-01,MANAGER,7839,2850.0], [null,10,7782,CLARK,1981-06-09,MANAGER,7839,2450.0], [null,20,7788,SCOTT,1987-04-19,ANALYST,7566,3000.0], [null,10,7839,KING,1981-11-17,PRESIDENT,null,5000.0], [0,30,7844,TURNER,1981-09-08,SALESMAN,7698,1500.0], [null,20,7876,ADAMS,1987-05-23,CLERK,7788,1100.0], [null,30,7900,JAMES,1981-12-03,CLERK,7698,950.0], [null,20,7902,FORD,1981-12-02,ANALYST,7566,3000.0], [null,10,7934,MILLER,1982-01-23,CLERK,7369,1300.0]]
first等
scala> emp.first
res10: org.apache.spark.sql.Row = [null,20,7369,SMITH,1980-12-17,CLERK,7902,800.0]
scala> emp.head
res11: org.apache.spark.sql.Row = [null,20,7369,SMITH,1980-12-17,CLERK,7902,800.0]
scala> emp.head(2)
res12: Array[org.apache.spark.sql.Row] = Array([null,20,7369,SMITH,1980-12-17,CLERK,7902,800.0], [300,30,7499,ALLEN,1981-02-20,SALESMAN,7698,1600.5])
scala> emp.take(2)
res13: Array[org.apache.spark.sql.Row] = Array([null,20,7369,SMITH,1980-12-17,CLERK,7902,800.0], [300,30,7499,ALLEN,1981-02-20,SALESMAN,7698,1600.5])
scala> emp.takeAsList(2)
res14: java.util.List[org.apache.spark.sql.Row] = [[null,20,7369,SMITH,1980-12-17,CLERK,7902,800.0], [300,30,7499,ALLEN,1981-02-20,SALESMAN,7698,1600.5]]
where
scala> emp.where("deptno=10").show
+----+------+-----+------+----------+---------+----+------+
|comm|deptno|empno| ename| hiredate| job| mgr| sal|
+----+------+-----+------+----------+---------+----+------+
|null| 10| 7782| CLARK|1981-06-09| MANAGER|7839|2450.0|
|null| 10| 7839| KING|1981-11-17|PRESIDENT|null|5000.0|
|null| 10| 7934|MILLER|1982-01-23| CLERK|7369|1300.0|
+----+------+-----+------+----------+---------+----+------+
scala> emp.where("deptno=10 and sal>1400").show
+----+------+-----+-----+----------+---------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+---------+----+------+
|null| 10| 7782|CLARK|1981-06-09| MANAGER|7839|2450.0|
|null| 10| 7839| KING|1981-11-17|PRESIDENT|null|5000.0|
+----+------+-----+-----+----------+---------+----+------+
filter
scala> emp.filter("deptno=10 and sal>1400").show
+----+------+-----+-----+----------+---------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+---------+----+------+
|null| 10| 7782|CLARK|1981-06-09| MANAGER|7839|2450.0|
|null| 10| 7839| KING|1981-11-17|PRESIDENT|null|5000.0|
+----+------+-----+-----+----------+---------+----+------+
select
scala> emp.select("empno","ename")show(3)
+-----+-----+
|empno|ename|
+-----+-----+
| 7369|SMITH|
| 7499|ALLEN|
| 7521| WARD|
+-----+-----+
only showing top 3 rows
scala> emp.select(emp("empno"),emp("sal")+1).show(3)
+-----+---------+
|empno|(sal + 1)|
+-----+---------+
| 7369| 801.0|
| 7499| 1601.5|
| 7521| 1251.0|
+-----+---------+
only showing top 3 rows
scala> emp.select(col("sal"),col("sal")+1).show(3)
+------+---------+
| sal|(sal + 1)|
+------+---------+
| 800.0| 801.0|
|1600.5| 1601.5|
|1250.0| 1251.0|
+------+---------+
only showing top 3 rows
selectExpr
scala> emp.selectExpr("ename","empno as no","round(sal)").show(4)
+-----+----+-------------+
|ename| no|round(sal, 0)|
+-----+----+-------------+
|SMITH|7369| 800.0|
|ALLEN|7499| 1601.0|
| WARD|7521| 1250.0|
|JONES|7566| 2975.0|
+-----+----+-------------+
only showing top 4 rows
col
scala> emp.select(col("ename")).show
+------+
| ename|
+------+
| SMITH|
| ALLEN|
| WARD|
| JONES|
|MARTIN|
| BLAKE|
| CLARK|
| SCOTT|
| KING|
|TURNER|
| ADAMS|
| JAMES|
| FORD|
|MILLER|
+------+
drop
scala> emp.drop("empno").show(3)
+----+------+-----+----------+--------+----+------+
|comm|deptno|ename| hiredate| job| mgr| sal|
+----+------+-----+----------+--------+----+------+
|null| 20|SMITH|1980-12-17| CLERK|7902| 800.0|
| 300| 30|ALLEN|1981-02-20|SALESMAN|7698|1600.5|
| 500| 30| WARD|1981-02-22|SALESMAN|7698|1250.0|
+----+------+-----+----------+--------+----+------+
only showing top 3 rows
limit
scala> emp.limit(3)
res26: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [comm: bigint, deptno: bigint ... 6 more fields]
scala> emp.limit(3).show
+----+------+-----+-----+----------+--------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+--------+----+------+
|null| 20| 7369|SMITH|1980-12-17| CLERK|7902| 800.0|
| 300| 30| 7499|ALLEN|1981-02-20|SALESMAN|7698|1600.5|
| 500| 30| 7521| WARD|1981-02-22|SALESMAN|7698|1250.0|
+----+------+-----+-----+----------+--------+----+------+
orderBy & sort
scala> emp.orderBy(-col("sal")).show(3)
+----+------+-----+-----+----------+---------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+---------+----+------+
|null| 10| 7839| KING|1981-11-17|PRESIDENT|null|5000.0|
|null| 20| 7788|SCOTT|1987-04-19| ANALYST|7566|3000.0|
|null| 20| 7902| FORD|1981-12-02| ANALYST|7566|3000.0|
+----+------+-----+-----+----------+---------+----+------+
only showing top 3 rows
scala> emp.sort(-col("sal")).show(3)
+----+------+-----+-----+----------+---------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+---------+----+------+
|null| 10| 7839| KING|1981-11-17|PRESIDENT|null|5000.0|
|null| 20| 7788|SCOTT|1987-04-19| ANALYST|7566|3000.0|
|null| 20| 7902| FORD|1981-12-02| ANALYST|7566|3000.0|
+----+------+-----+-----+----------+---------+----+------+
only showing top 3 rows
sortWithinPartitions
scala> emp.repartition(2).sortWithinPartitions("sal").show
+----+------+-----+------+----------+---------+----+------+
|comm|deptno|empno| ename| hiredate| job| mgr| sal|
+----+------+-----+------+----------+---------+----+------+
|null| 20| 7369| SMITH|1980-12-17| CLERK|7902| 800.0|
|null| 20| 7876| ADAMS|1987-05-23| CLERK|7788|1100.0|
|1400| 30| 7654|MARTIN|1981-09-28| SALESMAN|7698|1250.0|
| 500| 30| 7521| WARD|1981-02-22| SALESMAN|7698|1250.0|
|null| 10| 7934|MILLER|1982-01-23| CLERK|7369|1300.0|
|null| 20| 7566| JONES|1981-04-02| MANAGER|7839|2975.0|
|null| 20| 7788| SCOTT|1987-04-19| ANALYST|7566|3000.0|
|null| 30| 7900| JAMES|1981-12-03| CLERK|7698| 950.0|
| 0| 30| 7844|TURNER|1981-09-08| SALESMAN|7698|1500.0|
| 300| 30| 7499| ALLEN|1981-02-20| SALESMAN|7698|1600.5|
|null| 10| 7782| CLARK|1981-06-09| MANAGER|7839|2450.0|
|null| 30| 7698| BLAKE|1981-05-01| MANAGER|7839|2850.0|
|null| 20| 7902| FORD|1981-12-02| ANALYST|7566|3000.0|
|null| 10| 7839| KING|1981-11-17|PRESIDENT|null|5000.0|
+----+------+-----+------+----------+---------+----+------+
groupBy
scala> emp.groupBy(“deptno”).count.show
+------+-----+
|deptno|count|
+------+-----+
| 10| 3|
| 30| 6|
| 20| 5|
+------+-----+
cube
scala> emp.cube("deptno","job").sum("sal").show()
+------+---------+--------+
|deptno| job|sum(sal)|
+------+---------+--------+
| 20| MANAGER| 2975.0|
| null|PRESIDENT| 5000.0|
| null| null| 29025.5|
| 10|PRESIDENT| 5000.0|
| null| SALESMAN| 5600.5|
| 30| CLERK| 950.0|
| 10| null| 8750.0|
| 20| CLERK| 1900.0|
| null| ANALYST| 6000.0|
| 30| SALESMAN| 5600.5|
| 20| null| 10875.0|
| 10| CLERK| 1300.0|
| null| MANAGER| 8275.0|
| 30| null| 9400.5|
| null| CLERK| 4150.0|
| 20| ANALYST| 6000.0|
| 30| MANAGER| 2850.0|
| 10| MANAGER| 2450.0|
+------+---------+--------+
rollup
scala> emp.rollup("deptno","job").sum("sal").show()
+------+---------+--------+
|deptno| job|sum(sal)|
+------+---------+--------+
| 20| MANAGER| 2975.0|
| null| null| 29025.5|
| 10|PRESIDENT| 5000.0|
| 30| CLERK| 950.0|
| 10| null| 8750.0|
| 20| CLERK| 1900.0|
| 30| SALESMAN| 5600.5|
| 20| null| 10875.0|
| 10| CLERK| 1300.0|
| 30| null| 9400.5|
| 20| ANALYST| 6000.0|
| 30| MANAGER| 2850.0|
| 10| MANAGER| 2450.0|
+------+---------+--------+
max
scala> emp.groupBy("deptno").max("sal").show()
+------+--------+
|deptno|max(sal)|
+------+--------+
| 10| 5000.0|
| 30| 2850.0|
| 20| 3000.0|
+------+--------+
distinct
scala> emp.distinct.show(3)
+----+------+-----+-----+----------+-------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+-------+----+------+
|null| 20| 7876|ADAMS|1987-05-23| CLERK|7788|1100.0|
|null| 20| 7902| FORD|1981-12-02|ANALYST|7566|3000.0|
|null| 20| 7788|SCOTT|1987-04-19|ANALYST|7566|3000.0|
+----+------+-----+-----+----------+-------+----+------+
only showing top 3 rows
dropDuplicates
scala> emp.dropDuplicates("deptno").show
+----+------+-----+-----+----------+--------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+--------+----+------+
|null| 10| 7782|CLARK|1981-06-09| MANAGER|7839|2450.0|
| 300| 30| 7499|ALLEN|1981-02-20|SALESMAN|7698|1600.5|
|null| 20| 7369|SMITH|1980-12-17| CLERK|7902| 800.0|
+----+------+-----+-----+----------+--------+----+------+
union
scala> emp.limit(1).union(emp.limit(1)).show
+----+------+-----+-----+----------+-----+----+-----+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+-----+----+-----+
|null| 20| 7369|SMITH|1980-12-17|CLERK|7902|800.0|
|null| 20| 7369|SMITH|1980-12-17|CLERK|7902|800.0|
+----+------+-----+-----+----------+-----+----+-----+
join
scala> emp.join(emp,"ename").show
+------+----+------+-----+----------+---------+----+------+----+------+-----+----------+---------+----+------+
| ename|comm|deptno|empno| hiredate| job| mgr| sal|comm|deptno|empno| hiredate| job| mgr| sal|
+------+----+------+-----+----------+---------+----+------+----+------+-----+----------+---------+----+------+
| SMITH|null| 20| 7369|1980-12-17| CLERK|7902| 800.0|null| 20| 7369|1980-12-17| CLERK|7902| 800.0|
| ALLEN| 300| 30| 7499|1981-02-20| SALESMAN|7698|1600.5| 300| 30| 7499|1981-02-20| SALESMAN|7698|1600.5|
| WARD| 500| 30| 7521|1981-02-22| SALESMAN|7698|1250.0| 500| 30| 7521|1981-02-22| SALESMAN|7698|1250.0|
| JONES|null| 20| 7566|1981-04-02| MANAGER|7839|2975.0|null| 20| 7566|1981-04-02| MANAGER|7839|2975.0|
|MARTIN|1400| 30| 7654|1981-09-28| SALESMAN|7698|1250.0|1400| 30| 7654|1981-09-28| SALESMAN|7698|1250.0|
| BLAKE|null| 30| 7698|1981-05-01| MANAGER|7839|2850.0|null| 30| 7698|1981-05-01| MANAGER|7839|2850.0|
| CLARK|null| 10| 7782|1981-06-09| MANAGER|7839|2450.0|null| 10| 7782|1981-06-09| MANAGER|7839|2450.0|
| SCOTT|null| 20| 7788|1987-04-19| ANALYST|7566|3000.0|null| 20|
+------+----+------+-----+----------+---------+----+------+----+------+-----+----------+---------+----+------+
用多个字段连接.不一定要包装在Seq中,List,Array也可以
scala> emp.join(emp,Seq("ename","deptno")).show
+------+------+----+-----+----------+---------+----+------+----+-----+----------+---------+----+------+
| ename|deptno|comm|empno| hiredate| job| mgr| sal|comm|empno| hiredate| job| mgr| sal|
+------+------+----+-----+----------+---------+----+------+----+-----+----------+---------+----+------+
| SMITH| 20|null| 7369|1980-12-17| CLERK|7902| 800.0|null| 7369|1980-12-17| CLERK|7902| 800.0|
| ALLEN| 30| 300| 7499|1981-02-20| SALESMAN|7698|1600.5| 300| 7499|1981-02-20| SALESMAN|7698|1600.5|
| WARD| 30| 500| 7521|1981-02-22| SALESMAN|7698|1250.0| 500| 7521|1981-02-22| SALESMAN|7698|1250.0|
| JONES| 20|null| 7566|1981-04-02| MANAGER|7839|2975.0|null| 7566|1981-04-02| MANAGER|7839|2975.0|
|MARTIN| 30|1400| 7654|1981-09-28| SALESMAN|7698|1250.0|1400| 7654|1981-09-28| SALESMAN|7698|1250.0|
| BLAKE| 30|null| 7698|1981-05-01| MANAGER|7839|2850.0|null|
+------+------+----+-----+----------+---------+----+------+----+-----+----------+---------+----+------+
内连接
scala> emp.join(emp,Array("ename","deptno"),"inner").show(3)
+-----+------+----+-----+----------+--------+----+------+----+-----+----------+--------+----+------+
|ename|deptno|comm|empno| hiredate| job| mgr| sal|comm|empno| hiredate| job| mgr| sal|
+-----+------+----+-----+----------+--------+----+------+----+-----+----------+--------+----+------+
|SMITH| 20|null| 7369|1980-12-17| CLERK|7902| 800.0|null| 7369|1980-12-17| CLERK|7902| 800.0|
|ALLEN| 30| 300| 7499|1981-02-20|SALESMAN|7698|1600.5| 300| 7499|1981-02-20|SALESMAN|7698|1600.5|
| WARD| 30| 500| 7521|1981-02-22|SALESMAN|7698|1250.0| 500| 7521|1981-02-22|SALESMAN|7698|1250.0|
+-----+------+----+-----+----------+--------+----+------+----+-----+----------+--------+----+------+
换种写法
scala> emp.join(emp,emp("ename")===emp("ename")).show(3)
20/11/24 21:27:48 WARN sql.Column: Constructing trivially true equals predicate, 'ename#79 = ename#79'. Perhaps you need to use aliases.
+----+------+-----+-----+----------+--------+----+------+----+------+-----+-----+----------+--------+----+------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+--------+----+------+----+------+-----+-----+----------+--------+----+------+
|null| 20| 7369|SMITH|1980-12-17| CLERK|7902| 800.0|null| 20| 7369|SMITH|1980-12-17| CLERK|7902| 800.0|
| 300| 30| 7499|ALLEN|1981-02-20|SALESMAN|7698|1600.5| 300| 30| 7499|ALLEN|1981-02-20|SALESMAN|7698|1600.5|
| 500| 30| 7521| WARD|1981-02-22|SALESMAN|7698|1250.0| 500| 30| 7521| WARD|1981-02-22|SALESMAN|7698|1250.0|
+----+------+-----+-----+----------+--------+----+------+----+------+-----+-----+----------+--------+----+------+
求交集
scala> emp.intersect(emp.limit(1)).show()
+----+------+-----+-----+----------+-----+----+-----+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|
+----+------+-----+-----+----------+-----+----+-----+
|null| 20| 7369|SMITH|1980-12-17|CLERK|7902|800.0|
+----+------+-----+-----+----------+-----+----+-----+
一个有而另一个没有
scala> emp.except(emp.limit(1)).show()
+----+------+-----+------+----------+---------+----+------+
|comm|deptno|empno| ename| hiredate| job| mgr| sal|
+----+------+-----+------+----------+---------+----+------+
|null| 30| 7900| JAMES|1981-12-03| CLERK|7698| 950.0|
|null| 20| 7902| FORD|1981-12-02| ANALYST|7566|3000.0|
|1400| 30| 7654|MARTIN|1981-09-28| SALESMAN|7698|1250.0|
| 300| 30| 7499| ALLEN|1981-02-20| SALESMAN|7698|1600.5|
|null| 10| 7782| CLARK|1981-06-09| MANAGER|7839|2450.0|
|null| 20| 7788| SCOTT|1987-04-19| ANALYST|7566|3000.0|
| 500| 30| 7521| WARD|1981-02-22| SALESMAN|7698|1250.0|
|null| 10| 7934|MILLER|1982-01-23| CLERK|7369|1300.0|
|null| 10| 7839| KING|1981-11-17|PRESIDENT|null|5000.0|
| 0| 30| 7844|TURNER|1981-09-08| SALESMAN|7698|1500.0|
|null| 30| 7698| BLAKE|1981-05-01| MANAGER|7839|2850.0|
|null| 20| 7566| JONES|1981-04-02| MANAGER|7839|2975.0|
|null| 20| 7876| ADAMS|1987-05-23| CLERK|7788|1100.0|
+----+------+-----+------+----------+---------+----+------+
重命名
scala> emp.withColumnRenamed("deptno", "no").show(3)
+----+---+-----+-----+----------+--------+----+------+
|comm| no|empno|ename| hiredate| job| mgr| sal|
+----+---+-----+-----+----------+--------+----+------+
|null| 20| 7369|SMITH|1980-12-17| CLERK|7902| 800.0|
| 300| 30| 7499|ALLEN|1981-02-20|SALESMAN|7698|1600.5|
| 500| 30| 7521| WARD|1981-02-22|SALESMAN|7698|1250.0|
+----+---+-----+-----+----------+--------+----+------+
新增1列
scala> emp.withColumn("salary2",emp("sal")).show(2)
+----+------+-----+-----+----------+--------+----+------+-------+
|comm|deptno|empno|ename| hiredate| job| mgr| sal|salary2|
+----+------+-----+-----+----------+--------+----+------+-------+
|null| 20| 7369|SMITH|1980-12-17| CLERK|7902| 800.0| 800.0|
| 300| 30| 7499|ALLEN|1981-02-20|SALESMAN|7698|1600.5| 1600.5|
+----+------+-----+-----+----------+--------+----+------+-------+
两种风格的写法
DSL风格写法
//降序排列
val df1: Unit = df.select("empno","ename","deptno")
.orderBy($"deptno".desc).show()
SQL风格写法
df.createTempView("emp")
val sql="select * from emp where deptno<30"
private val frame: DataFrame = spark.sql(sql.stripMargin)
frame.show()