Working with Data Using DataFrames
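
This section walks through basic DataFrame operations in the Spark shell. Starting from the classic emp.csv employee file, we load the data as an RDD, map each row onto a case class, convert the result to a DataFrame with toDF, and then query it with select, filter, and groupBy.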

scala> val lines=sc.textFile("/opt/data/emp.csv").map(_.split(","))
lines: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[8] at map at <console>:24

scala> lines.collect
res2: Array[Array[String]] = Array(Array(7369, SMITH, CLERK, 7902, 1980/12/17, 800, 0, 20), Array(7499, ALLEN, SALESMAN, 7698, 1981/2/20, 1600, 300, 30), Array(7521, WARD, SALESMAN, 7698, 1981/2/22, 1250, 500, 30), Array(7566, JONES, MANAGER, 7839, 1981/4/2, 2975, 0, 20), Array(7654, MARTIN, SALESMAN, 7698, 1981/9/28, 1250, 1400, 30), Array(7698, BLAKE, MANAGER, 7839, 1981/5/1, 2850, 0, 30), Array(7782, CLARK, MANAGER, 7839, 1981/6/9, 2450, 0, 10), Array(7788, SCOTT, ANALYST, 7566, 1987/4/19, 3000, 0, 20), Array(7839, KING, PRESIDENT, 7839, 1981/11/17, 5000, 0, 10), Array(7844, TURNER, SALESMAN, 7698, 1981/9/8, 1500, 0, 30), Array(7876, ADAMS, CLERK, 7788, 1987/5/23, 1100, 0, 20), Array(7900, JAMES, CLERK, 7698, 1981/12/3, 950, 0, 30), Array(7902, FORD, ANALYST, 7566, 1981/12/3, 3000, 0...
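
Each line of the file is now an Array[String]. To attach a schema, define a case class whose fields match the CSV columns (employee number, name, job, manager, hire date, salary, commission, department number):
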
scala> case class Emp(empno:Int,ename:String,job:String,mgr:Int,hiredate:String,sal:Int,comm:Int,deptno:Int)
defined class Emp

scala> val allEmp=lines.map(x=>
     |       Emp(x(0).toInt,x(1).toString,x(2).toString,x(3).toInt,x(4).toString,x(5).toInt,x(6).toInt,x(7).toInt)
     |     )
allEmp: org.apache.spark.rdd.RDD[Emp] = MapPartitionsRDD[9] at map at <console>:28
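
With an RDD[Emp] in hand, toDF infers the column names and types from the case class. (In the Spark shell the required import spark.implicits._ is already in scope; in a standalone program you would add it yourself.)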

scala> val df1=allEmp.toDF
df1: org.apache.spark.sql.DataFrame = [empno: int, ename: string ... 6 more fields]

scala> df1.show
+-----+------+---------+----+----------+----+----+------+
|empno| ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH|    CLERK|7902|1980/12/17| 800|   0|    20|
| 7499| ALLEN| SALESMAN|7698| 1981/2/20|1600| 300|    30|
| 7521|  WARD| SALESMAN|7698| 1981/2/22|1250| 500|    30|
| 7566| JONES|  MANAGER|7839|  1981/4/2|2975|   0|    20|
| 7654|MARTIN| SALESMAN|7698| 1981/9/28|1250|1400|    30|
| 7698| BLAKE|  MANAGER|7839|  1981/5/1|2850|   0|    30|
| 7782| CLARK|  MANAGER|7839|  1981/6/9|2450|   0|    10|
| 7788| SCOTT|  ANALYST|7566| 1987/4/19|3000|   0|    20|
| 7839|  KING|PRESIDENT|7839|1981/11/17|5000|   0|    10|
| 7844|TURNER| SALESMAN|7698|  1981/9/8|1500|   0|    30|
| 7876| ADAMS|    CLERK|7788| 1987/5/23|1100|   0|    20|
| 7900| JAMES|    CLERK|7698| 1981/12/3| 950|   0|    30|
| 7902|  FORD|  ANALYST|7566| 1981/12/3|3000|   0|    20|
| 7934|MILLER|    CLERK|7782| 1982/1/23|1300|   0|    10|
+-----+------+---------+----+----------+----+----+------+
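
As a quick sanity check (not part of the original session), printSchema shows what toDF inferred from the case class; the Int fields should come out as non-nullable integers and the String fields as nullable strings:

// Sanity check, assuming df1 as built above:
df1.printSchema
// Expected shape of the output:
// root
//  |-- empno: integer (nullable = false)
//  |-- ename: string (nullable = true)
// ... and so on for the remaining columns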


scala> df1.select("ename").show
+------+
| ename|
+------+
| SMITH|
| ALLEN|
|  WARD|
| JONES|
|MARTIN|
| BLAKE|
| CLARK|
| SCOTT|
|  KING|
|TURNER|
| ADAMS|
| JAMES|
|  FORD|
|MILLER|
+------+


scala> df1.select("ename","sal").show
+------+----+
| ename| sal|
+------+----+
| SMITH| 800|
| ALLEN|1600|
|  WARD|1250|
| JONES|2975|
|MARTIN|1250|
| BLAKE|2850|
| CLARK|2450|
| SCOTT|3000|
|  KING|5000|
|TURNER|1500|
| ADAMS|1100|
| JAMES| 950|
|  FORD|3000|
|MILLER|1300|
+------+----+


scala> df1.select($"ename",$"sal"+100).show
+------+-----------+
| ename|(sal + 100)|
+------+-----------+
| SMITH|        900|
| ALLEN|       1700|
|  WARD|       1350|
| JONES|       3075|
|MARTIN|       1350|
| BLAKE|       2950|
| CLARK|       2550|
| SCOTT|       3100|
|  KING|       5100|
|TURNER|       1600|
| ADAMS|       1200|
| JAMES|       1050|
|  FORD|       3100|
|MILLER|       1400|
+------+-----------+
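
The $"col" syntax comes from spark.implicits._ (pre-imported in the shell) and produces a Column, which is why arithmetic such as $"sal"+100 works. Spark names the computed column after the expression; alias can give it a friendlier header. A minimal sketch (the column name sal_plus_100 is just an example):

// Same query, renaming the computed column:
df1.select($"ename", ($"sal" + 100).alias("sal_plus_100")).show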


scala> df1.filter($"sal">2000)
res7: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [empno: int, ename: string ... 6 more fields]

scala> df1.filter($"sal">2000).show
+-----+-----+---------+----+----------+----+----+------+
|empno|ename|      job| mgr|  hiredate| sal|comm|deptno|
+-----+-----+---------+----+----------+----+----+------+
| 7566|JONES|  MANAGER|7839|  1981/4/2|2975|   0|    20|
| 7698|BLAKE|  MANAGER|7839|  1981/5/1|2850|   0|    30|
| 7782|CLARK|  MANAGER|7839|  1981/6/9|2450|   0|    10|
| 7788|SCOTT|  ANALYST|7566| 1987/4/19|3000|   0|    20|
| 7839| KING|PRESIDENT|7839|1981/11/17|5000|   0|    10|
| 7902| FORD|  ANALYST|7566| 1981/12/3|3000|   0|    20|
+-----+-----+---------+----+----------+----+----+------+
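
filter also accepts a SQL-style condition string, which is handy for compound predicates. A sketch, not from the original session:

// Equivalent filter written as a SQL expression string:
df1.filter("sal > 2000 AND deptno = 20").show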


scala> df1.groupBy("deptno").count.show
+------+-----+
|deptno|count|
+------+-----+
|    20|    5|
|    10|    3|
|    30|    6|
+------+-----+
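
Beyond count, groupBy pairs with agg and the aggregate functions in org.apache.spark.sql.functions. A sketch of per-department salary statistics (assumed, not part of the original session):

import org.apache.spark.sql.functions._
// Average and highest salary per department:
df1.groupBy("deptno").agg(avg("sal"), max("sal")).show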

