scala> // Read emp.csv and split each row; limit -1 keeps trailing empty fields so a missing last column still yields 8 elements
scala> val lines=sc.textFile("/opt/data/emp.csv").map(_.split(",",-1))
lines: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[8] at map at <console>:24
scala> lines.collect
res2: Array[Array[String]] = Array(Array(7369, SMITH, CLERK, 7902, 1980/12/17, 800, 0, 20), Array(7499, ALLEN, SALESMAN, 7698, 1981/2/20, 1600, 300, 30), Array(7521, WARD, SALESMAN, 7698, 1981/2/22, 1250, 500, 30), Array(7566, JONES, MANAGER, 7839, 1981/4/2, 2975, 0, 20), Array(7654, MARTIN, SALESMAN, 7698, 1981/9/28, 1250, 1400, 30), Array(7698, BLAKE, MANAGER, 7839, 1981/5/1, 2850, 0, 30), Array(7782, CLARK, MANAGER, 7839, 1981/6/9, 2450, 0, 10), Array(7788, SCOTT, ANALYST, 7566, 1987/4/19, 3000, 0, 20), Array(7839, KING, PRESIDENT, 7839, 1981/11/17, 5000, 0, 10), Array(7844, TURNER, SALESMAN, 7698, 1981/9/8, 1500, 0, 30), Array(7876, ADAMS, CLERK, 7788, 1987/5/23, 1100, 0, 20), Array(7900, JAMES, CLERK, 7698, 1981/12/3, 950, 0, 30), Array(7902, FORD, ANALYST, 7566, 1981/12/3, 3000, 0...
scala> case class Emp(empno:Int,ename:String,job:String,mgr:Int,hiredate:String,sal:Int,comm:Int,deptno:Int) // one parsed emp.csv row; hiredate kept as the raw yyyy/M/d string, comm is 0 for the no-commission rows in this data
defined class Emp
scala> // Convert each String array to an Emp; split() already returns Strings, so .toString on the text columns is redundant
scala> val allEmp=lines.map(x=>
| Emp(x(0).toInt,x(1),x(2),x(3).toInt,x(4),x(5).toInt,x(6).toInt,x(7).toInt)
| )
allEmp: org.apache.spark.rdd.RDD[Emp] = MapPartitionsRDD[9] at map at <console>:28
scala> val df1=allEmp.toDF // column names/types are taken from Emp's fields (empno: int, ename: string, ...)
df1: org.apache.spark.sql.DataFrame = [empno: int, ename: string ... 6 more fields]
scala> df1.show
+-----+------+---------+----+----------+----+----+------+
|empno| ename| job| mgr| hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH| CLERK|7902|1980/12/17| 800| 0| 20|
| 7499| ALLEN| SALESMAN|7698| 1981/2/20|1600| 300| 30|
| 7521| WARD| SALESMAN|7698| 1981/2/22|1250| 500| 30|
| 7566| JONES| MANAGER|7839| 1981/4/2|2975| 0| 20|
| 7654|MARTIN| SALESMAN|7698| 1981/9/28|1250|1400| 30|
| 7698| BLAKE| MANAGER|7839| 1981/5/1|2850| 0| 30|
| 7782| CLARK| MANAGER|7839| 1981/6/9|2450| 0| 10|
| 7788| SCOTT| ANALYST|7566| 1987/4/19|3000| 0| 20|
| 7839| KING|PRESIDENT|7839|1981/11/17|5000| 0| 10|
| 7844|TURNER| SALESMAN|7698| 1981/9/8|1500| 0| 30|
| 7876| ADAMS| CLERK|7788| 1987/5/23|1100| 0| 20|
| 7900| JAMES| CLERK|7698| 1981/12/3| 950| 0| 30|
| 7902| FORD| ANALYST|7566| 1981/12/3|3000| 0| 20|
| 7934|MILLER| CLERK|7782| 1982/1/23|1300| 0| 10|
+-----+------+---------+----+----------+----+----+------+
scala> df1.select($"ename").show // project just the employee-name column
+------+
| ename|
+------+
| SMITH|
| ALLEN|
| WARD|
| JONES|
|MARTIN|
| BLAKE|
| CLARK|
| SCOTT|
| KING|
|TURNER|
| ADAMS|
| JAMES|
| FORD|
|MILLER|
+------+
scala> df1.select($"ename",$"sal").show // name and salary columns only
+------+----+
| ename| sal|
+------+----+
| SMITH| 800|
| ALLEN|1600|
| WARD|1250|
| JONES|2975|
|MARTIN|1250|
| BLAKE|2850|
| CLARK|2450|
| SCOTT|3000|
| KING|5000|
|TURNER|1500|
| ADAMS|1100|
| JAMES| 950|
| FORD|3000|
|MILLER|1300|
+------+----+
scala> df1.select(df1("ename"),df1("sal")+100).show // derived column keeps the auto-generated name "(sal + 100)"
+------+-----------+
| ename|(sal + 100)|
+------+-----------+
| SMITH| 900|
| ALLEN| 1700|
| WARD| 1350|
| JONES| 3075|
|MARTIN| 1350|
| BLAKE| 2950|
| CLARK| 2550|
| SCOTT| 3100|
| KING| 5100|
|TURNER| 1600|
| ADAMS| 1200|
| JAMES| 1050|
| FORD| 3100|
|MILLER| 1400|
+------+-----------+
scala> df1.where($"sal">2000) // where is an alias of filter; lazy — nothing runs until an action like show
res7: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [empno: int, ename: string ... 6 more fields]
scala> df1.where($"sal">2000).show // employees earning more than 2000
+-----+-----+---------+----+----------+----+----+------+
|empno|ename| job| mgr| hiredate| sal|comm|deptno|
+-----+-----+---------+----+----------+----+----+------+
| 7566|JONES| MANAGER|7839| 1981/4/2|2975| 0| 20|
| 7698|BLAKE| MANAGER|7839| 1981/5/1|2850| 0| 30|
| 7782|CLARK| MANAGER|7839| 1981/6/9|2450| 0| 10|
| 7788|SCOTT| ANALYST|7566| 1987/4/19|3000| 0| 20|
| 7839| KING|PRESIDENT|7839|1981/11/17|5000| 0| 10|
| 7902| FORD| ANALYST|7566| 1981/12/3|3000| 0| 20|
+-----+-----+---------+----+----------+----+----+------+
scala> df1.groupBy($"deptno").count.show // headcount per department
+------+-----+
|deptno|count|
+------+-----+
| 20| 5|
| 10| 3|
| 30| 6|
+------+-----+
scala>