创建DataFrame
通过case class创建DataFrame
# 创建case class
scala> case class emp(empno:Int, ename:String, job:String, mgr:Int, hiredate:String, sal:Int, comm:String, deptno:Int)
defined class emp
# 获取数据
scala> val line = sc.textFile("/opt/datas/scala/emp.csv").map(_.split(","))
line: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[12] at map at <console>:24
# 数据与表进行映射
scala> val empRDD = line.map(x => emp(x(0).toInt, x(1), x(2), x(3).toInt, x(4), x(5).toInt, x(6), x(7).toInt))
empRDD: org.apache.spark.rdd.RDD[emp] = MapPartitionsRDD[13] at map at <console>:28
# 将映射后的RDD算子转换成DataFrame
scala> val empDf = empRDD.toDF
empDf: org.apache.spark.sql.DataFrame = [empno: int, ename: string ... 6 more fields]
# 查看DataFrame
scala> empDf.show
+-----+------+---------+----+----------+----+----+------+
|empno| ename| job| mgr| hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH| CLERK|7902|1980/12/17| 800| 0| 20|
| 7499| ALLEN| SALESMAN|7698| 1981/2/20|1600| 300| 30|
| 7521| WARD| SALESMAN|7698| 1981/2/22|1250| 500| 30|
| 7566| JONES| MANAGER|7839| 1981/4/2|2975| 0| 20|
| 7654|MARTIN| SALESMAN|7698| 1981/9/28|1250|1400| 30|
| 7698| BLAKE| MANAGER|7839| 1981/5/1|2850| 0| 30|
| 7782| CLARK| MANAGER|7839| 1981/6/9|2450| 0| 10|
| 7788| SCOTT| ANALYST|7566| 1987/4/19|3000| 0| 20|
| 7839| KING|PRESIDENT|7839|1981/11/17|5000| 0| 10|
| 7844|TURNER| SALESMAN|7698| 1981/9/8|1500| 0| 30|
| 7876| ADAMS| CLERK|7788| 1987/5/23|1100| 0| 20|
| 7900| JAMES| CLERK|7698| 1981/12/3| 950| 0| 30|
| 7902| FORD| ANALYST|7566| 1981/12/3|3000| 0| 20|
| 7934|MILLER| CLERK|7782| 1982/1/23|1300| 0| 10|
+-----+------+---------+----+----------+----+----+------+
使用Spark Session创建
# 引入数据类型包
scala> import org.apache.spark.sql.types._
import org.apache.spark.sql.types._
# 通过StructType创建Schema
scala> val empSchema = StructType(List(StructField("empno", DataTypes.IntegerType),
| StructField("ename", DataTypes.StringType),
| StructField("job", DataTypes.StringType),
| StructField("mgr", DataTypes.IntegerType),
| StructField("hiredate", DataTypes.StringType),
| StructField("sal", DataTypes.IntegerType),
| StructField("comm", DataTypes.StringType),
| StructField("deptno", DataTypes.IntegerType)))
empSchema: org.apache.spark.sql.types.StructType = StructType(StructField(empno,IntegerType,true), StructField(ename,StringType,true), StructField(job,StringType,true), StructField(mgr,IntegerType,true), StructField(hiredate,StringType,true), StructField(sal,IntegerType,true), StructField(comm,StringType,true), StructField(deptno,IntegerType,true))
# 读取数据按逗号切分数据
scala> val empLineRDD = sc.textFile("/opt/datas/scala/emp.csv").map(_.split(","))
empLineRDD: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[19] at map at <console>:27
# 引入Spark Sql Row
scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row
# 将数据映射成Row
scala> val empRowRDD = empLineRDD.map(line => Row(line(0).toInt, line(1), line(2), line(3).toInt, line(4), line(5).toInt, line(6), line(7).toInt))
empRowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[20] at map at <console>:32
# 通过数据和Schema创建DataFrame
scala> val empDF = spark.createDataFrame(empRowRDD, empSchema)
empDF: org.apache.spark.sql.DataFrame = [empno: int, ename: string ... 6 more fields]
# 查看DataFrame
scala> empDF.show
+-----+------+---------+----+----------+----+----+------+
|empno| ename| job| mgr| hiredate| sal|comm|deptno|
+-----+------+---------+----+----------+----+----+------+
| 7369| SMITH| CLERK|7902|1980/12/17| 800| 0| 20|
| 7499| ALLEN| SALESMAN|7698| 1981/2/20|1600| 300| 30|
| 7521| WARD| SALESMAN|7698| 1981/2/22|1250| 500| 30|
| 7566| JONES| MANAGER|7839| 1981/4/2|2975| 0| 20|
| 7654|MARTIN| SALESMAN|7698| 1981/9/28|1250|1400| 30|
| 7698| BLAKE| MANAGER|7839| 1981/5/1|2850| 0| 30|
| 7782| CLARK| MANAGER|7839| 1981/6/9|2450| 0| 10|
| 7788| SCOTT| ANALYST|7566| 1987/4/19|3000| 0| 20|
| 7839| KING|PRESIDENT|7839|1981/11/17|5000| 0| 10|
| 7844|TURNER| SALESMAN|7698| 1981/9/8|1500| 0| 30|
| 7876| ADAMS| CLERK|7788| 1987/5/23|1100| 0| 20|
| 7900| JAMES| CLERK|7698| 1981/12/3| 950| 0| 30|
| 7902| FORD| ANALYST|7566| 1981/12/3|3000| 0| 20|
| 7934|MILLER| CLERK|7782| 1982/1/23|1300| 0| 10|
+-----+------+---------+----+----------+----+----+------+
通过读取spark支持的数据源创建
-
读取csv文件创建DataFrame
# 读取csv文件创建DataFrame
scala> val readCsvDF = spark.read.csv("/opt/datas/scala/emp.csv")
readCsvDF: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 6 more fields]
# DataFrame查看
scala> readCsvDF.show
+----+------+---------+----+----------+----+----+---+
| _c0|   _c1|      _c2| _c3|       _c4| _c5| _c6|_c7|
+----+------+---------+----+----------+----+----+---+
|7369| SMITH|    CLERK|7902|1980/12/17| 800|   0| 20|
|7499| ALLEN| SALESMAN|7698| 1981/2/20|1600| 300| 30|
|7521|  WARD| SALESMAN|7698| 1981/2/22|1250| 500| 30|
|7566| JONES|  MANAGER|7839|  1981/4/2|2975|   0| 20|
|7654|MARTIN| SALESMAN|7698| 1981/9/28|1250|1400| 30|
|7698| BLAKE|  MANAGER|7839|  1981/5/1|2850|   0| 30|
|7782| CLARK|  MANAGER|7839|  1981/6/9|2450|   0| 10|
|7788| SCOTT|  ANALYST|7566| 1987/4/19|3000|   0| 20|
|7839|  KING|PRESIDENT|7839|1981/11/17|5000|   0| 10|
|7844|TURNER| SALESMAN|7698|  1981/9/8|1500|   0| 30|
|7876| ADAMS|    CLERK|7788| 1987/5/23|1100|   0| 20|
|7900| JAMES|    CLERK|7698| 1981/12/3| 950|   0| 30|
|7902|  FORD|  ANALYST|7566| 1981/12/3|3000|   0| 20|
|7934|MILLER|    CLERK|7782| 1982/1/23|1300|   0| 10|
+----+------+---------+----+----------+----+----+---+
-
读取json文件创建DataFrame
# 读取json文件创建DataFrame
scala> val readJsonDF = spark.read.json("/opt/datas/scala/emp.json")
readJsonDF: org.apache.spark.sql.DataFrame = [comm: string, deptno: bigint ... 6 more fields]
# DataFrame查看
scala> readJsonDF.show
+----+------+-----+------+----------+---------+----+----+
|comm|deptno|empno| ename|  hiredate|      job| mgr| sal|
+----+------+-----+------+----------+---------+----+----+
|    |    20| 7369| SMITH|1980/12/17|    CLERK|7902| 800|
| 300|    30| 7499| ALLEN| 1981/2/20| SALESMAN|7698|1600|
| 500|    30| 7521|  WARD| 1981/2/22| SALESMAN|7698|1250|
|    |    20| 7566| JONES|  1981/4/2|  MANAGER|7839|2975|
|1400|    30| 7654|MARTIN| 1981/9/28| SALESMAN|7698|1250|
|    |    30| 7698| BLAKE|  1981/5/1|  MANAGER|7839|2850|
|    |    10| 7782| CLARK|  1981/6/9|  MANAGER|7839|2450|
|    |    20| 7788| SCOTT| 1987/4/19|  ANALYST|7566|3000|
|    |    10| 7839|  KING|1981/11/17|PRESIDENT|    |5000|
|   0|    30| 7844|TURNER|  1981/9/8| SALESMAN|7698|1500|
|    |    20| 7876| ADAMS| 1987/5/23|    CLERK|7788|1100|
|    |    30| 7900| JAMES| 1981/12/3|    CLERK|7698| 950|
|    |    20| 7902|  FORD| 1981/12/3|  ANALYST|7566|3000|
|    |    10| 7934|MILLER| 1982/1/23|    CLERK|7782|1300|
+----+------+-----+------+----------+---------+----+----+
创建DataSet
通过序列创建
# 自定义case class
scala> case class Person(name:String, age:Int)
defined class Person
# 生成序列并创建DataSet
scala> val personDS = Seq(Person("DAYTOY", 29), Person("kai", 25)).toDS
personDS: org.apache.spark.sql.Dataset[Person] = [name: string, age: int]
# 查看结果集
scala> personDS.show
+------+---+
| name|age|
+------+---+
|DAYTOY| 29|
| kai| 25|
+------+---+
通过JSON数据创建
# 创建case class
scala> case class Person(name:String, age:Long)
defined class Person
# 通过json数据创建DataFrame
scala> val jsonDF = spark.read.json(sc.parallelize("""{"name":"DAYTOY", "age":29}""" :: Nil))
jsonDF: org.apache.spark.sql.DataFrame = [age: bigint, name: string]
# 将DataFrame转换成DataSet
scala> val jsonDS = jsonDF.as[Person]
jsonDS: org.apache.spark.sql.Dataset[Person] = [age: bigint, name: string]
#查看结果
scala> jsonDS.show
+---+------+
|age| name|
+---+------+
| 29|DAYTOY|
+---+------+
通过读取文件创建
# 通过读取文件创建DataSet
scala> val empDS = spark.read.text("/opt/datas/emp.txt").as[String]
empDS: org.apache.spark.sql.Dataset[String] = [value: string]
# 查看结果,只展示部分字段
scala> empDS.show
+--------------------+
| value|
+--------------------+
|7369 SMITH CLERK ...|
|7499 ALLEN SALESM...|
|7521 WARD SALESMA...|
|7566 JONES MANAGE...|
|7654 MARTIN SALES...|
|7698 BLAKE MANAGE...|
|7782 CLARK MANAGE...|
|7788 SCOTT ANALYS...|
|7839 KING PRESIDE...|
|7844 TURNER SALES...|
|7876 ADAMS CLERK ...|
|7900 JAMES CLERK ...|
|7902 FORD ANALYST...|
|7934 MILLER CLERK...|
+--------------------+
# 查看结果,展示所有字段
scala> empDS.show(false)
+------------------------------------------------------+
|value |
+------------------------------------------------------+
|7369 SMITH CLERK 7902 1980-12-17 800.00 20 |
|7499 ALLEN SALESMAN 7698 1981-2-20 1600.00 300.00 30 |
|7521 WARD SALESMAN 7698 1981-2-22 1250.00 500.00 30 |
|7566 JONES MANAGER 7839 1981-4-2 2975.00 20 |
|7654 MARTIN SALESMAN 7698 1981-9-28 1250.00 1400.00 30|
|7698 BLAKE MANAGER 7839 1981-5-1 2850.00 30 |
|7782 CLARK MANAGER 7839 1981-6-9 2450.00 10 |
|7788 SCOTT ANALYST 7566 1987-4-19 3000.00 20 |
|7839 KING PRESIDENT 1981-11-17 5000.00 10 |
|7844 TURNER SALESMAN 7698 1981-9-8 1500.00 0.00 30 |
|7876 ADAMS CLERK 7788 1987-5-23 1100.00 20 |
|7900 JAMES CLERK 7698 1981-12-3 950.00 30 |
|7902 FORD ANALYST 7566 1981-12-3 3000.00 20 |
|7934 MILLER CLERK 7782 1982-1-23 1300.00 10 |
+------------------------------------------------------+
使用DataFrame创建视图进行SQL操作
# 通过读取json文件创建DataFrame
scala> val readJsonEmpDF = spark.read.json("/opt/datas/scala/emp.json")
readJsonEmpDF: org.apache.spark.sql.DataFrame = [comm: string, deptno: bigint ... 6 more fields]
# 通过DataFrame创建视图
scala> readJsonEmpDF.createOrReplaceTempView("emp")
# 查询emp表的数据
scala> spark.sql("select ename,empno,job,sal from emp").show
+------+-----+---------+----+
| ename|empno| job| sal|
+------+-----+---------+----+
| SMITH| 7369| CLERK| 800|
| ALLEN| 7499| SALESMAN|1600|
| WARD| 7521| SALESMAN|1250|
| JONES| 7566| MANAGER|2975|
|MARTIN| 7654| SALESMAN|1250|
| BLAKE| 7698| MANAGER|2850|
| CLARK| 7782| MANAGER|2450|
| SCOTT| 7788| ANALYST|3000|
| KING| 7839|PRESIDENT|5000|
|TURNER| 7844| SALESMAN|1500|
| ADAMS| 7876| CLERK|1100|
| JAMES| 7900| CLERK| 950|
| FORD| 7902| ANALYST|3000|
|MILLER| 7934| CLERK|1300|
+------+-----+---------+----+
# 创建部门case class
scala> case class dept(deptno:Int, deptname:String,address:String)
defined class dept
# 读取dept数据
scala> val deptLine = sc.textFile("/opt/datas/scala/dept.csv").map(_.split(","))
deptLine: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[68] at map at <console>:28
# 将数据和case class映射
scala> val deptRDD = deptLine.map(line => dept(line(0).toInt, line(1), line(2)))
deptRDD: org.apache.spark.rdd.RDD[dept] = MapPartitionsRDD[70] at map at <console>:34
# 创建DataFrame
scala> val deptDF = deptRDD.toDF
deptDF: org.apache.spark.sql.DataFrame = [deptno: int, deptname: string ... 1 more field]
# 创建dept视图
scala> deptDF.createOrReplaceTempView("dept")
# 查询员工所在的部门名称
scala> spark.sql("select dept.deptname,emp.ename from dept join emp on dept.deptno = emp.deptno").show
+----------+------+
| deptname| ename|
+----------+------+
|ACCOUNTING|MILLER|
|ACCOUNTING| KING|
|ACCOUNTING| CLARK|
| RESEARCH| FORD|
| RESEARCH| ADAMS|
| RESEARCH| SCOTT|
| RESEARCH| JONES|
| RESEARCH| SMITH|
| SALES| JAMES|
| SALES|TURNER|
| SALES| BLAKE|
| SALES|MARTIN|
| SALES| WARD|
| SALES| ALLEN|
+----------+------+