package com.kk.sparksql
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.Row
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import org.apache.spark.sql.types._
object Emps {

  /**
   * Demonstrates the two ways to query a DataFrame in Spark SQL:
   *  (1) the DSL (select / filter / groupBy), and
   *  (2) plain SQL against a DataFrame registered as a temporary view.
   *
   * Expected console output for each query is preserved below in block
   * comments (it was pasted as raw text in the original, which broke
   * compilation).
   */
  def main(args: Array[String]): Unit = {
    // Create a SparkSession (local mode, for demonstration only).
    val spark = SparkSession.builder().master("local").appName("sql").getOrCreate()
    import spark.implicits._

    // Load emp.csv; each line is split on "," into an Array[String].
    val lines = spark.sparkContext.textFile("G:/emp.csv").map(_.split(","))

    // Schema for the employee table. mgr, hiredate and comm are kept as
    // strings because they may be empty in the source file.
    val myschema = StructType(List(
      StructField("empno", DataTypes.IntegerType),
      StructField("ename", DataTypes.StringType),
      StructField("job", DataTypes.StringType),
      StructField("mgr", DataTypes.StringType),
      StructField("hiredate", DataTypes.StringType),
      StructField("sal", DataTypes.IntegerType),
      StructField("comm", DataTypes.StringType),
      StructField("deptno", DataTypes.IntegerType)))

    // Map each parsed line into a Row matching the schema above.
    val rowRDD = lines.map(x =>
      Row(x(0).toInt, x(1), x(2), x(3), x(4), x(5).toInt, x(6), x(7).toInt))
    val df = spark.createDataFrame(rowRDD, myschema)

    // Querying the DataFrame supports two styles:
    // ---------- (1) DSL statements ----------
    // Note: show() returns Unit and prints the table itself, so the original
    // println(df.select(...).show) would additionally print "()".

    // All employee names:
    df.select("ename").show()
    /* Output:
       +------+
       | ename|
       +------+
       | SMITH|
       | ALLEN|
       |  WARD|
       | JONES|
       |MARTIN|
       | BLAKE|
       | CLARK|
       | SCOTT|
       |  KING|
       |TURNER|
       | ADAMS|
       | JAMES|
       |  FORD|
       |MILLER|
       +------+
    */

    // Names and salaries, with 100 added to each salary:
    df.select($"ename", $"sal", $"sal" + 100).show()
    /* Output:
       +------+----+-----------+
       | ename| sal|(sal + 100)|
       +------+----+-----------+
       | SMITH| 800|        900|
       | ALLEN|1600|       1700|
       |  WARD|1250|       1350|
       | JONES|2975|       3075|
       |MARTIN|1250|       1350|
       | BLAKE|2850|       2950|
       | CLARK|2450|       2550|
       | SCOTT|3000|       3100|
       |  KING|5000|       5100|
       |TURNER|1500|       1600|
       | ADAMS|1100|       1200|
       | JAMES| 950|       1050|
       |  FORD|3000|       3100|
       |MILLER|1300|       1400|
       +------+----+-----------+
    */

    // Employees earning more than 2000:
    df.filter($"sal" > 2000).show()
    /* Output:
       +-----+-----+---------+----+----------+----+----+------+
       |empno|ename|      job| mgr|  hiredate| sal|comm|deptno|
       +-----+-----+---------+----+----------+----+----+------+
       | 7566|JONES|  MANAGER|7839|  1981/4/2|2975|    |    20|
       | 7698|BLAKE|  MANAGER|7839|  1981/5/1|2850|    |    30|
       | 7782|CLARK|  MANAGER|7839|  1981/6/9|2450|    |    10|
       | 7788|SCOTT|  ANALYST|7566| 1987/4/19|3000|    |    20|
       | 7839| KING|PRESIDENT|    |1981/11/17|5000|    |    10|
       | 7902| FORD|  ANALYST|7566| 1981/12/3|3000|    |    20|
       +-----+-----+---------+----+----------+----+----+------+
    */

    // Group by department and count:
    df.groupBy($"deptno").count.show()
    /* Output:
       |deptno|count|
       +------+-----+
       |    20|    5|
       |    10|    3|
       |    30|    6|
       +------+-----+
    */

    // ---------- (2) SQL statements ----------
    // Precondition: register the DataFrame as a table / temporary view.
    df.createTempView("emp")

    spark.sql("select ename from emp").show()
    /* Output:
       +------+
       | SMITH|
       | ALLEN|
       |  WARD|
       | JONES|
       |MARTIN|
       | BLAKE|
       | CLARK|
       | SCOTT|
       |  KING|
       |TURNER|
       | ADAMS|
       | JAMES|
       |  FORD|
       |MILLER|
       +------+
    */

    spark.sql("select ename,sal,sal + 100 from emp").show()
    /* Output:
       +------+----+-----------+
       | ename| sal|(sal + 100)|
       +------+----+-----------+
       | SMITH| 800|        900|
       | ALLEN|1600|       1700|
       |  WARD|1250|       1350|
       | JONES|2975|       3075|
       |MARTIN|1250|       1350|
       | BLAKE|2850|       2950|
       | CLARK|2450|       2550|
       | SCOTT|3000|       3100|
       |  KING|5000|       5100|
       |TURNER|1500|       1600|
       | ADAMS|1100|       1200|
       | JAMES| 950|       1050|
       |  FORD|3000|       3100|
       |MILLER|1300|       1400|
       +------+----+-----------+
    */

    spark.sql("select deptno,sum(sal) from emp group by deptno").show()
    /* Output:
       +------+--------+
       |deptno|sum(sal)|
       +------+--------+
       |    20|   10875|
       |    10|    8750|
       |    30|    9400|
       +------+--------+
    */

    // Release cluster resources.
    spark.stop()
  }
}
// SparkSQL操作表的两种方式 (Two ways to operate on tables with Spark SQL)
// 最新推荐文章于 2022-10-17 11:11:43 发布 (blog metadata; not part of the program)