说明:SQLContext 自 Spark 2.x 起已被标记为过时(deprecated,仍可使用但不再推荐),建议改用 SparkSession。
数据准备
person.json
{"name":"路飞","age":17,"deptno":1,"money":15}
{"name":"索隆","age":18,"deptno":1,"money":9}
{"name":"乔巴","age":5,"deptno":1,"money":5}
{"name":"艾斯","age":18,"deptno":2,"money":18}
{"name":"萨博","age":18,"deptno":2,"money":16}
{"name":"香克斯","age":32,"deptno":3,"money":30}
dept.json
{"name":"草帽","deptno":1}
{"name":"白胡子","deptno":2}
{"name":"红发","deptno":3}
查看 Json格式数据信息
@Test
def readJson(): Unit = {
  // Spin up a local SparkContext and wrap it in the (legacy) SQLContext.
  val conf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
  val spark = new SparkContext(conf)
  val ctx = new SQLContext(spark)
  // Load the JSON file as a DataFrame.
  val personDF = ctx.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
  // Inspect the inferred schema.
  personDF.printSchema()
  // Print the data (show() defaults to the first 20 rows).
  personDF.show()
  // Print each column name on its own line.
  for (column <- personDF.columns) println(column)
  spark.stop()
}
运行结果为
schema 信息
Json 数据以及 列信息
操作数据
@Test
def handleJson(): Unit = {
  // 1) Create the SQLContext (legacy API; deprecated since Spark 2.x in favor of SparkSession).
  val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
  val sc = new SparkContext(sparkConf)
  val sqlContext = new SQLContext(sc)
  // 2) Load the JSON file as a DataFrame.
  val person = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
  // Rows whose age EQUALS 18. (The original comment claimed "greater than 18",
  // but the predicate is an equality check — the sample output confirms it.)
  person.filter("age = 18").show()
  // Rows with age greater than 20.
  person.where("age > 20").show()
  person.where("age > 20 and deptno = 3").show()
  // Sort descending by age; the $"col" syntax requires the implicits import.
  import sqlContext.implicits._
  person.sort($"age".desc).show()
  // Add a derived column computed from an existing one.
  person.withColumn("num", person("money")*10).show()
  // Rename a column.
  person.withColumnRenamed("age", "AGE").show()
  sc.stop()
}
运行结果为:
// 年龄等于 18(filter 中使用的是等值条件 age = 18)
person.filter("age = 18").show()
+---+------+-----+----+
|age|deptno|money|name|
+---+------+-----+----+
| 18| 1| 9|索隆|
| 18| 2| 18|艾斯|
| 18| 2| 16|萨博|
+---+------+-----+----+
// 年龄大于 20
person.where("age > 20").show()
+---+------+-----+------+
|age|deptno|money| name|
+---+------+-----+------+
| 32| 3| 30|香克斯|
+---+------+-----+------+
person.where("age > 20 and deptno = 3").show()
+---+------+-----+------+
|age|deptno|money| name|
+---+------+-----+------+
| 32| 3| 30|香克斯|
+---+------+-----+------+
//排序(导入隐式转换,使用 $语法,增强 sort 函数)
// person.sort("age").show() 默认是升序的
import sqlContext.implicits._
person.sort($"age".desc).show()
+---+------+-----+------+
|age|deptno|money| name|
+---+------+-----+------+
| 32| 3| 30|香克斯|
| 18| 1| 9| 索隆|
| 18| 2| 16| 萨博|
| 18| 2| 18| 艾斯|
| 17| 1| 15| 路飞|
| 5| 1| 5| 乔巴|
+---+------+-----+------+
//添加一个列
person.withColumn("num", person("money")*10).show()
+---+------+-----+------+---+
|age|deptno|money| name|num|
+---+------+-----+------+---+
| 17| 1| 15| 路飞|150|
| 18| 1| 9| 索隆| 90|
| 5| 1| 5| 乔巴| 50|
| 18| 2| 18| 艾斯|180|
| 18| 2| 16| 萨博|160|
| 32| 3| 30|香克斯|300|
+---+------+-----+------+---+
//修改列名
person.withColumnRenamed("age", "AGE").show()
+---+------+-----+------+
|AGE|deptno|money| name|
+---+------+-----+------+
| 17| 1| 15| 路飞|
| 18| 1| 9| 索隆|
| 5| 1| 5| 乔巴|
| 18| 2| 18| 艾斯|
| 18| 2| 16| 萨博|
| 32| 3| 30|香克斯|
+---+------+-----+------+
聚合操作
@Test
def handleJson2(): Unit = {
  // 1) Create the SQLContext (legacy API; deprecated since Spark 2.x in favor of SparkSession).
  val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
  val sc = new SparkContext(sparkConf)
  val sqlContext = new SQLContext(sc)
  // 2) Load the same JSON file twice so we can demonstrate a union.
  val person = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
  val person2 = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
  // Union of the two DataFrames. NOTE(review): DataFrame.union keeps duplicates
  // (SQL UNION ALL semantics), which is what the sample output shows.
  person.union(person2).show()
  // Count of people per age.
  person.groupBy("age").count.show
  // Per-department aggregation: max age and average money.
  person.groupBy("deptno").agg(Map("age" -> "max","money" ->"avg")).show()
  // Distinct ages.
  person.select("age").distinct.show
  // Outer join against the department table on deptno.
  val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")
  person.join(dept, person("deptno")===dept("deptno"), "outer").show()
  // Join followed by a grouped aggregation.
  val joinJson = person.join(dept, person("deptno")===dept("deptno"), "outer")
  .groupBy(person("deptno")).agg(Map("age" -> "max","money" ->"avg"))
  joinJson.show()
  // Save the join result as JSON.
  // Collect the column names of both tables so we can prefix them: both tables
  // have "name"/"deptno" columns, and duplicated column names cannot be written out.
  val c_person = person.columns
  val c_dept = dept.columns
  val person_tmp = person.select(c_person.map(n => person(n).as("person_" + n)): _*)
  val dept_tmp = dept.select(c_dept.map(n => dept(n).as("dept_" + n)): _*)
  // The data is tiny, so repartition(1) makes Spark write a single output file.
  person_tmp.join(dept_tmp, person_tmp("person_deptno") === dept_tmp("dept_deptno"), "outer").repartition(1)
  .write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")
  // BUGFIX: the original never stopped the SparkContext, leaking it;
  // every sibling test method calls sc.stop().
  sc.stop()
}
结果如下:
// union 两张表
person.union(person2).show()
+---+------+-----+------+
|age|deptno|money| name|
+---+------+-----+------+
| 17| 1| 15| 路飞|
| 18| 1| 9| 索隆|
| 5| 1| 5| 乔巴|
| 18| 2| 18| 艾斯|
| 18| 2| 16| 萨博|
| 32| 3| 30|香克斯|
| 17| 1| 15| 路飞|
| 18| 1| 9| 索隆|
| 5| 1| 5| 乔巴|
| 18| 2| 18| 艾斯|
| 18| 2| 16| 萨博|
| 32| 3| 30|香克斯|
+---+------+-----+------+
//查看同年龄的人数
person.groupBy("age").count.show
+---+-----+
|age|count|
+---+-----+
| 32| 1|
| 17| 1|
| 5| 1|
| 18| 3|
+---+-----+
//分组统计
person.groupBy("deptno").agg(Map("age" -> "max","money" ->"avg")).show()
+------+--------+-----------------+
|deptno|max(age)| avg(money)|
+------+--------+-----------------+
| 1| 18|9.666666666666666|
| 3| 32| 30.0|
| 2| 18| 17.0|
+------+--------+-----------------+
//去重
person.select("age").distinct.show
+---+
|age|
+---+
| 32|
| 17|
| 5|
| 18|
+---+
//join
val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")
person.join(dept, person("deptno")===dept("deptno"), "outer").show()
+---+------+-----+------+------+------+
|age|deptno|money| name|deptno| name|
+---+------+-----+------+------+------+
| 17| 1| 15| 路飞| 1| 草帽|
| 18| 1| 9| 索隆| 1| 草帽|
| 5| 1| 5| 乔巴| 1| 草帽|
| 32| 3| 30|香克斯| 3| 红发|
| 18| 2| 18| 艾斯| 2|白胡子|
| 18| 2| 16| 萨博| 2|白胡子|
+---+------+-----+------+------+------+
//join 分组
val joinJson = person.join(dept, person("deptno")===dept("deptno"), "outer")
.groupBy(person("deptno")).agg(Map("age" -> "max","money" ->"avg"))
joinJson.show()
+------+--------+-----------------+
|deptno|max(age)| avg(money)|
+------+--------+-----------------+
| 1| 18|9.666666666666666|
| 3| 32| 30.0|
| 2| 18| 17.0|
+------+--------+-----------------+
//另存为 Json
//分别拿出两张表的列名
val c_person = person.columns
val c_dept = dept.columns
//分别对两张表的别名进行设置(两张表中列名一样,无法保存,必须设置列名不一样)
val person_tmp = person.select(c_person.map(n => person(n).as("person_" + n)): _*)
val dept_tmp = dept.select(c_dept.map(n => dept(n).as("dept_" + n)): _*)
// 由于文件比较小,使用 repartition(1) 使之只生成一个输出文件
person_tmp.join(dept_tmp, person_tmp("person_deptno") === dept_tmp("dept_deptno"), "outer").repartition(1)
.write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")
输出的文件如下:
{"person_age":17,"person_deptno":1,"person_money":15,"person_name":"路飞","dept_deptno":1,"dept_name":"草帽"}
{"person_age":18,"person_deptno":1,"person_money":9,"person_name":"索隆","dept_deptno":1,"dept_name":"草帽"}
{"person_age":5,"person_deptno":1,"person_money":5,"person_name":"乔巴","dept_deptno":1,"dept_name":"草帽"}
{"person_age":32,"person_deptno":3,"person_money":30,"person_name":"香克斯","dept_deptno":3,"dept_name":"红发"}
{"person_age":18,"person_deptno":2,"person_money":18,"person_name":"艾斯","dept_deptno":2,"dept_name":"白胡子"}
{"person_age":18,"person_deptno":2,"person_money":16,"person_name":"萨博","dept_deptno":2,"dept_name":"白胡子"}
@Test
def handleJson3 = {
  // Local Spark setup using the legacy SQLContext API.
  val conf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
  val context = new SparkContext(conf)
  val sql = new SQLContext(context)
  // Load the person data and register it as the temporary table "p".
  val personDF = sql.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
  personDF.registerTempTable("p")
  // Catalog inspection: list databases and registered table names, then dump the table.
  sql.sql("show databases").show()
  sql.tableNames().foreach(println)
  sql.sql("select * from p").show()
  // Cache the table before materializing the query below, so the show()
  // runs against the cached data.
  val names = sql.sql("select name from p")
  sql.cacheTable("p")
  names.show
  // Release the cached table.
  sql.uncacheTable("p")
  context.stop()
}
结果如下:
person.registerTempTable("p")
sqlContext.sql("show databases").show()
+------------+
|databaseName|
+------------+
| default|
+------------+
sqlContext.tableNames().foreach(println)
p
sqlContext.sql("select * from p").show()
+---+------+-----+------+
| 17| 1| 15| 路飞|
| 18| 1| 9| 索隆|
| 5| 1| 5| 乔巴|
| 18| 2| 18| 艾斯|
| 18| 2| 16| 萨博|
| 32| 3| 30|香克斯|
+---+------+-----+------+
//缓存表
val a2 = sqlContext.sql("select name from p")
sqlContext.cacheTable("p")
a2.show
+------+
| name|
+------+
| 路飞|
| 索隆|
| 乔巴|
| 艾斯|
| 萨博|
|香克斯|
+------+
//释放缓存
sqlContext.uncacheTable("p")