Spark SQL | SQLContext 用法

21 篇文章 0 订阅
21 篇文章 0 订阅

 

声明:SQLContext 已在 2.x 被废除,建议使用 SparkSession

数据准备

person.json

{"name":"路飞","age":17,"deptno":1,"money":15}
{"name":"索隆","age":18,"deptno":1,"money":9}
{"name":"乔巴","age":5,"deptno":1,"money":5}
{"name":"艾斯","age":18,"deptno":2,"money":18}
{"name":"萨博","age":18,"deptno":2,"money":16}
{"name":"香克斯","age":32,"deptno":3,"money":30}

dept.json

{"name":"草帽","deptno":1}
{"name":"白胡子","deptno":2}
{"name":"红发","deptno":3}

查看 Json格式数据信息

@Test
  def readJson(): Unit = {
      //1) 创建 sqlContext
    val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // 1)相关处理
    val person =  sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    //打印 Schema 约束
    person.printSchema()
    //打印数据,默认 20条
    person.show()
    //查看列信息
    person.columns.foreach(println)
    sc.stop()
  }

运行结果为

schema 信息

Json 数据以及 列信息

 

操作数据

@Test
  def handleJson ={
    //1) 创建 sqlContext
    val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // 1)相关处理
    val person =  sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    // 年龄大于 18
    person.filter("age = 18").show()
    // 年龄大于 20
    person.where("age > 20").show()

    person.where("age > 20 and deptno = 3").show()
    //排序
    import sqlContext.implicits._
    person.sort($"age".desc).show()

    //添加一个列
    person.withColumn("num", person("money")*10).show()

    //修改列名
    person.withColumnRenamed("age", "AGE").show()
    sc.stop()
  }

运行结果为:

    // 年龄大于 18
    person.filter("age = 18").show()

    +---+------+-----+----+
|age|deptno|money|name|
+---+------+-----+----+
| 18|     1|    9|索隆|
| 18|     2|   18|艾斯|
| 18|     2|   16|萨博|
+---+------+-----+----+

    // 年龄大于 20
    person.where("age > 20").show()

+---+------+-----+------+
|age|deptno|money|  name|
+---+------+-----+------+
| 32|     3|   30|香克斯|
+---+------+-----+------+

    person.where("age > 20 and deptno = 3").show()

+---+------+-----+------+
|age|deptno|money|  name|
+---+------+-----+------+
| 32|     3|   30|香克斯|
+---+------+-----+------+

    //排序(导入隐式转换,使用 $语法,增强 sort 函数)
    // person.sort("age").show() 默认是升序的
    import sqlContext.implicits._
    person.sort($"age".desc).show()

+---+------+-----+------+
|age|deptno|money|  name|
+---+------+-----+------+
| 32|     3|   30|香克斯|
| 18|     1|    9|  索隆|
| 18|     2|   16|  萨博|
| 18|     2|   18|  艾斯|
| 17|     1|   15|  路飞|
|  5|     1|    5|  乔巴|
+---+------+-----+------+

    //添加一个列
    person.withColumn("num", person("money")*10).show()

+---+------+-----+------+---+
|age|deptno|money|  name|num|
+---+------+-----+------+---+
| 17|     1|   15|  路飞|150|
| 18|     1|    9|  索隆| 90|
|  5|     1|    5|  乔巴| 50|
| 18|     2|   18|  艾斯|180|
| 18|     2|   16|  萨博|160|
| 32|     3|   30|香克斯|300|
+---+------+-----+------+---+

    //修改列名
    person.withColumnRenamed("age", "AGE").show()

+---+------+-----+------+
|AGE|deptno|money|  name|
+---+------+-----+------+
| 17|     1|   15|  路飞|
| 18|     1|    9|  索隆|
|  5|     1|    5|  乔巴|
| 18|     2|   18|  艾斯|
| 18|     2|   16|  萨博|
| 32|     3|   30|香克斯|
+---+------+-----+------+

 

聚合操作

@Test
  def handleJson2 ={
    //1) 创建 sqlContext
    val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // 1)相关处理
    val person =  sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    val person2 =  sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")

    // union 两张表
    person.union(person2).show()

    //查看同年龄的人数
    person.groupBy("age").count.show

    //分组统计
    person.groupBy("deptno").agg(Map("age" -> "max","money" ->"avg")).show()

    //去重
    person.select("age").distinct.show

    //join
    val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")
    person.join(dept, person("deptno")===dept("deptno"), "outer").show()

    //join 分组
     val joinJson = person.join(dept, person("deptno")===dept("deptno"), "outer")
      .groupBy(person("deptno")).agg(Map("age" -> "max","money" ->"avg"))
    joinJson.show()


    //另存为 Json

    //分别拿出两张表的列名
    val c_person = person.columns
    val c_dept = dept.columns
    //分别对两张表的别名进行设置(两张表中列名一样,无法保存,必须设置列名不一样)
    val person_tmp = person.select(c_person.map(n => person(n).as("person_" + n)): _*)
    val dept_tmp = dept.select(c_dept.map(n => dept(n).as("dept_" + n)): _*)

    // 由于文件比较小,使用 repartition 设置之生成一个 文件
    person_tmp.join(dept_tmp, person_tmp("person_deptno") === dept_tmp("dept_deptno"), "outer").repartition(1)
      .write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")
  }

结果如下:

 // union 两张表
    person.union(person2).show()

+---+------+-----+------+
|age|deptno|money|  name|
+---+------+-----+------+
| 17|     1|   15|  路飞|
| 18|     1|    9|  索隆|
|  5|     1|    5|  乔巴|
| 18|     2|   18|  艾斯|
| 18|     2|   16|  萨博|
| 32|     3|   30|香克斯|
| 17|     1|   15|  路飞|
| 18|     1|    9|  索隆|
|  5|     1|    5|  乔巴|
| 18|     2|   18|  艾斯|
| 18|     2|   16|  萨博|
| 32|     3|   30|香克斯|
+---+------+-----+------+

    //查看同年龄的人数
    person.groupBy("age").count.show
+---+-----+
|age|count|
+---+-----+
| 32|    1|
| 17|    1|
|  5|    1|
| 18|    3|
+---+-----+

    //分组统计
    person.groupBy("deptno").agg(Map("age" -> "max","money" ->"avg")).show()
+------+--------+-----------------+
|deptno|max(age)|       avg(money)|
+------+--------+-----------------+
|     1|      18|9.666666666666666|
|     3|      32|             30.0|
|     2|      18|             17.0|
+------+--------+-----------------+

    //去重
    person.select("age").distinct.show
+---+
|age|
+---+
| 32|
| 17|
|  5|
| 18|
+---+


    //join
    val dept = sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\dept.json")
    person.join(dept, person("deptno")===dept("deptno"), "outer").show()
+---+------+-----+------+------+------+
|age|deptno|money|  name|deptno|  name|
+---+------+-----+------+------+------+
| 17|     1|   15|  路飞|     1|  草帽|
| 18|     1|    9|  索隆|     1|  草帽|
|  5|     1|    5|  乔巴|     1|  草帽|
| 32|     3|   30|香克斯|     3|  红发|
| 18|     2|   18|  艾斯|     2|白胡子|
| 18|     2|   16|  萨博|     2|白胡子|
+---+------+-----+------+------+------+

    //join 分组
     val joinJson = person.join(dept, person("deptno")===dept("deptno"), "outer")
      .groupBy(person("deptno")).agg(Map("age" -> "max","money" ->"avg"))
    joinJson.show()
+------+--------+-----------------+
|deptno|max(age)|       avg(money)|
+------+--------+-----------------+
|     1|      18|9.666666666666666|
|     3|      32|             30.0|
|     2|      18|             17.0|
+------+--------+-----------------+

    //另存为 Json
    //分别拿出两张表的列名
    val c_person = person.columns
    val c_dept = dept.columns
    //分别对两张表的别名进行设置(两张表中列名一样,无法保存,必须设置列名不一样)
    val person_tmp = person.select(c_person.map(n => person(n).as("person_" + n)): _*)
    val dept_tmp = dept.select(c_dept.map(n => dept(n).as("dept_" + n)): _*)

    // 由于文件比较小,使用 repartition 设置之生成一个 文件
    person_tmp.join(dept_tmp, person_tmp("person_deptno") === dept_tmp("dept_deptno"), "outer").repartition(1)
      .write.format("json").save("C:\\Users\\LUFFY\\Desktop\\testData\\joinJson")

输出的文件如下:

{"person_age":17,"person_deptno":1,"person_money":15,"person_name":"路飞","dept_deptno":1,"dept_name":"草帽"}
{"person_age":18,"person_deptno":1,"person_money":9,"person_name":"索隆","dept_deptno":1,"dept_name":"草帽"}
{"person_age":5,"person_deptno":1,"person_money":5,"person_name":"乔巴","dept_deptno":1,"dept_name":"草帽"}
{"person_age":32,"person_deptno":3,"person_money":30,"person_name":"香克斯","dept_deptno":3,"dept_name":"红发"}
{"person_age":18,"person_deptno":2,"person_money":18,"person_name":"艾斯","dept_deptno":2,"dept_name":"白胡子"}
{"person_age":18,"person_deptno":2,"person_money":16,"person_name":"萨博","dept_deptno":2,"dept_name":"白胡子"}

 

@Test
  def handleJson3 ={
    //1) 创建 sqlContext
    val sparkConf = new SparkConf().setAppName("SQLContext").setMaster("local[*]")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // 1)相关处理
    val person =  sqlContext.read.format("json").load("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    person.registerTempTable("p")
    sqlContext.sql("show databases").show()

    sqlContext.tableNames().foreach(println)
    sqlContext.sql("select * from p").show()

    //缓存表
    val a2 = sqlContext.sql("select name from p")
    sqlContext.cacheTable("p")
    a2.show
    //释放缓存
    sqlContext.uncacheTable("p")
    sc.stop()
  }

结果如下:

    person.registerTempTable("p")
    sqlContext.sql("show databases").show()
+------------+
|databaseName|
+------------+
|     default|
+------------+

    sqlContext.tableNames().foreach(println)
p

    sqlContext.sql("select * from p").show()
+---+------+-----+------+
| 17|     1|   15|  路飞|
| 18|     1|    9|  索隆|
|  5|     1|    5|  乔巴|
| 18|     2|   18|  艾斯|
| 18|     2|   16|  萨博|
| 32|     3|   30|香克斯|
+---+------+-----+------+

    //缓存表
    val a2 = sqlContext.sql("select name from p")
    sqlContext.cacheTable("p")
    a2.show
+------+
|  name|
+------+
|  路飞|
|  索隆|
|  乔巴|
|  艾斯|
|  萨博|
|香克斯|
+------+

    //释放缓存
    sqlContext.uncacheTable("p")

 

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值