DataFrame API 操作

21 篇文章 0 订阅

 

person.json

{"name":"路飞","age":17,"deptno":1,"money":15}
{"name":"索隆","age":18,"deptno":1,"money":9}
{"name":"乔巴","age":5,"deptno":1,"money":5}
{"name":"艾斯","age":18,"deptno":2,"money":18}
{"name":"萨博","age":18,"deptno":2,"money":16}
{"name":"香克斯","age":32,"deptno":3,"money":30}

基本操作

测试代码 一

class SparkSessionDemo {

  /**
   * Basic DataFrame operations: print the inferred schema, show rows,
   * and demonstrate three equivalent ways to select columns.
   */
  @Test
  def testDF1(): Unit = {
    val spark = SparkSession.builder().appName("SparkSessionDemo").master("local[2]").getOrCreate()
    val personDF = spark.read.json("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    // Print the schema Spark inferred from the JSON input.
    personDF.printSchema()
    // Display the first rows (show() prints up to 20 rows by default).
    personDF.show()
    // Select a column — three equivalent notations.
    personDF.select("name").show()
    personDF.select(personDF.col("name"), personDF.col("age") + 10).show()
    personDF.select(personDF("name"), personDF("age") + 10).show()

    spark.stop()
  }
}

运行结果为:

//打印 schema 结构
personDF.printSchema()
root
 |-- age: long (nullable = true)
 |-- deptno: long (nullable = true)
 |-- money: long (nullable = true)
 |-- name: string (nullable = true)

// 输出前 20 条数据(show 方法默认返回 20条)
personDF.show()
+---+------+-----+------+
|age|deptno|money|  name|
+---+------+-----+------+
| 17|     1|   15|  路飞|
| 18|     1|    9|  索隆|
|  5|     1|    5|  乔巴|
| 18|     2|   18|  艾斯|
| 18|     2|   16|  萨博|
| 32|     3|   30|香克斯|
+---+------+-----+------+

//查询某列所有数据(三种写法)
personDF.select("name").show()
+------+
|  name|
+------+
|  路飞|
|  索隆|
|  乔巴|
|  艾斯|
|  萨博|
|香克斯|
+------+

personDF.select(personDF.col("name"), personDF.col("age") + 10).show()
+------+----------+
|  name|(age + 10)|
+------+----------+
|  路飞|        27|
|  索隆|        28|
|  乔巴|        15|
|  艾斯|        28|
|  萨博|        28|
|香克斯|        42|
+------+----------+

personDF.select(personDF("name"), personDF("age") + 10).show()
+------+----------+
|  name|(age + 10)|
+------+----------+
|  路飞|        27|
|  索隆|        28|
|  乔巴|        15|
|  艾斯|        28|
|  萨博|        28|
|香克斯|        42|
+------+----------+

测试代码 二

/**
 * DataFrame aliasing, filtering and group-by aggregation.
 * Fix: the original never stopped the SparkSession (resource leak);
 * it is now released at the end, matching testDF1.
 */
@Test
  def testDF2(): Unit = {
    val spark = SparkSession.builder().appName("SparkSessionDemo").master("local[2]").getOrCreate()
    val personDF = spark.read.json("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    // Column alias.
    personDF.select(personDF("name"), (personDF("age") + 10).as("AGE")).show()

    // Filter rows by a column predicate.
    personDF.filter(personDF.col("age") > 17).show()

    // Group by age and count rows per group.
    personDF.groupBy("age").count().show()

    spark.stop()
  }

运行结果为:

// 别名
personDF.select(personDF("name"), (personDF("age") + 10).as("AGE")).show()
+------+---+
|  name|AGE|
+------+---+
|  路飞| 27|
|  索隆| 28|
|  乔巴| 15|
|  艾斯| 28|
|  萨博| 28|
|香克斯| 42|
+------+---+


//根据某一列的条件过滤行
personDF.filter(personDF.col("age") > 17).show()
+---+------+-----+------+
|age|deptno|money|  name|
+---+------+-----+------+
| 18|     1|    9|  索隆|
| 18|     2|   18|  艾斯|
| 18|     2|   16|  萨博|
| 32|     3|   30|香克斯|
+---+------+-----+------+

//分组聚合
personDF.groupBy("age").count().show()
+---+-----+
|age|count|
+---+-----+
| 32|    1|
| 17|    1|
|  5|    1|
| 18|    3|
+---+-----+

 

RDD 互操作

person.txt

路飞 17 1 15 xxxxxxxxxxx@xxxxxxx.com
索隆 18 1 9 xxxxxxxxxxx@xxxxxxx.com
乔巴 5 1 5 xxxxxxxxxxx@xxxxxxx.com
艾斯 18 2 18 xxxxxxxxxxx@xxxxxxx.com
萨博 18 2 16 xxxxxxxxxxx@xxxxxxx.com
香克斯 32 3 30 xxxxxxxxxxx@xxxxxxx.com
 3 3 30 xxxxxxxxxxx@xxxxxxx.com
null 3 3 30 xxxxxxxxxxx@xxxxxxx.com
object DataFrameRDDDemo3 {
  /**
   * RDD <-> DataFrame interop demo: parse a space-delimited text file into a
   * typed DataFrame, then exercise show/select/filter/sort/alias/join.
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameRDDDemo3").master("local[*]").getOrCreate()
    val rdd = spark.sparkContext.textFile("C:\\Users\\LUFFY\\Desktop\\testData\\person.txt")
    import spark.implicits._

    // Parse each line into a Person and convert to a DataFrame.
    // Extracted to a local helper: the original duplicated this chain verbatim
    // for personDF and personDF2.
    def loadPersonDF() =
      rdd.map(_.split(" ")).map(line => Person(line(0), line(1).toInt, line(2).toInt, line(3).toDouble, line(4))).toDF()

    val personDF = loadPersonDF()

    personDF.show()
    // Show 3 rows and do not truncate long columns.
    personDF.show(3, false)
    // select email from person
    personDF.select("email").show(3, false)
    // select * from person where name = '' or name = 'null'
    // FIX: split(" ") turns the leading-space record's name into "" (empty),
    // never " ", so the predicate must compare against the empty string —
    // the original "name=' '" could never match.
    personDF.filter("name='' or name='null'").show()
    // Rows whose name starts with 路 (spark.sql("show functions") lists all built-ins).
    personDF.filter("SUBSTR(name,0,1)='路'").show()

    // Sort by name ascending, then age descending.
    personDF.sort(personDF("name").asc, personDF("age").desc).show()

    // Column alias.
    personDF.select(personDF("name").as("名称")).show()

    // Outer self-join on name.
    val personDF2 = loadPersonDF()
    personDF.join(personDF2, personDF("name") === personDF2("name"), "outer").show()
    spark.stop()
  }

  /** One record of the person.txt file. */
  case class Person(name: String, age: Int, deptId: Int, money: Double, email: String)
}

运行结果如下

 personDF.show()
+------+---+------+-----+--------------------+
|  name|age|deptId|money|               email|
+------+---+------+-----+--------------------+
|  路飞| 17|     1| 15.0|xxxxxxxxxxx@xxxxx...|
|  索隆| 18|     1|  9.0|xxxxxxxxxxx@xxxxx...|
|  乔巴|  5|     1|  5.0|xxxxxxxxxxx@xxxxx...|
|  艾斯| 18|     2| 18.0|xxxxxxxxxxx@xxxxx...|
|  萨博| 18|     2| 16.0|xxxxxxxxxxx@xxxxx...|
|香克斯| 32|     3| 30.0|xxxxxxxxxxx@xxxxx...|
|      |  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|
|  null|  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|
+------+---+------+-----+--------------------+

//显示 3 条,并且完全显示过长的列
personDF.show(3, false)
+----+---+------+-----+-----------------------+
|name|age|deptId|money|email                  |
+----+---+------+-----+-----------------------+
|路飞|17 |1     |15.0 |xxxxxxxxxxx@xxxxxxx.com|
|索隆|18 |1     |9.0  |xxxxxxxxxxx@xxxxxxx.com|
|乔巴|5  |1     |5.0  |xxxxxxxxxxx@xxxxxxx.com|
+----+---+------+-----+-----------------------+

//select email from person
personDF.select("email").show(3, false)
+-----------------------+
|email                  |
+-----------------------+
|xxxxxxxxxxx@xxxxxxx.com|
|xxxxxxxxxxx@xxxxxxx.com|
|xxxxxxxxxxx@xxxxxxx.com|
+-----------------------+

//select * from person where name = '' or name = 'null'
personDF.filter("name='' or name='null'").show()
+----+---+------+-----+--------------------+
|name|age|deptId|money|               email|
+----+---+------+-----+--------------------+
|    |  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|
|null|  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|
+----+---+------+-----+--------------------+


//查询 name 以 路开头的数据 (通过 spark.sql("show functions") 可以查询到所有支持的函数)
personDF.filter("SUBSTR(name,0,1)='路'").show()
+----+---+------+-----+--------------------+
|name|age|deptId|money|               email|
+----+---+------+-----+--------------------+
|路飞| 17|     1| 15.0|xxxxxxxxxxx@xxxxx...|
+----+---+------+-----+--------------------+


//排序
personDF.sort(personDF("name").asc, personDF("age").desc).show()
+------+---+------+-----+--------------------+
|  name|age|deptId|money|               email|
+------+---+------+-----+--------------------+
|      |  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|
|  null|  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|
|  乔巴|  5|     1|  5.0|xxxxxxxxxxx@xxxxx...|
|  索隆| 18|     1|  9.0|xxxxxxxxxxx@xxxxx...|
|  艾斯| 18|     2| 18.0|xxxxxxxxxxx@xxxxx...|
|  萨博| 18|     2| 16.0|xxxxxxxxxxx@xxxxx...|
|  路飞| 17|     1| 15.0|xxxxxxxxxxx@xxxxx...|
|香克斯| 32|     3| 30.0|xxxxxxxxxxx@xxxxx...|
+------+---+------+-----+--------------------+


//别名
personDF.select(personDF("name").as("名称")).show()
+------+
|  名称|
+------+
|  路飞|
|  索隆|
|  乔巴|
|  艾斯|
|  萨博|
|香克斯|
|      |
|  null|
+------+

//join
val personDF2 = rdd.map(_.split(" ")).map(line => Person(line(0), line(1).toInt, line(2).toInt, line(3).toDouble, line(4))).toDF()
personDF.join(personDF2, personDF("name") === personDF2("name"), "outer").show()
+------+---+------+-----+--------------------+------+---+------+-----+--------------------+
|  name|age|deptId|money|               email|  name|age|deptId|money|               email|
+------+---+------+-----+--------------------+------+---+------+-----+--------------------+
|  乔巴|  5|     1|  5.0|xxxxxxxxxxx@xxxxx...|  乔巴|  5|     1|  5.0|xxxxxxxxxxx@xxxxx...|
|香克斯| 32|     3| 30.0|xxxxxxxxxxx@xxxxx...|香克斯| 32|     3| 30.0|xxxxxxxxxxx@xxxxx...|
|  路飞| 17|     1| 15.0|xxxxxxxxxxx@xxxxx...|  路飞| 17|     1| 15.0|xxxxxxxxxxx@xxxxx...|
|  艾斯| 18|     2| 18.0|xxxxxxxxxxx@xxxxx...|  艾斯| 18|     2| 18.0|xxxxxxxxxxx@xxxxx...|
|  索隆| 18|     1|  9.0|xxxxxxxxxxx@xxxxx...|  索隆| 18|     1|  9.0|xxxxxxxxxxx@xxxxx...|
|  萨博| 18|     2| 16.0|xxxxxxxxxxx@xxxxx...|  萨博| 18|     2| 16.0|xxxxxxxxxxx@xxxxx...|
|      |  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|      |  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|
|  null|  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|  null|  3|     3| 30.0|xxxxxxxxxxx@xxxxx...|
+------+---+------+-----+--------------------+------+---+------+-----+--------------------+

 

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值