person.json
{"name":"路飞","age":17,"deptno":1,"money":15}
{"name":"索隆","age":18,"deptno":1,"money":9}
{"name":"乔巴","age":5,"deptno":1,"money":5}
{"name":"艾斯","age":18,"deptno":2,"money":18}
{"name":"萨博","age":18,"deptno":2,"money":16}
{"name":"香克斯","age":32,"deptno":3,"money":30}
Basic operations
Test code 1
import org.apache.spark.sql.SparkSession
import org.junit.Test

class SparkSessionDemo {

  @Test
  def testDF1(): Unit = {
    val spark = SparkSession.builder().appName("SparkSessionDemo").master("local[2]").getOrCreate()
    val personDF = spark.read.json("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    // Print the schema
    personDF.printSchema()
    // Show the first 20 rows (show displays 20 rows by default)
    personDF.show()
    // Select all values of one column (three equivalent ways)
    personDF.select("name").show()
    personDF.select(personDF.col("name"), personDF.col("age") + 10).show()
    personDF.select(personDF("name"), personDF("age") + 10).show()
    spark.stop()
  }
}
The output is:
// Print the schema
personDF.printSchema()
root
|-- age: long (nullable = true)
|-- deptno: long (nullable = true)
|-- money: long (nullable = true)
|-- name: string (nullable = true)
// Show the first 20 rows (show displays 20 rows by default)
personDF.show()
+---+------+-----+------+
|age|deptno|money| name|
+---+------+-----+------+
| 17| 1| 15| 路飞|
| 18| 1| 9| 索隆|
| 5| 1| 5| 乔巴|
| 18| 2| 18| 艾斯|
| 18| 2| 16| 萨博|
| 32| 3| 30|香克斯|
+---+------+-----+------+
// Select all values of one column (three equivalent ways)
personDF.select("name").show()
+------+
| name|
+------+
| 路飞|
| 索隆|
| 乔巴|
| 艾斯|
| 萨博|
|香克斯|
+------+
personDF.select(personDF.col("name"), personDF.col("age") + 10).show()
+------+----------+
| name|(age + 10)|
+------+----------+
| 路飞| 27|
| 索隆| 28|
| 乔巴| 15|
| 艾斯| 28|
| 萨博| 28|
|香克斯| 42|
+------+----------+
personDF.select(personDF("name"), personDF("age") + 10).show()
+------+----------+
| name|(age + 10)|
+------+----------+
| 路飞| 27|
| 索隆| 28|
| 乔巴| 15|
| 艾斯| 28|
| 萨博| 28|
|香克斯| 42|
+------+----------+
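Besides the three forms above, once spark.implicits._ is imported a column can also be referenced with the $ interpolator. A minimal sketch, assuming the same spark session and personDF as in the test above:

import spark.implicits._

// Equivalent to the select variants above, written with $-column syntax
personDF.select($"name", $"age" + 10).show()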
Test code 2
  @Test
  def testDF2(): Unit = {
    val spark = SparkSession.builder().appName("SparkSessionDemo").master("local[2]").getOrCreate()
    val personDF = spark.read.json("C:\\Users\\LUFFY\\Desktop\\testData\\person.json")
    // Column alias
    personDF.select(personDF("name"), (personDF("age") + 10).as("AGE")).show()
    // Filter rows on a column condition
    personDF.filter(personDF.col("age") > 17).show()
    // Group by and aggregate
    personDF.groupBy("age").count().show()
    spark.stop()
  }
The output is:
// Column alias
personDF.select(personDF("name"), (personDF("age") + 10).as("AGE")).show()
+------+---+
| name|AGE|
+------+---+
| 路飞| 27|
| 索隆| 28|
| 乔巴| 15|
| 艾斯| 28|
| 萨博| 28|
|香克斯| 42|
+------+---+
// Filter rows on a column condition
personDF.filter(personDF.col("age") > 17).show()
+---+------+-----+------+
|age|deptno|money| name|
+---+------+-----+------+
| 18| 1| 9| 索隆|
| 18| 2| 18| 艾斯|
| 18| 2| 16| 萨博|
| 32| 3| 30|香克斯|
+---+------+-----+------+
// Group by and aggregate
personDF.groupBy("age").count().show()
+---+-----+
|age|count|
+---+-----+
| 32| 1|
| 17| 1|
| 5| 1|
| 18| 3|
+---+-----+
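The same operations can also be expressed as SQL by registering the DataFrame as a temporary view and querying it through spark.sql. A minimal sketch, assuming the spark session and personDF from the tests above:

// Register personDF as a temporary view so it can be queried with SQL
personDF.createOrReplaceTempView("person")

// Equivalent to personDF.filter(personDF.col("age") > 17)
spark.sql("SELECT * FROM person WHERE age > 17").show()

// Equivalent to personDF.groupBy("age").count()
spark.sql("SELECT age, COUNT(*) AS count FROM person GROUP BY age").show()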
Interoperating with RDDs
person.txt
路飞 17 1 15 xxxxxxxxxxx@xxxxxxx.com
索隆 18 1 9 xxxxxxxxxxx@xxxxxxx.com
乔巴 5 1 5 xxxxxxxxxxx@xxxxxxx.com
艾斯 18 2 18 xxxxxxxxxxx@xxxxxxx.com
萨博 18 2 16 xxxxxxxxxxx@xxxxxxx.com
香克斯 32 3 30 xxxxxxxxxxx@xxxxxxx.com
 3 3 30 xxxxxxxxxxx@xxxxxxx.com
null 3 3 30 xxxxxxxxxxx@xxxxxxx.com
import org.apache.spark.sql.SparkSession

object DataFrameRDDDemo3 {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("DataFrameRDDDemo3").master("local[*]").getOrCreate()
    val rdd = spark.sparkContext.textFile("C:\\Users\\LUFFY\\Desktop\\testData\\person.txt")
    import spark.implicits._
    // Reflection-based conversion: map each line to a Person, then call toDF()
    val personDF = rdd.map(_.split(" ")).map(line => Person(line(0), line(1).toInt, line(2).toInt, line(3).toDouble, line(4))).toDF()
    personDF.show()
    // Show 3 rows without truncating long columns
    personDF.show(3, false)
    // select email from person
    personDF.select("email").show(3, false)
    // select * from person where name = '' or name = 'null'
    personDF.filter("name='' or name='null'").show()
    // Query rows whose name starts with 路 (spark.sql("show functions") lists all supported functions)
    personDF.filter("SUBSTR(name,0,1)='路'").show()
    // Sort
    personDF.sort(personDF("name").asc, personDF("age").desc).show()
    // Column alias
    personDF.select(personDF("name").as("名称")).show()
    // Join
    val personDF2 = rdd.map(_.split(" ")).map(line => Person(line(0), line(1).toInt, line(2).toInt, line(3).toDouble, line(4))).toDF()
    personDF.join(personDF2, personDF("name") === personDF2("name"), "outer").show()
    spark.stop()
  }

  case class Person(name: String, age: Int, deptId: Int, money: Double, email: String)
}
The output is as follows:
personDF.show()
+------+---+------+-----+--------------------+
| name|age|deptId|money| email|
+------+---+------+-----+--------------------+
| 路飞| 17| 1| 15.0|xxxxxxxxxxx@xxxxx...|
| 索隆| 18| 1| 9.0|xxxxxxxxxxx@xxxxx...|
| 乔巴| 5| 1| 5.0|xxxxxxxxxxx@xxxxx...|
| 艾斯| 18| 2| 18.0|xxxxxxxxxxx@xxxxx...|
| 萨博| 18| 2| 16.0|xxxxxxxxxxx@xxxxx...|
|香克斯| 32| 3| 30.0|xxxxxxxxxxx@xxxxx...|
| | 3| 3| 30.0|xxxxxxxxxxx@xxxxx...|
| null| 3| 3| 30.0|xxxxxxxxxxx@xxxxx...|
+------+---+------+-----+--------------------+
// Show 3 rows without truncating long columns
personDF.show(3, false)
+----+---+------+-----+-----------------------+
|name|age|deptId|money|email |
+----+---+------+-----+-----------------------+
|路飞|17 |1 |15.0 |xxxxxxxxxxx@xxxxxxx.com|
|索隆|18 |1 |9.0 |xxxxxxxxxxx@xxxxxxx.com|
|乔巴|5 |1 |5.0 |xxxxxxxxxxx@xxxxxxx.com|
+----+---+------+-----+-----------------------+
//select email from person
personDF.select("email").show(3, false)
+-----------------------+
|email |
+-----------------------+
|xxxxxxxxxxx@xxxxxxx.com|
|xxxxxxxxxxx@xxxxxxx.com|
|xxxxxxxxxxx@xxxxxxx.com|
+-----------------------+
// select * from person where name = '' or name = 'null'
personDF.filter("name='' or name='null'").show()
+----+---+------+-----+--------------------+
|name|age|deptId|money| email|
+----+---+------+-----+--------------------+
| | 3| 3| 30.0|xxxxxxxxxxx@xxxxx...|
|null| 3| 3| 30.0|xxxxxxxxxxx@xxxxx...|
+----+---+------+-----+--------------------+
// Query rows whose name starts with 路 (spark.sql("show functions") lists all supported functions)
personDF.filter("SUBSTR(name,0,1)='路'").show()
+----+---+------+-----+--------------------+
|name|age|deptId|money| email|
+----+---+------+-----+--------------------+
|路飞| 17| 1| 15.0|xxxxxxxxxxx@xxxxx...|
+----+---+------+-----+--------------------+
// Sort
personDF.sort(personDF("name").asc, personDF("age").desc).show()
+------+---+------+-----+--------------------+
| name|age|deptId|money| email|
+------+---+------+-----+--------------------+
| | 3| 3| 30.0|xxxxxxxxxxx@xxxxx...|
| null| 3| 3| 30.0|xxxxxxxxxxx@xxxxx...|
| 乔巴| 5| 1| 5.0|xxxxxxxxxxx@xxxxx...|
| 索隆| 18| 1| 9.0|xxxxxxxxxxx@xxxxx...|
| 艾斯| 18| 2| 18.0|xxxxxxxxxxx@xxxxx...|
| 萨博| 18| 2| 16.0|xxxxxxxxxxx@xxxxx...|
| 路飞| 17| 1| 15.0|xxxxxxxxxxx@xxxxx...|
|香克斯| 32| 3| 30.0|xxxxxxxxxxx@xxxxx...|
+------+---+------+-----+--------------------+
// Column alias
personDF.select(personDF("name").as("名称")).show()
+------+
| 名称|
+------+
| 路飞|
| 索隆|
| 乔巴|
| 艾斯|
| 萨博|
|香克斯|
| |
| null|
+------+
//join
val personDF2 = rdd.map(_.split(" ")).map(line => Person(line(0), line(1).toInt, line(2).toInt, line(3).toDouble, line(4))).toDF()
personDF.join(personDF2, personDF("name") === personDF2("name"), "outer").show()
+------+---+------+-----+--------------------+------+---+------+-----+--------------------+
| name|age|deptId|money| email| name|age|deptId|money| email|
+------+---+------+-----+--------------------+------+---+------+-----+--------------------+
| 乔巴| 5| 1| 5.0|xxxxxxxxxxx@xxxxx...| 乔巴| 5| 1| 5.0|xxxxxxxxxxx@xxxxx...|
|香克斯| 32| 3| 30.0|xxxxxxxxxxx@xxxxx...|香克斯| 32| 3| 30.0|xxxxxxxxxxx@xxxxx...|
| 路飞| 17| 1| 15.0|xxxxxxxxxxx@xxxxx...| 路飞| 17| 1| 15.0|xxxxxxxxxxx@xxxxx...|
| 艾斯| 18| 2| 18.0|xxxxxxxxxxx@xxxxx...| 艾斯| 18| 2| 18.0|xxxxxxxxxxx@xxxxx...|
| 索隆| 18| 1| 9.0|xxxxxxxxxxx@xxxxx...| 索隆| 18| 1| 9.0|xxxxxxxxxxx@xxxxx...|
| 萨博| 18| 2| 16.0|xxxxxxxxxxx@xxxxx...| 萨博| 18| 2| 16.0|xxxxxxxxxxx@xxxxx...|
| | 3| 3| 30.0|xxxxxxxxxxx@xxxxx...| | 3| 3| 30.0|xxxxxxxxxxx@xxxxx...|
| null| 3| 3| 30.0|xxxxxxxxxxx@xxxxx...| null| 3| 3| 30.0|xxxxxxxxxxx@xxxxx...|
+------+---+------+-----+--------------------+------+---+------+-----+--------------------+
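The example above converts the RDD with the reflection-based approach (a case class plus toDF()). Spark also supports building a DataFrame from an RDD by specifying the schema programmatically with StructType and createDataFrame. A minimal sketch of that second approach, assuming the same spark session and person.txt as above:

import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

val rdd = spark.sparkContext.textFile("C:\\Users\\LUFFY\\Desktop\\testData\\person.txt")

// Describe the columns explicitly instead of relying on a case class
val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true),
  StructField("deptId", IntegerType, nullable = true),
  StructField("money", DoubleType, nullable = true),
  StructField("email", StringType, nullable = true)
))

// Map each line to a Row that matches the schema, then create the DataFrame
val rowRDD = rdd.map(_.split(" ")).map(a => Row(a(0), a(1).toInt, a(2).toInt, a(3).toDouble, a(4)))
val personDF3 = spark.createDataFrame(rowRDD, schema)
personDF3.printSchema()
personDF3.show()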