Spark DataFrame learning notes

Text data in accounts.txt (columns: id, name, balance):
1,Alice,100
2,Bob,200
3,Charlie,300
4,Denis,400
5,Edith,500

Define a case class; its parameter names become the column names of the table
case class Account(id: Int, name: String, balance: Double)

Create an RDD from the text file on HDFS
val rdd = sc.textFile("/tmp/wht/account/accounts.txt").map(_.split(","))

scala> rdd.collect()
res3: Array[Array[String]] = Array(Array(1, Alice, 100), Array(2, Bob, 200), Array(3, Charlie, 300), Array(4, Denis, 400), Array(5, Edith, 500))
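
Note that the collected RDD contains no header row. If the source file did start with a header line, it would need to be filtered out before parsing; a minimal sketch (assuming the header line starts with "id,"):

// Drop a possible header line before splitting the fields
val rddNoHeader = sc.textFile("/tmp/wht/account/accounts.txt")
  .filter(line => !line.startsWith("id,"))
  .map(_.split(","))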


Create an RDD of Account instances
val rddContainingCaseClass = rdd.map(p => Account(p(0).trim.toInt, p(1), p(2).trim.toDouble))

scala> rddContainingCaseClass.collect()
res1: Array[Account] = Array(Account(1,Alice,100.0), Account(2,Bob,200.0), Account(3,Charlie,300.0), Account(4,Denis,400.0), Account(5,Edith,500.0))

Convert the RDD of case classes to a DataFrame. toDF comes from import sqlContext.implicits._, which the spark-shell imports automatically; in a compiled application it must be added explicitly.
val account = rddContainingCaseClass.toDF()
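
As an alternative sketch (not part of the original session), the schema can also be declared programmatically and the DataFrame built with createDataFrame; the column names and types below mirror the Account case class:

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StructField, IntegerType, StringType, DoubleType}

// Declare the schema explicitly instead of relying on a case class
val schema = StructType(Seq(
  StructField("id", IntegerType, nullable = false),
  StructField("name", StringType, nullable = true),
  StructField("balance", DoubleType, nullable = false)
))

// Map each split line to a Row matching the schema, then build the DataFrame
val rowRDD = rdd.map(p => Row(p(0).trim.toInt, p(1), p(2).trim.toDouble))
val accountFromSchema = sqlContext.createDataFrame(rowRDD, schema)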


------------------Basic DataFrame operations------------------
Print the DataFrame contents to standard output
account.show()

+---+-------+-------+
| id|   name|balance|
+---+-------+-------+
|  1|  Alice|  100.0|
|  2|    Bob|  200.0|
|  3|Charlie|  300.0|
|  4|  Denis|  400.0|
|  5|  Edith|  500.0|
+---+-------+-------+


scala> account.select("id").show()

+---+
| id|
+---+
|  1|
|  2|
|  3|
|  4|
|  5|
+---+

scala> account.filter(account("id")>3).show()
+---+-----+-------+
| id| name|balance|
+---+-----+-------+
|  4|Denis|  400.0|
|  5|Edith|  500.0|
+---+-----+-------+
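
Columns can also be combined into expressions and aggregated without writing SQL; a minimal sketch (not from the original session):

import org.apache.spark.sql.functions.{avg, sum}

// Total and average balance over the whole table
account.agg(sum("balance"), avg("balance")).show()

// A derived column: balance scaled by a hypothetical factor of 1.1
account.select(account("name"), (account("balance") * 1.1).as("adjusted")).show()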


------------------Save a DataFrame to files------------------
scala> account.select("name", "balance").write.save("nameAndBanlance.parquet")

18/10/19 10:57:25 INFO ParquetRelation: Listing hdfs://zetdep07.zetyun.com:8020/user/root/nameAndBanlance.parquet on driver
Download it to the local filesystem:
hdfs dfs -get /user/root/nameAndBanlance.parquet

cd nameAndBanlance.parquet
[root@zetdep07 nameAndBanlance.parquet]# ll
total 16
-rw-r--r-- 1 root root 295 Oct 19 10:57 _common_metadata
-rw-r--r-- 1 root root 777 Oct 19 10:57 _metadata
-rw-r--r-- 1 root root 579 Oct 19 10:57 part-r-00000-0874d8eb-1fe9-4f4c-bee9-3b01c545f236.gz.parquet
-rw-r--r-- 1 root root 561 Oct 19 10:57 part-r-00001-0874d8eb-1fe9-4f4c-bee9-3b01c545f236.gz.parquet
-rw-r--r-- 1 root root   0 Oct 19 10:57 _SUCCESS

Save in ORC format
scala> account.select("name", "balance").write.format("orc").save("nameAndBanlance.orc")

18/10/19 11:04:12 INFO OrcRelation: Listing hdfs://zetdep07.zetyun.com:8020/user/root/nameAndBanlance.orc on driver
Download it to the local filesystem:
hdfs dfs -get /user/root/nameAndBanlance.orc

cd nameAndBanlance.orc
[root@zetdep07 nameAndBanlance.orc]# ll
total 8
-rw-r--r-- 1 root root 388 Oct 19 11:05 part-r-00000-c656867f-d515-4ac1-b853-1cb62820d262.orc
-rw-r--r-- 1 root root 374 Oct 19 11:05 part-r-00001-c656867f-d515-4ac1-b853-1cb62820d262.orc
-rw-r--r-- 1 root root   0 Oct 19 11:05 _SUCCESS
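
By default, write.save() fails if the target path already exists. To re-run these examples, the save mode can be set explicitly, and coalesce(1) collapses the output into a single part file; a minimal sketch (not from the original session):

import org.apache.spark.sql.SaveMode

// Overwrite any existing output and write a single part file
account.select("name", "balance")
  .coalesce(1)
  .write
  .mode(SaveMode.Overwrite)
  .format("parquet")
  .save("nameAndBanlance.parquet")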


------------------Register a DataFrame as a table------------------
account.registerTempTable("accountTable")

scala> val result = sqlContext.sql("SELECT * FROM accountTable")
18/10/19 10:48:48 INFO ParseDriver: Parsing command: SELECT * FROM accountTable
18/10/19 10:48:48 INFO ParseDriver: Parse Completed
result: org.apache.spark.sql.DataFrame = [id: int, name: string, balance: double]

val result = sqlContext.sql("SELECT * FROM accountTable").collect()
result: Array[org.apache.spark.sql.Row] = Array([1,Alice,100.0], [2,Bob,200.0], [3,Charlie,300.0], [4,Denis,400.0], [5,Edith,500.0])
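
The registered temp table accepts arbitrary SQL, not just SELECT *; for example, filtering and aggregating (a sketch, not from the original session):

// Accounts with a balance above 250
sqlContext.sql("SELECT name, balance FROM accountTable WHERE balance > 250").show()

// Row count and total balance
sqlContext.sql("SELECT COUNT(*) AS cnt, SUM(balance) AS total FROM accountTable").show()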

--------Read a Parquet file into a DataFrame-------
val dfpar = sqlContext.read.load("/user/root/nameAndBanlance.parquet")

scala> dfpar.select("name","balance").show()
+-------+-------+
|   name|balance|
+-------+-------+
|  Alice|  100.0|
|    Bob|  200.0|
|Charlie|  300.0|
|  Denis|  400.0|
|  Edith|  500.0|
+-------+-------+


scala> dfpar.printSchema
root
 |-- name: string (nullable = true)
 |-- balance: double (nullable = true)


--------Read an ORC file into a DataFrame-------
val dforc = sqlContext.read.format("orc").load("/user/root/nameAndBanlance.orc")
scala> dforc.select("name","balance").show()
+-------+-------+
|   name|balance|
+-------+-------+
|  Alice|  100.0|
|    Bob|  200.0|
|Charlie|  300.0|
|  Denis|  400.0|
|  Edith|  500.0|
+-------+-------+

scala> dforc.printSchema
root
 |-- name: string (nullable = true)
 |-- balance: double (nullable = true)

--------Read a JSON file into a DataFrame-------
Note: Spark's JSON data source expects one complete JSON object per line. employee.json here is a pretty-printed, multi-line JSON document, so the wrapper lines cannot be parsed as records and end up in the _corrupt_record column (and as null rows below).
scala> val dfjson = sqlContext.read.format("json").load("/tmp/wht/employee/employee.json")
dfjson: org.apache.spark.sql.DataFrame = [_corrupt_record: string, firstName: string, lastName: string]

scala> dfjson.select("firstName","lastName").show()
+---------+--------+
|firstName|lastName|
+---------+--------+
|     null|    null|
|     null|    null|
|     Bill|   Gates|
|   George|    Bush|
|   Thomas|  Carter|
|     null|    null|
|     null|    null|
+---------+--------+


scala> dfjson.collect()
res22: Array[org.apache.spark.sql.Row] = Array([{,null,null], ["employees": [,null,null], [null,Bill,Gates], [null,George,Bush], [null,Thomas,Carter], [],null,null], [},null,null])

scala> dfjson.printSchema
root
 |-- _corrupt_record: string (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
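
If each record is instead written as a single JSON object on its own line, the same reader infers a clean schema; a minimal sketch using an in-memory RDD of JSON strings (hypothetical data):

// One JSON object per line is what the json data source expects
val jsonLines = sc.parallelize(Seq(
  """{"firstName": "Bill", "lastName": "Gates"}""",
  """{"firstName": "George", "lastName": "Bush"}"""
))
val dfGood = sqlContext.read.json(jsonLines)
dfGood.printSchema()
dfGood.show()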
 
