DataFrame简介
DataFrame:一个命名列方式组织的分布式数据集,类似于关系数据库中的一个表 (1)可以由机构化数据文件得到 (2)hive表 (3)RDD转化
department.json文件
{"name":"Develoment Dept","depId":"1"}
{"name":"Personnel Dept","depId":"2"}
{"name":"Testing Dept","depId":"3"}
newpeople.json文件
{"name":"John","job number":"007","age":32,"gender":"male","depId":1,"salary":4000}
{"name":"Herry","job number":"008","age":20,"gender":"female","depId":2,"salary":4000}
{"name":"Jack","job number":"009","age":26,"gender":"male","depId":3,"salary":4000}
people.json文件
{"name":"Michael","job number":"001","age":33,"gender":"male","depId":1,"salary":3000}
{"name":"andy","job number":"002","age":30,"gender":"female","depId":2,"salary":4000}
{"name":"Justin","job number":"003","age":19,"gender":"male","depId":1,"salary":4000}
{"name":"John","job number":"004","age":32,"gender":"male","depId":1,"salary":4000}
{"name":"Herry","job number":"005","age":20,"gender":"female","depId":2,"salary":4000}
{"name":"Jack","job number":"006","age":26,"gender":"male","depId":3,"salary":4000}
package com.dt.spark.main.DataFrameLearn
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
/**
*
*/
object DataFrameSQL {
def main(args: Array[String]) {
val conf = new SparkConf()
conf.setAppName("test")
conf.setMaster("local")
val sc = new SparkContext(conf)
//设置日志级别
Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
Logger.getLogger("org.apache.spark.sql").setLevel(Level.WARN)
val sqlContext = new HiveContext(sc)
//加载json文件
val people = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/people.json")
val newpeople = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/newpeople.json")
val dept = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/department.json")
//==========================================
/*
表格形式显示DataFrame信息
show():
默认输出前20条
show(n):
可以设置参数
*/
people.show()
// +---+-----+------+----------+-------+-------+
// |age|depId|gender|job number| name|sarlary|
// +---+-----+------+----------+-------+-------+
// | 33| 1| male| 001|Michael| 3000|
// | 30| 2|female| 002| andy| 4000|
// | 19| 1| male| 003| Justin| 4000|
// | 32| 1| male| 004| John| 4000|
// | 20| 2|female| 005| Herry| 4000|
// | 26| 3| male| 006| Jack| 4000|
// +---+-----+------+----------+-------+-------+
/*
打印printSchema
*/
people.printSchema()
// root
// |-- age: long (nullable = true)
// |-- depId: long (nullable = true)
// |-- gender: string (nullable = true)
// |-- job number: string (nullable = true)
// |-- name: string (nullable = true)
// |-- sarlary: long (nullable = true)
//==========================================
/*
DataFrame基本信息的查询
columns:已数组的形式返回列名字
count: 返回总记录数
take: 已数组的形式返回前n记录信息
toJSON: 转换为JsonRDD
*/
// people.columns
people.columns.foreach(println(_))
// age
// depId
// gender
// job number
// name
// salary
println(people.count())
//6
people.take(2).foreach(print(_))
//[33,1,male,001,Michael,3000][30,2,female,002,andy,4000]
people.toJSON.collect().foreach(println(_))
// {"age":33,"depId":1,"gender":"male","job number":"001","name":"Michael","salary":3000}
// {"age":30,"depId":2,"gender":"female","job number":"002","name":"andy","salary":4000}
// {"age":19,"depId":1,"gender":"male","job number":"003","name":"Justin","salary":4000}
// {"age":32,"depId":1,"gender":"male","job number":"004","name":"John","salary":4000}
// {"age":20,"depId":2,"gender":"female","job number":"005","name":"Herry","salary":4000}
// {"age":26,"depId":3,"gender":"male","job number":"006","name":"Jack","salary":4000}
//==========================================
/*
DataFrame的条件查询
filter:
where:
*/
println(people.filter("gender = 'male'").count())
//4
println(people.filter("age > 25").count())
//4
println(people.filter("age > 25").filter("gender = 'male'").count())
// 3
println(people.where("age > 25").count())
//4
//==========================================
/*
排序和分区排序
*/
people.sort(people("age").asc).show
// +---+-----+------+----------+-------+------+
// |age|depId|gender|job number| name|salary|
// +---+-----+------+----------+-------+------+
// | 19| 1| male| 003| Justin| 4000|
// | 20| 2|female| 005| Herry| 4000|
// | 26| 3| male| 006| Jack| 4000|
// | 30| 2|female| 002| andy| 4000|
// | 32| 1| male| 004| John| 4000|
// | 33| 1| male| 001|Michael| 3000|
// +---+-----+------+----------+-------+------+
people.sortWithinPartitions("gender","age").show()
// +---+-----+------+----------+-------+------+
// |age|depId|gender|job number| name|salary|
// +---+-----+------+----------+-------+------+
// | 20| 2|female| 005| Herry| 4000|
// | 30| 2|female| 002| andy| 4000|
// | 19| 1| male| 003| Justin| 4000|
// | 26| 3| male| 006| Jack| 4000|
// | 32| 1| male| 004| John| 4000|
// | 33| 1| male| 001|Michael| 3000|
// +---+-----+------+----------+-------+------+
//==========================================
/*
增加列
*/
//select only "name" column
people.select("age").show()
// +---+
// |age|
// +---+
// | 33|
// | 30|
// | 19|
// | 32|
// | 20|
// | 26|
// +---+
//select everybody,but increment the age by 1
//选择同时定义
people.select(people("age"), people("age") + 1).show()
// +---+---------+
// |age|(age + 1)|
// +---+---------+
// | 33| 34|
// | 30| 31|
// | 19| 20|
// | 32| 33|
// | 20| 21|
// | 26| 27|
// +---+---------+
people.withColumn("level",people("age")/10).show
// +---+-----+------+----------+-------+------+-----+
// |age|depId|gender|job number| name|salary|level|
// +---+-----+------+----------+-------+------+-----+
// | 33| 1| male| 001|Michael| 3000| 3.3|
// | 30| 2|female| 002| andy| 4000| 3.0|
// | 19| 1| male| 003| Justin| 4000| 1.9|
// | 32| 1| male| 004| John| 4000| 3.2|
// | 20| 2|female| 005| Herry| 4000| 2.0|
// | 26| 3| male| 006| Jack| 4000| 2.6|
// +---+-----+------+----------+-------+------+-----+
//==========================================
/*
修改列名字
*/
people.withColumnRenamed("job number","jobId").show()
// +---+-----+------+-----+-------+------+
// |age|depId|gender|jobId| name|salary|
// +---+-----+------+-----+-------+------+
// | 33| 1| male| 001|Michael| 3000|
// | 30| 2|female| 002| andy| 4000|
// | 19| 1| male| 003| Justin| 4000|
// | 32| 1| male| 004| John| 4000|
// | 20| 2|female| 005| Herry| 4000|
// | 26| 3| male| 006| Jack| 4000|
// +---+-----+------+-----+-------+------+
//==========================================
/*
unionAll操作
*/
people.unionAll(newpeople).show()
//
// +---+-----+------+----------+-------+------+
// |age|depId|gender|job number| name|salary|
// +---+-----+------+----------+-------+------+
// | 33| 1| male| 001|Michael| 3000|
// | 30| 2|female| 002| andy| 4000|
// | 19| 1| male| 003| Justin| 4000|
// | 32| 1| male| 004| John| 4000|
// | 20| 2|female| 005| Herry| 4000|
// | 26| 3| male| 006| Jack| 4000|
// | 32| 1| male| 007| John| 4000|
// | 20| 2|female| 008| Herry| 4000|
// | 26| 3| male| 009| Jack| 4000|
// +---+-----+------+----------+-------+------+
//==========================================
/*
聚合操作groupBy
*/
people.unionAll(newpeople).groupBy("name").count.show
// +-------+-----+
// | name|count|
// +-------+-----+
// | Jack| 2|
// | John| 2|
// |Michael| 1|
// | Justin| 1|
// | Herry| 2|
// | andy| 1|
// +-------+-----+
val depAgg = people.groupBy("depId").agg(
Map(
"age"->"max",
"gender"->"count"
)
)
depAgg.show()
//+-----+--------+-------------+
//|depId|max(age)|count(gender)|
//+-----+--------+-------------+
//| 1| 33| 3|
//| 2| 30| 2|
//| 3| 26| 1|
//+-----+--------+-------------+
depAgg.toDF("depId","maxAge","countGender").show()
// +-----+------+-----------+
// |depId|maxAge|countGender|
// +-----+------+-----------+
// | 1| 33| 3|
// | 2| 30| 2|
// | 3| 26| 1|
// +-----+------+-----------+
//==========================================
/*
去重操作distinct
*/
people.unionAll(newpeople).select("name").distinct().show()
// +-------+
// | name|
// +-------+
// | Jack|
// | John|
// |Michael|
// | Justin|
// | Herry|
// | andy|
// +-------+
//==========================================
/*
交集和异或
*/
people.select("name").except(newpeople.select("name")).show()
// +-------+
// | name|
// +-------+
// | andy|
// | Justin|
// |Michael|
// +-------+
people.select("name").intersect(newpeople.select("name")).show()
// +-----+
// | name|
// +-----+
// |Herry|
// | John|
// | Jack|
// +-----+
//==========================================
/*
DataFrame间的join
*/
people.join(dept,people("depId")===dept("depId"),"outer").show()
// +---+-----+------+----------+-------+------+-----+---------------+
// |age|depId|gender|job number| name|salary|depId| name|
// +---+-----+------+----------+-------+------+-----+---------------+
// | 33| 1| male| 001|Michael| 3000| 1|Develoment Dept|
// | 19| 1| male| 003| Justin| 4000| 1|Develoment Dept|
// | 32| 1| male| 004| John| 4000| 1|Develoment Dept|
// | 26| 3| male| 006| Jack| 4000| 3| Testing Dept|
// | 30| 2|female| 002| andy| 4000| 2| Personnel Dept|
// | 20| 2|female| 005| Herry| 4000| 2| Personnel Dept|
// +---+-----+------+----------+-------+------+-----+---------------+
sc.stop() }}
1465

被折叠的 条评论
为什么被折叠?



