DataFrame简介
DataFrame:一个命名列方式组织的分布式数据集,类似于关系数据库中的一个表 (1)可以由机构化数据文件得到 (2)hive表 (3)RDD转化
department.json文件
{"name":"Develoment Dept","depId":"1"} {"name":"Personnel Dept","depId":"2"} {"name":"Testing Dept","depId":"3"}
newpeople.json文件
{"name":"John","job number":"007","age":32,"gender":"male","depId":1,"salary":4000} {"name":"Herry","job number":"008","age":20,"gender":"female","depId":2,"salary":4000} {"name":"Jack","job number":"009","age":26,"gender":"male","depId":3,"salary":4000}
people.json文件
{"name":"Michael","job number":"001","age":33,"gender":"male","depId":1,"salary":3000} {"name":"andy","job number":"002","age":30,"gender":"female","depId":2,"salary":4000} {"name":"Justin","job number":"003","age":19,"gender":"male","depId":1,"salary":4000} {"name":"John","job number":"004","age":32,"gender":"male","depId":1,"salary":4000} {"name":"Herry","job number":"005","age":20,"gender":"female","depId":2,"salary":4000} {"name":"Jack","job number":"006","age":26,"gender":"male","depId":3,"salary":4000}
package com.dt.spark.main.DataFrameLearn import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} /** * */ object DataFrameSQL { def main(args: Array[String]) { val conf = new SparkConf() conf.setAppName("test") conf.setMaster("local") val sc = new SparkContext(conf) //设置日志级别 Logger.getLogger("org.apache.spark").setLevel(Level.WARN) Logger.getLogger("org.apache.spark.sql").setLevel(Level.WARN) val sqlContext = new HiveContext(sc) //加载json文件 val people = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/people.json") val newpeople = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/newpeople.json") val dept = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/department.json") //========================================== /* 表格形式显示DataFrame信息 show(): 默认输出前20条 show(n): 可以设置参数 */ people.show() // +---+-----+------+----------+-------+-------+ // |age|depId|gender|job number| name|sarlary| // +---+-----+------+----------+-------+-------+ // | 33| 1| male| 001|Michael| 3000| // | 30| 2|female| 002| andy| 4000| // | 19| 1| male| 003| Justin| 4000| // | 32| 1| male| 004| John| 4000| // | 20| 2|female| 005| Herry| 4000| // | 26| 3| male| 006| Jack| 4000| // +---+-----+------+----------+-------+-------+ /* 打印printSchema */ people.printSchema() // root // |-- age: long (nullable = true) // |-- depId: long (nullable = true) // |-- gender: string (nullable = true) // |-- job number: string (nullable = true) // |-- name: string (nullable = true) // |-- sarlary: long (nullable = true) //========================================== /* DataFrame基本信息的查询 columns:已数组的形式返回列名字 count: 返回总记录数 take: 已数组的形式返回前n记录信息 toJSON: 转换为JsonRDD */ // people.columns people.columns.foreach(println(_)) // age // depId // gender // job number // name // salary println(people.count()) //6 people.take(2).foreach(print(_)) //[33,1,male,001,Michael,3000][30,2,female,002,andy,4000] people.toJSON.collect().foreach(println(_)) // {"age":33,"depId":1,"gender":"male","job number":"001","name":"Michael","salary":3000} // {"age":30,"depId":2,"gender":"female","job number":"002","name":"andy","salary":4000} // {"age":19,"depId":1,"gender":"male","job number":"003","name":"Justin","salary":4000} // {"age":32,"depId":1,"gender":"male","job number":"004","name":"John","salary":4000} // {"age":20,"depId":2,"gender":"female","job number":"005","name":"Herry","salary":4000} // {"age":26,"depId":3,"gender":"male","job number":"006","name":"Jack","salary":4000} //========================================== /* DataFrame的条件查询 filter: where: */ println(people.filter("gender = 'male'").count()) //4 println(people.filter("age > 25").count()) //4 println(people.filter("age > 25").filter("gender = 'male'").count()) // 3 println(people.where("age > 25").count()) //4 //========================================== /* 排序和分区排序 */ people.sort(people("age").asc).show // +---+-----+------+----------+-------+------+ // |age|depId|gender|job number| name|salary| // +---+-----+------+----------+-------+------+ // | 19| 1| male| 003| Justin| 4000| // | 20| 2|female| 005| Herry| 4000| // | 26| 3| male| 006| Jack| 4000| // | 30| 2|female| 002| andy| 4000| // | 32| 1| male| 004| John| 4000| // | 33| 1| male| 001|Michael| 3000| // +---+-----+------+----------+-------+------+ people.sortWithinPartitions("gender","age").show() // +---+-----+------+----------+-------+------+ // |age|depId|gender|job number| name|salary| // +---+-----+------+----------+-------+------+ // | 20| 2|female| 005| Herry| 4000| // | 30| 2|female| 002| andy| 4000| // | 19| 1| male| 003| Justin| 4000| // | 26| 3| male| 006| Jack| 4000| // | 32| 1| male| 004| John| 4000| // | 33| 1| male| 001|Michael| 3000| // +---+-----+------+----------+-------+------+ //========================================== /* 增加列 */ //select only "name" column people.select("age").show() // +---+ // |age| // +---+ // | 33| // | 30| // | 19| // | 32| // | 20| // | 26| // +---+ //select everybody,but increment the age by 1 //选择同时定义 people.select(people("age"), people("age") + 1).show() // +---+---------+ // |age|(age + 1)| // +---+---------+ // | 33| 34| // | 30| 31| // | 19| 20| // | 32| 33| // | 20| 21| // | 26| 27| // +---+---------+ people.withColumn("level",people("age")/10).show // +---+-----+------+----------+-------+------+-----+ // |age|depId|gender|job number| name|salary|level| // +---+-----+------+----------+-------+------+-----+ // | 33| 1| male| 001|Michael| 3000| 3.3| // | 30| 2|female| 002| andy| 4000| 3.0| // | 19| 1| male| 003| Justin| 4000| 1.9| // | 32| 1| male| 004| John| 4000| 3.2| // | 20| 2|female| 005| Herry| 4000| 2.0| // | 26| 3| male| 006| Jack| 4000| 2.6| // +---+-----+------+----------+-------+------+-----+ //========================================== /* 修改列名字 */ people.withColumnRenamed("job number","jobId").show() // +---+-----+------+-----+-------+------+ // |age|depId|gender|jobId| name|salary| // +---+-----+------+-----+-------+------+ // | 33| 1| male| 001|Michael| 3000| // | 30| 2|female| 002| andy| 4000| // | 19| 1| male| 003| Justin| 4000| // | 32| 1| male| 004| John| 4000| // | 20| 2|female| 005| Herry| 4000| // | 26| 3| male| 006| Jack| 4000| // +---+-----+------+-----+-------+------+ //========================================== /* unionAll操作 */ people.unionAll(newpeople).show() // // +---+-----+------+----------+-------+------+ // |age|depId|gender|job number| name|salary| // +---+-----+------+----------+-------+------+ // | 33| 1| male| 001|Michael| 3000| // | 30| 2|female| 002| andy| 4000| // | 19| 1| male| 003| Justin| 4000| // | 32| 1| male| 004| John| 4000| // | 20| 2|female| 005| Herry| 4000| // | 26| 3| male| 006| Jack| 4000| // | 32| 1| male| 007| John| 4000| // | 20| 2|female| 008| Herry| 4000| // | 26| 3| male| 009| Jack| 4000| // +---+-----+------+----------+-------+------+ //========================================== /* 聚合操作groupBy */ people.unionAll(newpeople).groupBy("name").count.show // +-------+-----+ // | name|count| // +-------+-----+ // | Jack| 2| // | John| 2| // |Michael| 1| // | Justin| 1| // | Herry| 2| // | andy| 1| // +-------+-----+ val depAgg = people.groupBy("depId").agg( Map( "age"->"max", "gender"->"count" ) ) depAgg.show() //+-----+--------+-------------+ //|depId|max(age)|count(gender)| //+-----+--------+-------------+ //| 1| 33| 3| //| 2| 30| 2| //| 3| 26| 1| //+-----+--------+-------------+ depAgg.toDF("depId","maxAge","countGender").show() // +-----+------+-----------+ // |depId|maxAge|countGender| // +-----+------+-----------+ // | 1| 33| 3| // | 2| 30| 2| // | 3| 26| 1| // +-----+------+-----------+ //========================================== /* 去重操作distinct */ people.unionAll(newpeople).select("name").distinct().show() // +-------+ // | name| // +-------+ // | Jack| // | John| // |Michael| // | Justin| // | Herry| // | andy| // +-------+ //========================================== /* 交集和异或 */ people.select("name").except(newpeople.select("name")).show() // +-------+ // | name| // +-------+ // | andy| // | Justin| // |Michael| // +-------+ people.select("name").intersect(newpeople.select("name")).show() // +-----+ // | name| // +-----+ // |Herry| // | John| // | Jack| // +-----+ //========================================== /* DataFrame间的join */ people.join(dept,people("depId")===dept("depId"),"outer").show() // +---+-----+------+----------+-------+------+-----+---------------+ // |age|depId|gender|job number| name|salary|depId| name| // +---+-----+------+----------+-------+------+-----+---------------+ // | 33| 1| male| 001|Michael| 3000| 1|Develoment Dept| // | 19| 1| male| 003| Justin| 4000| 1|Develoment Dept| // | 32| 1| male| 004| John| 4000| 1|Develoment Dept| // | 26| 3| male| 006| Jack| 4000| 3| Testing Dept| // | 30| 2|female| 002| andy| 4000| 2| Personnel Dept| // | 20| 2|female| 005| Herry| 4000| 2| Personnel Dept| // +---+-----+------+----------+-------+------+-----+---------------+ sc.stop() }}