spark-DataFrame学习记录-[1]基础部分


DataFrame简介

DataFrame:一个命名列方式组织的分布式数据集,类似于关系数据库中的一个表 (1)可以由机构化数据文件得到 (2)hive表 (3)RDD转化


department.json文件

{"name":"Develoment Dept","depId":"1"}
{"name":"Personnel Dept","depId":"2"}
{"name":"Testing Dept","depId":"3"}

newpeople.json文件

{"name":"John","job number":"007","age":32,"gender":"male","depId":1,"salary":4000}
{"name":"Herry","job number":"008","age":20,"gender":"female","depId":2,"salary":4000}
{"name":"Jack","job number":"009","age":26,"gender":"male","depId":3,"salary":4000}

people.json文件

{"name":"Michael","job number":"001","age":33,"gender":"male","depId":1,"salary":3000}
{"name":"andy","job number":"002","age":30,"gender":"female","depId":2,"salary":4000}
{"name":"Justin","job number":"003","age":19,"gender":"male","depId":1,"salary":4000}
{"name":"John","job number":"004","age":32,"gender":"male","depId":1,"salary":4000}
{"name":"Herry","job number":"005","age":20,"gender":"female","depId":2,"salary":4000}
{"name":"Jack","job number":"006","age":26,"gender":"male","depId":3,"salary":4000}

package com.dt.spark.main.DataFrameLearn

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
/**
  *
  */
object DataFrameSQL {
  def main(args: Array[String]) {

    val conf = new SparkConf()
    conf.setAppName("test")
    conf.setMaster("local")

    val sc = new SparkContext(conf)

    //设置日志级别

    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark.sql").setLevel(Level.WARN)

    val sqlContext = new HiveContext(sc)

    //加载json文件

    val people = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/people.json")
    val newpeople = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/newpeople.json")
    val dept = sqlContext.read.json("./src/com/dt/spark/main/DataFrameLearn/srcFile/department.json")

    //==========================================
    /*
    表格形式显示DataFrame信息

    show():
            默认输出前20条
    show(n):
            可以设置参数
     */

    people.show()
    //    +---+-----+------+----------+-------+-------+
    //    |age|depId|gender|job number|   name|sarlary|
    //      +---+-----+------+----------+-------+-------+
    //    | 33|    1|  male|       001|Michael|   3000|
    //    | 30|    2|female|       002|   andy|   4000|
    //    | 19|    1|  male|       003| Justin|   4000|
    //    | 32|    1|  male|       004|   John|   4000|
    //    | 20|    2|female|       005|  Herry|   4000|
    //    | 26|    3|  male|       006|   Jack|   4000|
    //    +---+-----+------+----------+-------+-------+

    /*
      打印printSchema
    */
    people.printSchema()
    //    root
    //    |-- age: long (nullable = true)
    //    |-- depId: long (nullable = true)
    //    |-- gender: string (nullable = true)
    //    |-- job number: string (nullable = true)
    //    |-- name: string (nullable = true)
    //    |-- sarlary: long (nullable = true)

    //==========================================
    /*
    DataFrame基本信息的查询

    columns:已数组的形式返回列名字
    count:  返回总记录数
    take:   已数组的形式返回前n记录信息
    toJSON: 转换为JsonRDD
     */

    // people.columns
    people.columns.foreach(println(_))
    //    age
    //    depId
    //    gender
    //    job number
    //    name
    //    salary

    println(people.count())
    //6

    people.take(2).foreach(print(_))
    //[33,1,male,001,Michael,3000][30,2,female,002,andy,4000]

    people.toJSON.collect().foreach(println(_))

    //    {"age":33,"depId":1,"gender":"male","job number":"001","name":"Michael","salary":3000}
    //    {"age":30,"depId":2,"gender":"female","job number":"002","name":"andy","salary":4000}
    //    {"age":19,"depId":1,"gender":"male","job number":"003","name":"Justin","salary":4000}
    //    {"age":32,"depId":1,"gender":"male","job number":"004","name":"John","salary":4000}
    //    {"age":20,"depId":2,"gender":"female","job number":"005","name":"Herry","salary":4000}
    //    {"age":26,"depId":3,"gender":"male","job number":"006","name":"Jack","salary":4000}

   //==========================================
    /*
     DataFrame的条件查询

     filter:
     where:
    */

    println(people.filter("gender = 'male'").count())
    //4
    println(people.filter("age > 25").count())
    //4

    println(people.filter("age > 25").filter("gender = 'male'").count())
    // 3

    println(people.where("age > 25").count())
    //4

    //==========================================
    /*
    排序和分区排序

     */
    people.sort(people("age").asc).show

    //    +---+-----+------+----------+-------+------+
    //    |age|depId|gender|job number|   name|salary|
    //      +---+-----+------+----------+-------+------+
    //    | 19|    1|  male|       003| Justin|  4000|
    //    | 20|    2|female|       005|  Herry|  4000|
    //    | 26|    3|  male|       006|   Jack|  4000|
    //    | 30|    2|female|       002|   andy|  4000|
    //    | 32|    1|  male|       004|   John|  4000|
    //    | 33|    1|  male|       001|Michael|  3000|
    //    +---+-----+------+----------+-------+------+

    people.sortWithinPartitions("gender","age").show()
    //    +---+-----+------+----------+-------+------+
    //    |age|depId|gender|job number|   name|salary|
    //      +---+-----+------+----------+-------+------+
    //    | 20|    2|female|       005|  Herry|  4000|
    //    | 30|    2|female|       002|   andy|  4000|
    //    | 19|    1|  male|       003| Justin|  4000|
    //    | 26|    3|  male|       006|   Jack|  4000|
    //    | 32|    1|  male|       004|   John|  4000|
    //    | 33|    1|  male|       001|Michael|  3000|
    //    +---+-----+------+----------+-------+------+

    //==========================================
    /*
    增加列

     */

    //select only "name" column
    people.select("age").show()

    //    +---+
    //    |age|
    //    +---+
    //    | 33|
    //    | 30|
    //    | 19|
    //    | 32|
    //    | 20|
    //    | 26|
    //    +---+

    //select everybody,but increment the age by 1
    //选择同时定义
    people.select(people("age"), people("age") + 1).show()

    //      +---+---------+
    //      |age|(age + 1)|
    //      +---+---------+
    //      | 33|       34|
    //      | 30|       31|
    //      | 19|       20|
    //      | 32|       33|
    //      | 20|       21|
    //      | 26|       27|
    //      +---+---------+

    people.withColumn("level",people("age")/10).show
    //    +---+-----+------+----------+-------+------+-----+
    //    |age|depId|gender|job number|   name|salary|level|
    //    +---+-----+------+----------+-------+------+-----+
    //    | 33|    1|  male|       001|Michael|  3000|  3.3|
    //    | 30|    2|female|       002|   andy|  4000|  3.0|
    //    | 19|    1|  male|       003| Justin|  4000|  1.9|
    //    | 32|    1|  male|       004|   John|  4000|  3.2|
    //    | 20|    2|female|       005|  Herry|  4000|  2.0|
    //    | 26|    3|  male|       006|   Jack|  4000|  2.6|
    //    +---+-----+------+----------+-------+------+-----+


    //==========================================
    /*
    修改列名字

     */
    people.withColumnRenamed("job number","jobId").show()
//    +---+-----+------+-----+-------+------+
//    |age|depId|gender|jobId|   name|salary|
//    +---+-----+------+-----+-------+------+
//    | 33|    1|  male|  001|Michael|  3000|
//    | 30|    2|female|  002|   andy|  4000|
//    | 19|    1|  male|  003| Justin|  4000|
//    | 32|    1|  male|  004|   John|  4000|
//    | 20|    2|female|  005|  Herry|  4000|
//    | 26|    3|  male|  006|   Jack|  4000|
//    +---+-----+------+-----+-------+------+


    //==========================================
    /*
    unionAll操作

     */

    people.unionAll(newpeople).show()
    //
    //    +---+-----+------+----------+-------+------+
    //    |age|depId|gender|job number|   name|salary|
    //      +---+-----+------+----------+-------+------+
    //    | 33|    1|  male|       001|Michael|  3000|
    //    | 30|    2|female|       002|   andy|  4000|
    //    | 19|    1|  male|       003| Justin|  4000|
    //    | 32|    1|  male|       004|   John|  4000|
    //    | 20|    2|female|       005|  Herry|  4000|
    //    | 26|    3|  male|       006|   Jack|  4000|
    //    | 32|    1|  male|       007|   John|  4000|
    //    | 20|    2|female|       008|  Herry|  4000|
    //    | 26|    3|  male|       009|   Jack|  4000|
    //    +---+-----+------+----------+-------+------+


 	 
//==========================================
/*
聚合操作groupBy

 */
people.unionAll(newpeople).groupBy("name").count.show
//      +-------+-----+
//      |   name|count|
//      +-------+-----+
//      |   Jack|    2|
//      |   John|    2|
//      |Michael|    1|
//      | Justin|    1|
//      |  Herry|    2|
//      |   andy|    1|
//      +-------+-----+

val depAgg = people.groupBy("depId").agg(
  Map(
    "age"->"max",
    "gender"->"count"
  )
)
depAgg.show()
//+-----+--------+-------------+
//|depId|max(age)|count(gender)|
//+-----+--------+-------------+
//|    1|      33|            3|
//|    2|      30|            2|
//|    3|      26|            1|
//+-----+--------+-------------+
depAgg.toDF("depId","maxAge","countGender").show()
//    +-----+------+-----------+
//    |depId|maxAge|countGender|
//    +-----+------+-----------+
//    |    1|    33|          3|
//    |    2|    30|          2|
//    |    3|    26|          1|
//    +-----+------+-----------+


//==========================================
/*
去重操作distinct

 */
people.unionAll(newpeople).select("name").distinct().show()
//    +-------+
//    |   name|
//    +-------+
//    |   Jack|
//    |   John|
//    |Michael|
//    | Justin|
//    |  Herry|
//    |   andy|
//    +-------+

//==========================================
/*
交集和异或

 */
people.select("name").except(newpeople.select("name")).show()
//    +-------+
//    |   name|
//    +-------+
//    |   andy|
//    | Justin|
//    |Michael|
//    +-------+
people.select("name").intersect(newpeople.select("name")).show()
//    +-----+
//    | name|
//    +-----+
//    |Herry|
//    | John|
//    | Jack|
//    +-----+

//==========================================
/*
DataFrame间的join

 */
people.join(dept,people("depId")===dept("depId"),"outer").show()
//    +---+-----+------+----------+-------+------+-----+---------------+
//    |age|depId|gender|job number|   name|salary|depId|           name|
//      +---+-----+------+----------+-------+------+-----+---------------+
//    | 33|    1|  male|       001|Michael|  3000|    1|Develoment Dept|
//    | 19|    1|  male|       003| Justin|  4000|    1|Develoment Dept|
//    | 32|    1|  male|       004|   John|  4000|    1|Develoment Dept|
//    | 26|    3|  male|       006|   Jack|  4000|    3|   Testing Dept|
//    | 30|    2|female|       002|   andy|  4000|    2| Personnel Dept|
//    | 20|    2|female|       005|  Herry|  4000|    2| Personnel Dept|
//    +---+-----+------+----------+-------+------+-----+---------------+
    sc.stop()  }}


  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值