1. Basic DataFrame operations

import org.apache.spark.sql.SparkSession

def main(args: Array[String]): Unit = {
  val spark = SparkSession.builder()
    .appName("test")
    .master("local[*]")
    .getOrCreate()
  import spark.implicits._

  val people = spark.read.format("json").load("people.json")

  people.show()
  // +----+-------+
  // | age|   name|
  // +----+-------+
  // |null|Michael|
  // |  30|   Andy|
  // |  19| Justin|
  // +----+-------+

  people.printSchema()
  // root
  //  |-- age: long (nullable = true)
  //  |-- name: string (nullable = true)

  people.select($"name").show()
  // +-------+
  // |   name|
  // +-------+
  // |Michael|
  // |   Andy|
  // | Justin|
  // +-------+

  // cast a column and keep its original name
  people.select($"name", $"age".cast("string").as("age")).printSchema()
  // root
  //  |-- name: string (nullable = true)
  //  |-- age: string (nullable = true)

  // column arithmetic; null propagates through the expression
  people.select($"name", ($"age" + 1).as("age")).show()
  // +-------+----+
  // |   name| age|
  // +-------+----+
  // |Michael|null|
  // |   Andy|  31|
  // | Justin|  20|
  // +-------+----+

  people.filter($"age" > 21).show()
  // +---+----+
  // |age|name|
  // +---+----+
  // | 30|Andy|
  // +---+----+

  people.groupBy("age").count().show()
  // +----+-----+
  // | age|count|
  // +----+-----+
  // |  19|    1|
  // |null|    1|
  // |  30|    1|
  // +----+-----+

  spark.stop()
}
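A few more everyday column operations on the same people DataFrame, as a minimal sketch (na.drop, sort, and selectExpr are standard DataFrame methods, not part of the original example):

people.na.drop().show()                             // drop rows that contain a null in any column
people.sort($"age".desc).show()                     // sort by age, largest first
people.selectExpr("name", "age + 1 as age").show()  // SQL expression syntax inside select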
2. Querying a DataFrame with SQL

val people = spark.read.format("json").load("people.json")
people.createOrReplaceTempView("tb")
spark.sql("select name, age from tb").show()
// +-------+----+
// |   name| age|
// +-------+----+
// |Michael|null|
// |   Andy|  30|
// | Justin|  19|
// +-------+----+
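A temp view created this way is scoped to the current SparkSession. If the view has to be shared across sessions, a global temp view can be used instead; it lives in the reserved global_temp database (a minimal sketch on the same DataFrame):

people.createGlobalTempView("tb_global")
spark.sql("select name, age from global_temp.tb_global").show()
// global temp views remain visible from a new session
spark.newSession().sql("select name, age from global_temp.tb_global").show()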
3. Creating a Dataset

// The case class must be declared outside the method (e.g. at top level)
// so that Spark can derive an encoder for it.
case class Person(name: String, age: Int)

val ccDs = Seq(Person("jason", 28), Person("dong", 27)).toDS()
ccDs.select("name").show()

val pDs = Seq(1, 2, 3).toDS()
pDs.map(_ + 1).show()
pDs.printSchema()
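An existing DataFrame can also be turned into a typed Dataset with as[...], which matches columns to case class fields by name. A sketch, assuming the people.json from section 1: age is cast to int to fit the Person field, and null ages are dropped first because Int cannot hold null.

val peopleDS = spark.read.format("json").load("people.json")
  .na.drop()                                      // Michael has a null age
  .select($"name", $"age".cast("int").as("age"))  // json infers age as long; case class wants Int
  .as[Person]
peopleDS.map(p => p.name + " is " + p.age).show()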
4. Inferring the schema via reflection

val spark = SparkSession.builder()
  .appName("test")
  .master("local[*]")
  .getOrCreate()
import spark.implicits._
val sc = spark.sparkContext

// createOrReplaceTempView returns Unit, so keep the DataFrame in its own val
val peopleDF = sc.textFile("people.txt")
  .map(_.split(",", -1))
  .map(arr => Person(arr(0).trim, arr(1).trim.toInt))
  .toDF()
peopleDF.cache()
peopleDF.createOrReplaceTempView("people")

val teenagerDF = spark.sql("select * from people where age between 13 and 15").cache()

// access a Row column by index ...
teenagerDF.map(t => "name :" + t(0)).show()
// +-------------+
// |        value|
// +-------------+
// |name :Michael|
// +-------------+

// ... or by field name
teenagerDF.map(t => "name:" + t.getAs[String]("name")).show()
// +------------+
// |       value|
// +------------+
// |name:Michael|
// +------------+

// No built-in encoder exists for Map[String, Any], so register a Kryo one
implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
teenagerDF.map(t => t.getValuesMap[Any](Seq("name", "age"))).collect().foreach(println)
// prints one Map per row, e.g. Map(name -> Michael, age -> ...)

spark.stop()
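Rather than pulling fields out of Row by position or name, the SQL result can be converted back into a typed Dataset with the same Person case class (a small sketch):

val teenagers = teenagerDF.as[Person]
teenagers.map(p => "name:" + p.name).show()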
5. Creating a DataFrame by programmatically specifying the schema

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val peopleRDD = sc.textFile("people.txt")
  .map(_.split(",", -1))
  .map(arr => Row(arr(0).trim, arr(1).trim))

val schemaString = "name age"
val structFields = schemaString.split("\\s+")
  .map(name => StructField(name, StringType, nullable = true))
val schema = StructType(structFields)

val peopleDF = spark.createDataFrame(peopleRDD, schema)
peopleDF.show()
// +-------+---+
// |   name|age|
// +-------+---+
// |Michael| 15|
// |   Andy| 30|
// | Justin| 19|
// +-------+---+
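Each StructField can also carry its own data type instead of treating every column as a string; the row values then need to be converted to match (a sketch with a typed age column):

import org.apache.spark.sql.types.IntegerType

val typedSchema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)
))
val typedRDD = sc.textFile("people.txt")
  .map(_.split(",", -1))
  .map(arr => Row(arr(0).trim, arr(1).trim.toInt))  // age converted to Int to match the schema
spark.createDataFrame(typedRDD, typedSchema).printSchema()
// root
//  |-- name: string (nullable = true)
//  |-- age: integer (nullable = true)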
6. Running SQL directly against a file

spark.sql("select name, age from json.`people.json`").show()
// +-------+----+
// |   name| age|
// +-------+----+
// |Michael|null|
// |   Andy|  30|
// | Justin|  19|
// +-------+----+
7. Schema merging

val squaresDF = spark.sparkContext.makeRDD(1 to 5).map(i => (i, i * i)).toDF("value", "square")
squaresDF.write.parquet("data/test_table/key=1")

val cubesDF = spark.sparkContext.makeRDD(6 to 10).map(i => (i, i * i * i)).toDF("value", "cube")
cubesDF.write.parquet("data/test_table/key=2")

// key=1/key=2 are picked up as a partition column; mergeSchema unions the two schemas
val mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table")
mergedDF.printSchema()
// root
//  |-- value: integer (nullable = true)
//  |-- square: integer (nullable = true)
//  |-- cube: integer (nullable = true)
//  |-- key: integer (nullable = true)

mergedDF.show()
// +-----+------+----+---+
// |value|square|cube|key|
// +-----+------+----+---+
// |    4|    16|null|  1|
// |    5|    25|null|  1|
// |    9|  null| 729|  2|
// |   10|  null|1000|  2|
// |    1|     1|null|  1|
// |    2|     4|null|  1|
// |    3|     9|null|  1|
// |    6|  null| 216|  2|
// |    7|  null| 343|  2|
// |    8|  null| 512|  2|
// +-----+------+----+---+
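Because merging schemas is relatively expensive, it is off by default; besides the per-read option, it can be enabled globally through the spark.sql.parquet.mergeSchema setting:

spark.conf.set("spark.sql.parquet.mergeSchema", "true")
val mergedDF2 = spark.read.parquet("data/test_table")  // merges without the per-read option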
8. String concatenation on a DataFrame

import org.apache.spark.sql.Row

val squaresDF = spark.sparkContext.makeRDD(1 to 5).map(i => (i, i * i)).toDF("value", "square")
squaresDF.createOrReplaceTempView("vs")
squaresDF.show()

// concatenate by pattern-matching each Row ...
squaresDF.map { case Row(value: Int, square: Int) => s"$value$square" }.toDF("vv").show()

// ... or with SQL's concat function
spark.sql("select concat(value, square) as vv from vs").show()
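The same thing can be written with the concat function from org.apache.spark.sql.functions, avoiding both the Row pattern match and the SQL string:

import org.apache.spark.sql.functions.concat

// int columns are implicitly cast to string, as in the SQL version above
squaresDF.select(concat($"value", $"square").as("vv")).show()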