Spark Learning - SQL

Spark SQL basic operations

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

/**
  * Spark SQL basic operations
  */
case class Person(name: String, age: Long)

object BasicSparkSQL {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[1]")
      .setAppName(this.getClass.getSimpleName.filterNot(_.equals('$')))
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)

    // the entry point into all functionality in Spark SQL is the SparkSession class;
    // use SparkSession.builder() to create a basic SparkSession
    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .getOrCreate()

    // run each example in turn
    runBasicDataFrameExample(spark)
    runDatasetCreationExample(spark)
    runInferSchemaExample(spark)
    runProgrammaticSchemaExample(spark)
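    // release the session and the underlying SparkContext once all examples have run
    spark.stop()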
  }

  private def runBasicDataFrameExample(spark: SparkSession): Unit = {
    val path = "c:/data/people.json"
    val df = spark.read.json(path)
    import spark.implicits._
    // displays the content of the DataFrame
    df.show()
    /*
                    +----+-------+
                    | age|   name|
                    +----+-------+
                    |null|Michael|
                    |  30|   Andy|
                    |  19| Justin|
                    +----+-------+
     */
    // print the schema in a tree format
    df.printSchema()
    /*
                root
                 |-- age: long (nullable = true)
                 |-- name: string (nullable = true)
     */
    // select only the 'name' column
    df.select("name").show()
    /*
                  +-------+
                  |   name|
                  +-------+
                  |Michael|
                  |   Andy|
                  | Justin|
                  +-------+
     */
    // select everybody, but increment the age by 1
    df.select($"name", $"age" + 1).show()
    /*
              +-------+---------+
              |   name|(age + 1)|
              +-------+---------+
              |Michael|     null|
              |   Andy|       31|
              | Justin|       20|
              +-------+---------+
    */
    // select people older than 21
    df.filter($"age" > 21).show()
    /*
              +---+----+
              |age|name|
              +---+----+
              | 30|Andy|
              +---+----+
    */
    // count people by age
    df.groupBy("age").count().show()
    /*
                  +----+-----+
                  | age|count|
                  +----+-----+
                  |  19|    1|
                  |null|    1|
                  |  30|    1|
                  +----+-----+
     */
    // register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("people")

    // running SQL queries programmatically
    val sqlDF = spark.sql("SELECT * FROM people")
    sqlDF.show()
    /*
                +----+-------+
                | age|   name|
                +----+-------+
                |null|Michael|
                |  30|   Andy|
                |  19| Justin|
                +----+-------+
     */
    // register the DataFrame as a global temporary view
    df.createGlobalTempView("people")
    /*
              Temporary views in Spark SQL are session-scoped and will disappear
              if the session that created them terminates. If you want a temporary
              view that is shared among all sessions and kept alive until the Spark
              application terminates, you can create a global temporary view. A global
              temporary view is tied to the system preserved database global_temp,
              and we must use the qualified name to refer to it.
     */
    // Global temporary view is tied to a system preserved database
    spark.sql("SELECT * FROM global_temp.people").show()
    /*
                    +----+-------+
                    | age|   name|
                    +----+-------+
                    |null|Michael|
                    |  30|   Andy|
                    |  19| Justin|
                    +----+-------+
     */
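    // Because the global temporary view lives in the system database global_temp, it is
    // also visible to other sessions of the same application; as a small illustration,
    // a fresh session created with newSession() can query it in the same way
    spark.newSession().sql("SELECT * FROM global_temp.people").show()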
  }

  // creating Datasets
  private def runDatasetCreationExample(spark: SparkSession): Unit = {


    import spark.implicits._
    // Encoders are created for case classes
    val caseclassDS = Seq(Person("Andy", 32)).toDS()
    caseclassDS.show()
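    /*
              expected output (derived from the single Person created above):
                  +----+---+
                  |name|age|
                  +----+---+
                  |Andy| 32|
                  +----+---+
     */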
    // Encoders for most common types are automatically provided by importing spark.implicits._
    val primitiveDS = Seq(1, 2, 3).toDS()
    primitiveDS.map(_ + 1).collect() // Array(2,3,4)

    // DataFrames can be converted to a Dataset
    val path = "c:/data/people.json"
    val peopleDS = spark.read.json(path).as[Person]
    peopleDS.show()
  }

  /*
          Spark SQL supports two different methods for converting existing RDDs into Datasets.
          The first method uses reflection to infer the schema of an RDD that contains specific
          types of objects.
          The second method is through a programmatic interface that allows you to construct a schema
          and then apply it to an existing RDD. It lets you construct Datasets when the columns
          and their types are not known until runtime.
   */
  private def runInferSchemaExample(spark: SparkSession): Unit = {
    // for implicit conversion from RDDs to DataFrame
    import spark.implicits._
    // create an RDD of Person objects from a text file, then convert it to a DataFrame
    val path = "c:/data/people.txt"
    val peopleDF = spark.sparkContext
      .textFile(path)
      .map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people")

    // SQL statements can be run by using the sql method provided by Spark
    val teenagerDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
    // The columns of a row in the result can be accessed by field index
    teenagerDF.map(teenager => "Name:" + teenager(0)).show()
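    /*
              expected output, assuming people.txt holds the same three people and
              Justin (age 19) is the only teenager, as in the Array shown at the end
              of this method:
                  +-----------+
                  |      value|
                  +-----------+
                  |Name:Justin|
                  +-----------+
     */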
    // or by field name
    teenagerDF.map(teenager => "Name:" + teenager.getAs[String]("name")).show()
    // No pre-defined encoders for Dataset[Map[K,V]], define explicitly
    implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
    // Primitive types and case classes can be also defined as
    // implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder()

    // row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
    teenagerDF.map(teenager => teenager.getValuesMap[Any](List("name", "age"))).collect()
    // Array(Map("name" -> "Justin", "age" -> 19))
  }


  private def runProgrammaticSchemaExample(spark: SparkSession): Unit = {
    import spark.implicits._
    // create an RDD
    val path = "c:/data/people.txt"
    val peopleRDD = spark.sparkContext
      .textFile(path)
    // The schema is encoded in a string
    val schemaString = "name age"
    // Generate the schema based on the schema string
    val fields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)
    // Convert records of the RDD (people) to Rows
    val rowRDD = peopleRDD.map(_.split(","))
      .map(attributes => Row(attributes(0), attributes(1).trim))
    // apply the schema to the RDD
    val peopleDF = spark.createDataFrame(rowRDD, schema)

    // create a temporary view using the DataFrame
    peopleDF.createOrReplaceTempView("people")

    // SQL can be run over a temporary view created using the DataFrame
    val results = spark.sql("SELECT name FROM people")
    // the results of SQL queries are DataFrames and support all the normal RDD operations
    // the columns of a row in the result can be accessed by field index
    results.map(attributes => "Name:" + attributes(0)).show()

  }
}
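
For reference, the examples above read two small sample files. A minimal sketch of their contents, consistent with the outputs shown (these mirror the people.json and people.txt sample files bundled with the Spark distribution; the ages in people.txt other than Justin's are an assumption here):

c:/data/people.json
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

c:/data/people.txt
Michael, 29
Andy, 30
Justin, 19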