Spark Learning - SQL

Spark SQL basic operations

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

/**
  * Spark SQL basic operations
  */
case class Person(name: String, age: Long)

object BasicSparkSQL {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[1]")
      .setAppName(this.getClass.getSimpleName.filterNot(_.equals('$')))
    val sc = new SparkContext(conf)
    Logger.getRootLogger.setLevel(Level.WARN)

    // the entry point into all functionality in Spark SQL is the SparkSession class;
    // use SparkSession.builder() to create a basic SparkSession
    val spark = SparkSession
      .builder()
      .appName("Spark SQL basic example")
      .getOrCreate()

    // run each example in turn
    runBasicDataFrameExample(spark)
    runDatasetCreationExample(spark)
    runInferSchemaExample(spark)
    runProgrammaticSchemaExample(spark)
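    // release the session and the underlying SparkContext once all examples have run
    spark.stop()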
  }

  private def runBasicDataFrameExample(spark: SparkSession): Unit = {
    val path = "c:/data/people.json"
    val df = spark.read.json(path)
    import spark.implicits._
    // displays the content of the DataFrame
    df.show()
    /*
                    +----+-------+
                    | age|   name|
                    +----+-------+
                    |null|Michael|
                    |  30|   Andy|
                    |  19| Justin|
                    +----+-------+
     */
    // print the schema in a tree format
    df.printSchema()
    /*
                root
                 |-- age: long (nullable = true)
                 |-- name: string (nullable = true)
     */
    // select only the 'name' column
    df.select("name").show()
    /*
                  +-------+
                  |   name|
                  +-------+
                  |Michael|
                  |   Andy|
                  | Justin|
                  +-------+
     */
    // select everybody, but increment the age by 1
    df.select($"name", $"age" + 1).show()
    /*
              +-------+---------+
              |   name|(age + 1)|
              +-------+---------+
              |Michael|     null|
              |   Andy|       31|
              | Justin|       20|
              +-------+---------+
    */
    // select people older than 21
    df.filter($"age" > 21).show()
    /*
              +---+----+
              |age|name|
              +---+----+
              | 30|Andy|
              +---+----+
    */
    // count people by age
    df.groupBy("age").count().show()
    /*
                  +----+-----+
                  | age|count|
                  +----+-----+
                  |  19|    1|
                  |null|    1|
                  |  30|    1|
                  +----+-----+
     */
    // register the DataFrame as a SQL temporary view
    df.createOrReplaceTempView("people")

    // running SQL queries programmatically
    val sqlDF = spark.sql("SELECT * FROM people")
    sqlDF.show()
    /*
                +----+-------+
                | age|   name|
                +----+-------+
                |null|Michael|
                |  30|   Andy|
                |  19| Justin|
                +----+-------+
     */
    // register the DataFrame as a global temporary view
    df.createGlobalTempView("people")
    /*
              Temporary views in Spark SQL are session-scoped and will disappear
              if the session that created them terminates. If you want a temporary
              view that is shared among all sessions and kept alive until the Spark
              application terminates, you can create a global temporary view. A global
              temporary view is tied to the system preserved database global_temp,
              and we must use the qualified name to refer to it.
     */
    // Global temporary view is tied to a system preserved database
    spark.sql("SELECT * FROM global_temp.people").show()
    /*
                    +----+-------+
                    | age|   name|
                    +----+-------+
                    |null|Michael|
                    |  30|   Andy|
                    |  19| Justin|
                    +----+-------+
     */
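    // Because the global temporary view lives in the system database global_temp, it is
    // also visible to other sessions of the same application; as a small illustration,
    // a fresh session created with newSession() can query it in the same way
    spark.newSession().sql("SELECT * FROM global_temp.people").show()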
  }

  // creating Datasets
  private def runDatasetCreationExample(spark: SparkSession): Unit = {


    import spark.implicits._
    // Encoders are created for case classes
    val caseclassDS = Seq(Person("Andy", 32)).toDS()
    caseclassDS.show()
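    /*
              expected output (derived from the single Person created above):
                  +----+---+
                  |name|age|
                  +----+---+
                  |Andy| 32|
                  +----+---+
     */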
    // Encoders for most common types are automatically provided by importing spark.implicits._
    val primitiveDS = Seq(1, 2, 3).toDS()
    primitiveDS.map(_ + 1).collect() // Array(2,3,4)

    // DataFrames can be converted to a Dataset
    val path = "c:/data/people.json"
    val peopleDS = spark.read.json(path).as[Person]
    peopleDS.show()
  }

  /*
          Spark SQL supports two different methods for converting existing RDDs into Datasets.
          The first method uses reflection to infer the schema of an RDD that contains specific
          types of objects.
          The second method is through a programmatic interface that allows you to construct a schema
          and then apply it to an existing RDD. It lets you construct Datasets when the columns
          and their types are not known until runtime.
   */
  private def runInferSchemaExample(spark: SparkSession): Unit = {
    // for implicit conversion from RDDs to DataFrame
    import spark.implicits._
    // create an RDD of Person objects from a text file, then convert it to a DataFrame
    val path = "c:/data/people.txt"
    val peopleDF = spark.sparkContext
      .textFile(path)
      .map(_.split(","))
      .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
      .toDF()
    // Register the DataFrame as a temporary view
    peopleDF.createOrReplaceTempView("people")

    // SQL statements can be run by using the sql method provided by Spark
    val teenagerDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
    // The columns of a row in the result can be accessed by field index
    teenagerDF.map(teenager => "Name:" + teenager(0)).show()
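    /*
              expected output, assuming people.txt holds the same three people and
              Justin (age 19) is the only teenager, as in the Array shown at the end
              of this method:
                  +-----------+
                  |      value|
                  +-----------+
                  |Name:Justin|
                  +-----------+
     */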
    // or by field name
    teenagerDF.map(teenager => "Name:" + teenager.getAs[String]("name")).show()
    // No pre-defined encoders for Dataset[Map[K,V]], define explicitly
    implicit val mapEncoder = org.apache.spark.sql.Encoders.kryo[Map[String, Any]]
    // Primitive types and case classes can be also defined as
    // implicit val stringIntMapEncoder: Encoder[Map[String, Any]] = ExpressionEncoder()

    // row.getValuesMap[T] retrieves multiple columns at once into a Map[String, T]
    teenagerDF.map(teenager => teenager.getValuesMap[Any](List("name", "age"))).collect()
    // Array(Map("name" -> "Justin", "age" -> 19))
  }


  private def runProgrammaticSchemaExample(spark: SparkSession): Unit = {
    import spark.implicits._
    // create an RDD
    val path = "c:/data/people.txt"
    val peopleRDD = spark.sparkContext
      .textFile(path)
    // The schema is encoded in a string
    val schemaString = "name age"
    // Generate the schema based on the schema string
    val fields = schemaString.split(" ")
      .map(fieldName => StructField(fieldName, StringType, nullable = true))
    val schema = StructType(fields)
    // Convert records of the RDD (people) to Rows
    val rowRDD = peopleRDD.map(_.split(","))
      .map(attributes => Row(attributes(0), attributes(1).trim))
    // apply the schema to the RDD
    val peopleDF = spark.createDataFrame(rowRDD, schema)

    // create a temporary view using the DataFrame
    peopleDF.createOrReplaceTempView("people")

    // SQL can be run over a temporary view created using the DataFrame
    val results = spark.sql("SELECT name FROM people")
    // the results of SQL queries are DataFrames and support all the normal RDD operations
    // the columns of a row in the result can be accessed by field index
    results.map(attributes => "Name:" + attributes(0)).show()

  }
}
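
For reference, the examples above read two small sample files. A minimal sketch of their contents, consistent with the outputs shown (these mirror the people.json and people.txt sample files bundled with the Spark distribution; the ages in people.txt other than Justin's are an assumption here):

c:/data/people.json
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

c:/data/people.txt
Michael, 29
Andy, 30
Justin, 19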