Spark SQL API #3: SparkSession, DataFrame, Dataset, and RDD interoperation

SparkSession


import org.apache.spark.sql.{DataFrame, SparkSession}

object SparkSessionApp {

  /**
    * Entry point: creates a SparkSession and reads a text file as a DataFrame.
    */
  def main(args: Array[String]): Unit = {

    // Entry point for DataFrame/Dataset programming.
    // appName is required when running standalone; without it SparkConf
    // validation fails ("An application name must be set in your configuration").
    val spark: SparkSession = SparkSession.builder()
      .appName("SparkSessionApp")
      .master("local")
      .getOrCreate()

    // Read the input file; each line becomes a Row with a single "value" column.
    val df: DataFrame = spark.read.text("file:///Users/eric/Desktop/coding385/sparksql-train/data/input.txt")

    // TODO... business logic, implemented through the DataFrame/Dataset API
    df.printSchema()
    df.show()

    spark.stop()
  }
}

DataFrame

import spark.implicits._


// DataFrame API examples: each group of calls shows equivalent ways to express
// the same SQL statement (string column names vs. $"col" syntax vs. raw SQL).
val people: DataFrame = spark.read.json("file:///data/people.json")


    // TODO... the DF has two columns; keep only the name column ==> select name from people
people.select("name").show()
people.select($"name").show()

    // TODO...  select * from people where age > 21
people.filter($"age" > 21).show()
people.filter("age > 21").show()

    // TODO... select age, count(1) from people group by age
people.groupBy("age").count().show()

    // TODO... select name,age+10 from people
 people.select($"name", ($"age"+10).as("new_age")).show()


    // TODO... same query expressed as SQL against a temp view
   people.createOrReplaceTempView("people")
 spark.sql("select name from people where age > 21").show()

DataSet


import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object DatasetApp {

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().master("local").appName("DatasetApp").getOrCreate()
    import spark.implicits._

    // Build a typed Dataset from a Scala collection of case-class instances.
    val ds: Dataset[Person] = Seq(Person("PK","30")).toDS()
    ds.show()

    // Datasets also work over primitive element types.
    val primitiveDS: Dataset[Int] = Seq(1,2,3).toDS()
    primitiveDS.map(x => x+1).collect().foreach(println)

    // Convert an untyped DataFrame into a typed Dataset[Person].
    val peopleDF: DataFrame = spark.read.json("file:///data/people.json")
    val peopleDS: Dataset[Person] = peopleDF.as[Person]
    peopleDS.show(false)


    peopleDF.select("anme").show()     // deliberate typo "anme": DataFrame column errors only surface at runtime (AnalysisException) — NOTE(review): this aborts main before spark.stop() is reached
    peopleDS.map(x => x.name).show()  // with a typed Dataset the same kind of typo (e.g. x.anme) would be a compile-time error

    spark.stop()
  }

  // age kept as String deliberately to match the "30" literal above.
  case class Person(name: String, age: String)

}

InteroperatingRDD


import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object InteroperatingRDDApp {

  /**
    * Demonstrates the two ways of converting an RDD to a DataFrame:
    * schema inference via reflection, and a programmatically built schema.
    */
  def main(args: Array[String]): Unit = {

    // appName fixed: the original carried "DatasetApp" over from a copy-paste.
    val spark = SparkSession.builder().master("local").appName("InteroperatingRDDApp").getOrCreate()

    runInferSchema(spark)
    runProgrammaticSchema(spark)

    spark.stop()
  }

  /**
    * Approach 2: build the schema programmatically.
    * 1) wrap each input line in a generic Row
    * 2) describe the columns with a StructType
    * 3) combine data and schema with createDataFrame
    */
  def runProgrammaticSchema(spark: SparkSession): Unit = {

    // step1: parse "name,age" lines into Rows
    val peopleRDD: RDD[String] = spark.sparkContext.textFile("file:///data/people.txt")
    val peopleRowRDD: RDD[Row] = peopleRDD.map(_.split(","))
      .map(x => Row(x(0), x(1).trim.toInt))

    // step2: declare the schema explicitly
    val struct =
      StructType(
        StructField("name", StringType, nullable = true) ::
        StructField("age", IntegerType, nullable = false) :: Nil)

    // step3: combine data and schema
    val peopleDF: DataFrame = spark.createDataFrame(peopleRowRDD, struct)

    peopleDF.show()
  }

  /**
    * Approach 1: schema inference via reflection.
    * 1) define a case class
    * 2) map each line of the RDD into an instance of the case class, then toDF()
    */
  def runInferSchema(spark: SparkSession): Unit = {
    // needed for toDF() and the Encoder used by queryDF.map below
    import spark.implicits._

    val peopleRDD: RDD[String] = spark.sparkContext.textFile("file:///data/people.txt")

    // RDD[String] => RDD[People] => DataFrame
    val peopleDF: DataFrame = peopleRDD.map(_.split(","))
      .map(x => People(x(0), x(1).trim.toInt))
      .toDF()
    //peopleDF.show(false)

    peopleDF.createOrReplaceTempView("people")
    val queryDF: DataFrame = spark.sql("select name,age from people where age between 19 and 29")
    //queryDF.show()

    //queryDF.map(x => "Name:" + x(0)).show()  // access by index
    queryDF.map(x => "Name:" + x.getAs[String]("name")).show() // access by field name
  }

  case class People(name: String, age: Int)
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值