Spark SQL支持的外部数据源、内置函数

Spark SQL支持的外部数据源

hive

// Read a Hive table with Spark via the Hive metastore thrift endpoint.
    val spark: SparkSession = SparkSession.builder().appName("demo0")
      .master("local[*]")
      // Metastore service address; the metastore must be running on port 9083 of that host.
      .config("hive.metastore.uris", "thrift://192.168.232.211:9083")
      .enableHiveSupport() // required so spark.sql can see Hive tables
      .getOrCreate()
//    val frame: DataFrame = spark.sql("show databases")
//    frame.show()
    val frame: DataFrame = spark.sql("select * from toronto")
    frame.printSchema()
    frame.show()
    spark.stop() // release the SparkSession so the app does not leak resources

mysql

// Read a MySQL table with Spark over JDBC and run a built-in aggregate on it.
// Hive support / metastore configuration is not needed for a plain JDBC read,
// so the session is built without it.
val spark: SparkSession = SparkSession.builder().appName("demo0")
      .master("local[*]")
      .getOrCreate()
    val url = "jdbc:mysql://192.168.232.211:3306"
    val user = "root"
    val pwd = "ok"
    val properties = new java.util.Properties()
    properties.setProperty("user", user)
    properties.setProperty("password", pwd)
    // NOTE(review): "com.mysql.jdbc.Driver" is the legacy Connector/J 5.x class name;
    // with Connector/J 8+ the class is "com.mysql.cj.jdbc.Driver" — confirm the jar version.
    properties.setProperty("driver", "com.mysql.jdbc.Driver")
    // "test.stu" is the database-qualified table name (database "test", table "stu").
    val tableDF: DataFrame = spark.read.jdbc(url, table = "test.stu", properties)
    tableDF.printSchema()
    tableDF.show()
    import org.apache.spark.sql.functions._
    tableDF.agg(max("stu_id")).show()
    spark.stop() // release the SparkSession

json

// Read a JSON file into a DataFrame and query it with Spark SQL.
// The explicit SparkConf/SparkContext of the original is dropped: it was never
// used, and SparkSession.builder() creates (or reuses) the SparkContext itself.
val spark: SparkSession = SparkSession.builder().master("local[*]").appName("demo")
      .getOrCreate()
    import spark.implicits._ // enables the $"col" syntax used in the examples below
    // .json(path) already implies the json format, so .format("json") is redundant.
    val userDF: DataFrame = spark.read.json("in/user.json")
//    userDF.printSchema()
//    userDF.select("name").show()
    //userDF.select(userDF("name"),(userDF("age")+1).as("ageincrease")).filter($"ageincrease">22).show()
//    val countDF: DataFrame = userDF.groupBy("age").count()
//    val countDF1: DataFrame = countDF.withColumnRenamed("count","number")
//    countDF1.printSchema()
//    countDF1.show()
    // Register the DataFrame as a temp view so it can be queried with SQL.
    userDF.createOrReplaceTempView("user")
    val frame1: DataFrame = spark.sql("select name,age from user where name = 'lisi'")
    frame1.show()
    spark.stop() // release the SparkSession

内置函数

org.apache.spark.sql.functions._
（原文此处为 functions 包 API 截图，抓取时图片已丢失）

内置函数的使用

// Compute daily UV (distinct visitors per day) with the built-in countDistinct function.
val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("demo")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext

    // Simulated access log: each entry is "date,userId".
    val accessLog = Array(
      "2016-12-27,001",
      "2016-12-27,001",
      "2016-12-27,002",
      "2016-12-28,003",
      "2016-12-28,004",
      "2016-12-28,002",
      "2016-12-28,002",
      "2016-12-28,001"
    )
    // Parse each log line into Row(day, userid); note "001".toInt yields 1.
    val rdd: RDD[Row] = sc.parallelize(accessLog).map(line => {
      val fields: Array[String] = line.split(",")
      Row(fields(0), fields(1).toInt)
    })
    // Explicit schema for the Row-based RDD -> DataFrame conversion.
    val schema = StructType(Array(
      StructField("day", StringType),
      StructField("userid", IntegerType)
    ))
    val frame: DataFrame = spark.createDataFrame(rdd, schema)
    frame.printSchema()
    frame.show()
    import org.apache.spark.sql.functions._
    // UV = number of distinct users per day.
    frame.groupBy("day").agg(countDistinct("userid").as("uv"))
      .select("day", "uv").show()
    spark.stop() // release the SparkSession

（原文此处为两张运行结果截图，抓取时图片已丢失）

package function

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions._
// Immutable student record used by the aggregation examples in InnerFunction2.
case class Student(id:Integer,name:String,gender:String,age:Integer)
/**
 * Demonstrates Spark SQL built-in aggregate functions (avg/max/min/count)
 * and sorting on a small in-memory dataset of [[Student]] rows.
 */
object InnerFunction2 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("demo")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    // (The original also grabbed spark.sparkContext into an unused val; removed.)

    val students = Seq(
      Student(1, "a", "F", 10),
      Student(2, "b", "F", 20),
      Student(3, "c", "M", 19),
      Student(4, "d", "F", 22),
      Student(5, "e", "F", 35),
      Student(6, "f", "M", 18),
      Student(7, "g", "F", 23),
      Student(8, "h", "F", 22),
      Student(9, "i", "M", 15)
    )
    import spark.implicits._
    // Two equivalent ways to turn a Seq of case classes into a DataFrame.
    val df: DataFrame = students.toDF()
    val frame: DataFrame = spark.createDataFrame(students)
    df.printSchema()

    // Overall average age: select avg(age) from students.
    val avgAgeDF: DataFrame = frame.agg(avg("age"))
    avgAgeDF.show()

    // Average age per gender; the (col -> fn) form computes several aggregates at once.
    frame.groupBy("gender").agg(avg("age").as("avgage")).show()
    frame.groupBy("gender").agg("age" -> "avg", "age" -> "max", "age" -> "min").show()

    // Head count per (gender, age) pair.
    val frame2: DataFrame = frame.groupBy("gender", "age").count()
    frame2.printSchema()
    frame2.show()

    // Sorting: ascending by default, descending via $"col".desc.
    frame.sort("age").show()
    frame.sort($"age".desc).show()
    frame.groupBy("gender").agg(avg("age").as("avgage"))
      .sort($"avgage").show()

    spark.stop() // release the SparkSession so the app exits cleanly
  }
}

（原文此处为五张运行结果截图，抓取时图片已丢失）

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值