Spark Getting Started: Data Structures

1 SparkSQL Data Abstractions

DataFrame and DataSet are both built on top of RDDs under the hood.
DataFrame = RDD - generics + Schema (field names and types) + SQL operations + optimization
A DataFrame is a further encapsulation on top of RDDs that supports SQL operations.
A DataFrame is essentially a distributed table.

DataSet = DataFrame + generics
DataSet = RDD + Schema constraints (field names and types) + SQL operations + optimization
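
To make the contrast concrete, here is a minimal, self-contained sketch (the object name and the Person case class used here are illustrative, not taken from the project code) showing the same data as an RDD, a DataFrame, and a DataSet:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object AbstractionsSketch {
  case class Person(id: Int, name: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("AbstractionsSketch").master("local[*]").getOrCreate()
    import spark.implicits._

    // RDD[Person]: carries the Person generic type, but Spark knows nothing about its fields
    val rdd: RDD[Person] = spark.sparkContext.parallelize(Seq(Person(1, "zhangsan", 20), Person(2, "lisi", 29)))

    // DataFrame = Dataset[Row]: has a schema (field names + types) and supports SQL, but rows are untyped
    val df: DataFrame = rdd.toDF()
    df.printSchema()

    // DataSet[Person]: schema + the Person generic type, so fields are accessed in a type-safe way
    val ds: Dataset[Person] = rdd.toDS()
    ds.filter(_.age > 18).show()

    spark.stop()
  }
}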


2 RDD Shared Variables

package com.spark学习.RDD

import com.baidu.utils.LoggerTrait
import com.spark学习.SparkUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.util.LongAccumulator

import java.lang


/**
 * By default, when Spark runs a function in parallel as multiple tasks on different nodes of a cluster, it creates a copy of every variable used by that function for each task.
 * Sometimes, however, a variable needs to be shared across tasks, or between tasks and the driver program.
 * To meet this need, Spark provides two kinds of shared variables:
 *
 * 1. Broadcast variables
 * A broadcast variable caches a read-only copy of a variable in memory on every node, instead of shipping one copy per task.
 * 2. Accumulators
 * Accumulators support aggregation (such as counting or summing) across all nodes.
 *
 * Without a broadcast variable, the variable is shipped to every task.
 * With a broadcast variable, the variable is shipped once to every node.
 *
 * @author zhaoshuai11
 * @date 2022 /10/03
 */
object RDD共享变量 extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    // 1. Set up the SparkContext execution environment
    val spark = SparkUtils.getSparkSession("RDD的创建")
    val sc = SparkUtils.getSparkContext(spark)
    val lines: RDD[String] =
      sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/resources/helloworld.txt")

    val myCounter: LongAccumulator = sc.longAccumulator("myCounter")
    // Define the set of special characters
    val ruleList: List[String] = List(",", ".", "@", "!", "*", "%", "$", "#")
    // Broadcast the collection to every node as a broadcast variable
    val broadcast: Broadcast[List[String]] = sc.broadcast(ruleList)

    accumulators_1(lines, myCounter, broadcast)

  }

  /**
   * Accumulators
   * Accumulators support aggregation (such as counting or summing) across all nodes.
   */
  def accumulators_1(lines: RDD[String], myCounter: LongAccumulator, broadcast: Broadcast[List[String]]): Unit = {
    val result: RDD[(String, Int)] = lines.filter(StringUtils.isNoneBlank(_))
      .flatMap(_.split("\\s+"))
      .filter(ch => {
        // Read the broadcast value
        val list: List[String] = broadcast.value
        if (list.contains(ch)) {
          myCounter.add(1)
          false
        } else {
          true
        }
      }).map((_, 1))
      .reduceByKey(_ + _)
    result.foreach(println)
    println("***************")
    val value: lang.Long = myCounter.value
    println(value)
  }

  def accumulators_2(lines: RDD[String], myCounter: LongAccumulator, ruleList: List[String]): Unit = {
    val result: RDD[(String, Int)] = lines.filter(StringUtils.isNoneBlank(_))
      .flatMap(_.split("\\s+"))
      .filter(ch => {
        if (ruleList.contains(ch)) {
          myCounter.add(1)
          false
        } else {
          true
        }
      }).map((_, 1))
      .reduceByKey(_ + _)
    result.foreach(println)
    println("***************")
    val value: lang.Long = myCounter.value
    println(value)
  }
}


3 Basic RDD Operator APIs

package com.spark学习.RDD

import com.baidu.utils.LoggerTrait
import com.spark学习.SparkUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object RDD的基本算子 extends LoggerTrait {

  def main(args: Array[String]): Unit = {
    val spark = SparkUtils.getSparkSession("RDD的基本算子")
    val sc = SparkUtils.getSparkContext(spark)
    reduceByKey(sc)
  }

  def flatMap(sc: SparkContext): Unit = {
    val lines: RDD[String] =
      sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
    val result: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
      .flatMap(_.split(" "))
      .map((_, 1)) // operates on every single record
      .reduceByKey(_ + _)
    result.foreach(println)

    /**
     * (don't,2)
     * (Say,2)
     * (goodbye,2)
     * ... ... ...
     */
    result.saveAsTextFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld_count")
  }

  def mapPartitions(sc: SparkContext): Unit = {
    val lines =
      sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
    println(lines.getNumPartitions) // 2
    val result: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
      .flatMap(_.split(" "))
      .mapPartitions(iter => { // operates on each partition as a whole
        // open a connection here -- executed once per partition
        iter.map((_, 1)) // applied to every record in this partition
        // close the connection here
      }).reduceByKey(_ + _)
    result.foreach(println)
  }

  def reduceAPI(sc: SparkContext): Unit = {
    val rdd1: RDD[Int] = sc.parallelize(1 to 10)
    reduce_add_API(rdd1)
  }

  def reduce_add_API(rdd1: RDD[Int]): Unit = {
    // sum of the elements of rdd1
    println(rdd1.sum()) // 55
    println(rdd1.reduce(_ + _))
    println(rdd1.fold(0)(_ + _))
    println(rdd1.aggregate(0)(_ + _, _ + _))
  }

  def groupByKey(sc: SparkContext): Unit = {
    val lines: RDD[String] =
      sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
    val wordAndOne: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
      .flatMap(_.split(" "))
      .map((_, 1))
    val grouped: RDD[(String, Iterable[Int])] = wordAndOne.groupByKey()
    val result: RDD[(String, Int)] = grouped.mapValues(_.sum)
    result.foreach(println)
  }

  def reduceByKey(sc: SparkContext): Unit = {
    val lines: RDD[String] =
      sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
    val wordAndOne: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
      .flatMap(_.split(" "))
      .map((_, 1))
    val result2: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)
    result2.foreach(println)
  }
}


4 RDD Persistence and Caching

What problem does caching solve? It addresses the efficiency of frequently accessing hot data. In Spark development, computing or transforming certain RDDs can be expensive; if such an RDD will be used frequently later on, it can be persisted/cached so that subsequent uses do not recompute it, which improves program performance.

package com.spark学习.RDD

import com.baidu.utils.LoggerTrait
import com.spark学习.SparkUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.rdd.RDD

object RDD_cache extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark = SparkUtils.getSparkSession("RDD_cache")
    val sc = SparkUtils.getSparkContext(spark)
    val lines: RDD[String] =
      sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/resources/helloworld.txt")

    val result: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
      .flatMap(_.split(" "))
      .map((_, 1))
      .reduceByKey(_ + _)

    // result will be reused frequently and is relatively expensive to compute, so cache it to speed up subsequent accesses
    result.cache()
    //    result.persist()
    //    result.persist(StorageLevel.MEMORY_AND_DISK)

    /**
     * cache is implemented on top of persist:
     * def cache(): this.type = persist()
     * Persist this RDD with the default storage level (MEMORY_ONLY). persist defaults to memory only.
     * rdd.persist(StorageLevel.MEMORY_AND_DISK) // in practice, use memory + disk (disk is used only when memory is insufficient!)
     *
     */

    result.unpersist()
  }

}
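
As the comment in the listing notes, persist(StorageLevel.MEMORY_AND_DISK) is the more common choice in practice (disk is used only when memory is insufficient). A minimal sketch that reuses the result RDD from the listing above; the StorageLevel import is the only extra dependency:

import org.apache.spark.storage.StorageLevel

result.persist(StorageLevel.MEMORY_AND_DISK)
result.count()          // the first action materializes the cache
result.foreach(println) // later actions reuse the cached partitions
result.unpersist()      // release the cache once the RDD is no longer needed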

5 Loading Data into a DataFrame

package com.spark学习.RDD.RDDDFDS

import com.baidu.utils.LoggerTrait
import org.apache.spark.sql.{DataFrame, SparkSession}

object 加载数据 extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    // Create the environment
    val spark = SparkSession.builder().appName("Demo").master("local[*]").getOrCreate()
    // Load data
    val dfText: DataFrame =
      spark.read.text("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
    val dfJson: DataFrame =
      spark.read.json("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/people.json")
    val dfCSV: DataFrame =
      spark.read.csv("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/country.csv")

    dfText.show(10000, false)
    dfJson.show(10000, false)
    dfCSV.show(10000, false)
  }
}
package com.spark学习

import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.jdbc.utils.WriteUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

import java.util.Properties

object Data_load_upload extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark = SparkUtils.getSparkSession("Data_load_upload")
    save(spark)
  }

  def save(spark: SparkSession): Unit = {
    appendToLocalDb(spark)
  }

  def appendToLocalDb(spark: SparkSession): Unit = {
    import spark.implicits._
    val sc = spark.sparkContext
    val rdd: RDD[String] = sc.makeRDD(Seq("10002;{\"ID\":121,\"message\":{\"name\":\"Asher\",\"location\":[{\"county\":\"浦东\",\"city\":\"上海\"},{\"county\":\"西直门\",\"city\":\"北京\"}]}}; {\"fieldFill\":[{\"field\":\"productTitle\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":0,\"reference\":[],\"fieldType\":1},{\"field\":\"tags\",\"score\":1,\"problem\":\"卖点标签数量较少\",\"solution\":\"您可以通过卖点标签将产品特点优势用短语直观呈现,吸引用户关注,建议提交3个\",\"type\":0,\"reference\":[],\"fieldType\":1},{\"field\":\"desc\",\"score\":1,\"problem\":\"产品简介字数较少\",\"solution\":\"产品简介有利于帮助用户快速获取产品信息,建议字符数≥20字符,当前字符数18\",\"type\":0,\"reference\":[],\"fieldType\":1},{\"field\":\"content\",\"score\":1,\"problem\":\"产品详情无图片\",\"solution\":\"产品详情是对产品详细信息和特点的整体描述,图文并茂的产品详情页对用户转化有重要影响,建议字符数≥100字符,并至少提交1张详情图片,当前字符数283,图片数0\",\"type\":0,\"reference\":[],\"fieldType\":1}],\"fieldRepeat\":[{\"field\":\"desc\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":1,\"reference\":[],\"fieldType\":1},{\"field\":\"content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":1,\"reference\":[],\"fieldType\":1},{\"field\":\"image\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":1,\"reference\":[],\"fieldType\":2},{\"field\":\"productTitle\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":2,\"reference\":[],\"fieldType\":1},{\"field\":\"desc\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":2,\"reference\":[],\"fieldType\":1},{\"field\":\"content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":2,\"reference\":[],\"fieldType\":1},{\"field\":\"image\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":2,\"reference\":[],\"fieldType\":2}],\"pictureVision\":[{\"field\":\"\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":3,\"reference\":[],\"fieldType\":2},{\"field\":\"\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":4,\"reference\":[],\"fieldType\":2},{\"field\":\"content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":5,\"reference\":[],\"fieldType\":2}],\"textQuality\":[{\"field\":\"productTitle,desc,content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":6,\"reference\":[],\"fieldType\":1},{\"field\":\"productTitle,desc,content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":7,\"reference\":[],\"fieldType\":1},{\"field\":\"content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":7,\"reference\":[],\"fieldType\":1}]}"))
    val value: RDD[(String, String, String)] = rdd.map(item => {
      val strings: Array[String] = item.split(";")
      (strings(0), strings(1), strings(2))
    })
    val df: DataFrame = value.toDF("product_id", "image", "quality_detail")
    df.show(1000, false)
    WriteUtils.appendToLocalDb(spark, df, "tb_product")
  }

  def load(spark: SparkSession): Unit = {
    jdbc(spark)
  }

  def json(spark: SparkSession): Unit = {
    val path = "file:Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/people.json"
    val df = spark.read.json(path) // the most common form
    df.show(10000, false)
  }

  /**
   * Loading plain text parses it into a single column only
   *
   * @param spark
   */
  def txt(spark: SparkSession): Unit = {
    val path = "file:Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt"
    val df = spark.read.text(path)
    df.show(10000, false)
  }

  def csv(spark: SparkSession): Unit = {
    val path = "file:Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/country.csv"
    val df = spark.read.csv(path).toDF("id", "name", "simple_name")
    df.show(10000, false)
  }

  def jdbc(spark: SparkSession): Unit = {
    val url = "jdbc:mysql://localhost:3306/wzg"
    val table = "tb_product"
    val props = new Properties()
    props.put("user", "root");
    props.put("password", "root");
    val df = spark.read.jdbc(url, table, props)
    df.show(10000, false)
  }
}

6 Generating a DataFrame from an RDD

package com.spark学习.RDD.RDDDFDS

import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.Person
import com.spark学习.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession};

object RDD的生成 extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkUtils.getSparkSession("RDD的生成")
    val sc: SparkContext = SparkUtils.getSparkContext(spark)

    val rdd: RDD[Person] = generateRDD_CaseClass(sc)
    import spark.implicits._
    rdd.toDF()

    val rowRDD: RDD[Row] = generateRDD_defineScheme(sc)
    val scheme: StructType = StructType(
      List(StructField("id", IntegerType, false),
        StructField("name", StringType, false),
        StructField("age", IntegerType, false)))
    val df: DataFrame = spark.createDataFrame(rowRDD, scheme)

    val classRDD: RDD[(Int, String, Int)] = generateRDD_class(sc)
    classRDD.toDF()

  }

  /**
   * Specify the element type explicitly (tuple RDD)
   *
   * @param sc sc
   * @return RDD[(Int, String, Int)]
   */
  def generateRDD_class(sc: SparkContext): RDD[(Int, String, Int)] = {
    val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
    val classRDD: RDD[(Int, String, Int)] = lines.map(line => {
      val arr: Array[String] = line.split(" ")
      ((arr(0).toInt), (arr(1)), (arr(2).toInt))
    })
    classRDD
  }

  /**
   * Custom schema
   *
   * @param sc sc
   * @return RDD[Row]
   */
  def generateRDD_defineScheme(sc: SparkContext): RDD[Row] = {
    val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
    val rowRDD: RDD[Row] = lines.map(line => {
      val arr = line.split(" ")
      Row(arr(0).toInt, arr(1), arr(2).toInt)
    })
    rowRDD
  }

  /**
   * Case class
   *
   * @param sc sc
   * @return RDD[Person]
   */
  def generateRDD_CaseClass(sc: SparkContext): RDD[Person] = {
    val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
    val personRDD: RDD[Person] = lines.map(line => {
      val personArray: Array[String] = line.split(" ")
      Person(personArray(0).toInt, personArray(1), personArray(2).toInt)
    })
    personRDD
  }

}

7 Converting Between RDD, DataFrame, and DataSet


package com.spark学习.RDD.RDDDFDS

import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.Person
import com.spark学习.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object RDD_DF_DS extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkUtils.getSparkSession("RDD_DF_DS")
    val sc: SparkContext = SparkUtils.getSparkContext(spark)
    val personRDD: RDD[Person] = generateRDD(sc)
    val ds: Dataset[Person] = RDD2DS(personRDD, spark)
    val df: DataFrame = RDD2DF(personRDD, spark)
    DF2DS(df, spark)
    DS2DS(ds)
  }

  def DS2DS(ds: Dataset[Person]): DataFrame = {
    val df: DataFrame = ds.toDF()
    df
  }

  def DF2DS(df: DataFrame, spark: SparkSession): Dataset[Person] = {
    import spark.implicits._
    val ds: Dataset[Person] = df.as[Person]
    ds
  }

  def DS2RDD(ds: Dataset[Person]): Unit = {
    val rdd: RDD[Person] = ds.rdd
    rdd.foreach(println)

    /**
     * Person(1,zhangsan,20)
     * Person(4,zhaoliu,30)
     * Person(5,tianqi,35)
     * Person(5,kobe,40)
     * Person(2,lisi,29)
     * Person(3,wangwu,25)
     */
  }

  def DF2RDD(personDf: DataFrame): Unit = {
    val rdd: RDD[Row] = personDf.rdd
    rdd.foreach(println)

    /**
     * [1,zhangsan,20]
     * [4,zhaoliu,30]
     * [2,lisi,29]
     * [5,tianqi,35]
     * [3,wangwu,25]
     * [5,kobe,40]
     */
  }

  def RDD2DS(personRDD: RDD[Person], spark: SparkSession): Dataset[Person] = {
    import spark.implicits._
    val ds: Dataset[Person] = personRDD.toDS()
    ds
  }

  def RDD2DF(personRDD: RDD[Person], spark: SparkSession): DataFrame = {
    import spark.implicits._
    val df: DataFrame = personRDD.toDF()
    df
  }

  def generateRDD(sc: SparkContext): RDD[Person] = {
    val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
    val personRDD: RDD[Person] = lines.map(line => {
      val personArray: Array[String] = line.split(" ")
      Person(personArray(0).toInt, personArray(1), personArray(2).toInt)
    })
    personRDD
  }

}

8 wordCount

package com.baidu.sparkcodetest.sql

import com.baidu.utils.LoggerTrait
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Row, SparkSession}

object Demo_spark_wordcount extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    // Create the environment
    val spark = SparkSession.builder().appName("Demo_spark_wordcount").master("local[*]").getOrCreate()
    // Load data
    val ds: Dataset[String] =
      spark.read.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/resources/helloworld.txt")
    ds.show()
    /**
     * +------------------+
     * |             value|
     * +------------------+
     * |hello me you her !|
     * |      hello me you|
     * |          hello me|
     * |             hello|
     * |                 #|
     * |                 #|
     * |                 #|
     * |                  |
     * +------------------+
     */
    import spark.implicits._
    val words: Dataset[String] = ds.flatMap(_.split(" "))
    words.createOrReplaceTempView("tb_words")
    spark.sql(
      """
        |select value, count(*) as counts from tb_words group by  value order by counts desc
        |""".stripMargin).show()

    words.groupBy('value)
      .count()
      .orderBy('count.desc).show()

  }
}

9 JDBC

9.1 JDBC Utility Class

package com.spark学习.SQL.jdbc.utils

import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

import java.sql.DriverManager
import java.util.Properties

object WriteUtils {
  val wzgUser = "root"
  val wzgPassword = "root"
  val wzgUrl = "jdbc:mysql://localhost:3306/wzg"

  /**
   * Read a table from the local database via JDBC
   *
   * @return
   */
  def getLocalJDBC(spark: SparkSession, table: String) = {
    val props = new Properties()
    props.put("user", wzgUser);
    props.put("password", wzgPassword);
    val df = spark.read.jdbc(wzgUrl, table, props)
    df
  }

  /**
   * Append a DataFrame to the local DB
   */
  def appendToLocalDb(spark: SparkSession, df: DataFrame, tableName: String): Unit = {
    println("start output .")

    df.repartition(1)
      .write.mode(SaveMode.Append)
      .format("jdbc")
      .option(JDBCOptions.JDBC_DRIVER_CLASS, "com.mysql.jdbc.Driver")
      .option(JDBCOptions.JDBC_URL, wzgUrl)
      .option(JDBCOptions.JDBC_TXN_ISOLATION_LEVEL, "NONE")
      .option(JDBCOptions.JDBC_TABLE_NAME, tableName)
      .option("user", wzgUser)
      .option("password", wzgPassword)
      .save()
  }

  /**
   * Generic overwrite-mode write to an HDFS CSV file, using \t as the separator
   */
  def writeCommonCsvToHdfs(df: DataFrame, numPartitions: Int, header: Boolean, hdfsPath: String): Unit = {
    writeCommonCsvToHdfs(df, numPartitions, header, hdfsPath, "\t", "csv")
  }

  /**
   * Generic overwrite-mode write to an HDFS CSV file, using the given separator
   */
  def writeCommonCsvToHdfs(df: DataFrame, numPartitions: Int, header: Boolean, hdfsPath: String, separator: String): Unit = {
    writeCommonCsvToHdfs(df, numPartitions, header, hdfsPath, separator, "csv")
  }

  /**
   * Generic overwrite-mode write to an HDFS file, using the given separator and format
   */
  def writeCommonCsvToHdfs(df: DataFrame, numPartitions: Int, header: Boolean, hdfsPath: String, separator: String, format: String): Unit = {
    df.repartition(numPartitions)
      .write.mode("Overwrite")
      .format(format).option("delimiter", separator).option("sep", separator)
      .option("header", if (header) "true" else "false")
      .save(hdfsPath)
  }

  /**
   * Execute a DELETE statement
   *
   * @param spark spark
   * @param sql   sql
   */
  def deleteFromLocalDb(spark: SparkSession, sql: String): Unit = {
    println("start delete . " + sql)
    val conn = DriverManager.getConnection(wzgUrl, wzgUser, wzgPassword)
    val statement = conn.createStatement()
    statement.execute(sql)
    statement.close()
    conn.close()
  }

  /**
   * Execute an UPDATE statement
   *
   * @param spark spark
   * @param sql   sql
   */
  def updateFromFcDiDb(spark: SparkSession, sql: String): Unit = {
    println("start update . " + sql)
    val conn = DriverManager.getConnection(wzgUrl, wzgUser, wzgPassword)
    val statement = conn.createStatement()
    statement.execute(sql)
    statement.close()
    conn.close()
  }
}

9.2 JdbcOption Operations

package com.spark学习.SQL.jdbc

import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.Person
import com.spark学习.SQL.jdbc.utils.WriteUtils
import com.spark学习.SparkUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

object JdbcOption extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkUtils.getSparkSession("JdbcOption")


  }

  /**
   * delete
   *
   * @param spark
   */
  def deleteFromDb(spark: SparkSession): Unit = {
    val sql = "delete from tb_person where id=5"
    WriteUtils.deleteFromLocalDb(spark, sql)
  }

  /**
   * insert
   *
   * @param spark
   */
  def addDfToDb(spark: SparkSession): Unit = {
    val df: DataFrame = generateDF(spark)
    WriteUtils.appendToLocalDb(spark, df, "tb_person")
  }

  /**
   * select
   *
   * @param spark
   */
  def getFromDB(spark: SparkSession): DataFrame = {
    val table = "tb_person"
    val df: DataFrame = WriteUtils.getLocalJDBC(spark, table)
    df
  }

  def generateDF(spark: SparkSession): DataFrame = {
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
    val personRDD: RDD[Person] = lines.map(line => {
      val personArray: Array[String] = line.split(" ")
      Person(personArray(0).toInt, personArray(1).toString, personArray(2).toInt)
    })
    import spark.implicits._
    val df: DataFrame = personRDD.toDF()
    df
  }
}

9.3 CRUD: Action Operations on a DataFrame

package com.spark学习.SQL.jdbc

import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.jdbc.JdbcOption.generateDF
import com.spark学习.SparkUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

import java.util

object crud_DataFrame对象上Action操作 extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkUtils.getSparkSession("crud")
    val df: DataFrame = generateDF(spark)
    first_head_take_takeAsList(df)

  }

  /**
   * Fetch a number of rows:
   * (1) first returns the first row
   * (2) head returns the first row; head(n: Int) returns the first n rows
   * (3) take(n: Int) returns the first n rows
   * (4) takeAsList(n: Int) returns the first n rows as a List
   * take and takeAsList bring the data back to the driver, so mind the data volume when using them to avoid an OutOfMemoryError on the driver.
   *
   * @param df df
   */
  def first_head_take_takeAsList(df: DataFrame): Unit = {
    val firstData: String = df.first().mkString(",")
    // 1,zhangsan,20
    val headRows: Array[Row] = df.head(2)
    val headArrayString: String = headRows.mkString(",")
    // [1,zhangsan,20],[2,lisi,29]
    val headString: String = headRows.map(row => row.mkString(",")).mkString(",")
    // 1,zhangsan,20,2,lisi,29
  }

  /**
   * Get summary statistics for the specified fields
   *
   * This method takes one or more String field names and returns another DataFrame containing statistics of numeric fields, such as count, mean, stddev, min, and max.
   *
   * @param df df
   */
  def describe(df: DataFrame): Unit = {
    val summaryDf: DataFrame = df.describe("age")
    summaryDf.show(1, false)

    /**
     * +-------+---+
     * |summary|age|
     * +-------+---+
     * |count  |6  |
     * +-------+---+
     */
    summaryDf.select("age").where("summary = 'count'").show()

    /**
     * +---+
     * |age|
     * +---+
     * |  6|
     * +---+
     */
  }

  /**
   * Like collect, but the result is returned as a List instead of an Array
   *
   * @param df df
   */
  def collectAsList(df: DataFrame): Unit = {
    val rows: util.List[Row] = df.collectAsList()
    rows.forEach(row => {
      println(row)
      // [1,zhangsan,20]
      val headers: String = row.schema.mkString(",")
      // StructField(id,IntegerType,false),StructField(name,StringType,true),StructField(age,IntegerType,false)
      val datas: String = row.mkString(",")
      // 1,zhangsan,20
    })
  }

  /**
   * collect fetches all the data in the DataFrame and returns it as an Array
   *
   * @param df df
   */
  def collect(df: DataFrame): Unit = {
    val rows: Array[Row] = df.collect()
    for (elem <- rows) {
      val schema: StructType = elem.schema
      // StructType(StructField(id,IntegerType,false), StructField(name,StringType,true), StructField(age,IntegerType,false))
      val names: Array[String] = schema.names
      val headers: String = names.mkString(",")
      // id,name,age
      val datas: String = elem.mkString(",")
      // 1,zhangsan,20
    }
  }

  /**
   * Displays the DataFrame's data as a table, similar to select * from spark_sql_test
   *
   * @param df df
   */
  def show(df: DataFrame): Unit = {
    /**
     * The first argument is the number of rows to display; the second controls whether long strings are truncated to at most 20 characters (truncation defaults to true)
     */
    df.show(2, false)
  }
}

9.4 CRUD: Conditional Queries and Joins on a DataFrame

package com.spark学习.SQL.jdbc

import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.jdbc.JdbcOption.getFromDB
import com.spark学习.SparkUtils
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object crud_DataFrame对象上的条件查询和join extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkUtils.getSparkSession("crud")
    val df: DataFrame = getFromDB(spark)
    explode_fun(df)
  }

  /**
   * Explode: split one row into multiple rows, i.e. turn a column into rows
   *
   * @param df df
   */
  def explode_fun(df: DataFrame): Unit = {
    df.explode("name", "name_") {
      item: String => item.split(" ")
    }.select("name_").show(1000, false)
  }
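
  /**
   * Note: DataFrame.explode has been deprecated since Spark 2.0. A minimal equivalent sketch
   * (assuming the same name column) that uses the explode and split functions instead:
   */
  def explode_fun_v2(df: DataFrame): Unit = {
    import org.apache.spark.sql.functions.{explode, split}
    // split name on spaces into an array, then explode the array into one row per element
    df.select(explode(split(df.col("name"), " ")).as("name_")).show(1000, false)
  }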

  /**
   * withColumn(colName: String, col: Column) adds a column named colName to the DataFrame; if colName already exists, the existing column is overwritten.
   *
   * @param df df
   */
  def withColumn(df: DataFrame): Unit = {
    df.withColumn("new_id", df.col("id")).show()
  }

  /**
   * withColumnRenamed: renames the specified field of the DataFrame.
   * If the field does not exist, nothing happens. The example below renames id, name, and age to person_id, person_name, and person_age.
   *
   * @param df df
   */
  def withColumnRenamed(df: DataFrame): Unit = {
    df.withColumnRenamed("id", "person_id")
      .withColumnRenamed("name", "person_name")
      .withColumnRenamed("age", "person_age").show(1000, false)
  }

  /**
   * except: returns the records that exist in one DataFrame but not in the other
   *
   * @param df df
   */
  def except(df: DataFrame): Unit = {
    df.except(df.limit(1)).show(1000, false)
  }

  /**
   * intersect returns the records that the two DataFrames have in common
   *
   * @param df df
   */
  def intersect(df: DataFrame): Unit = {
    df.intersect(df.limit(1)).show(1000, false)
  }

  /**
   * stat computes statistics over the specified fields (variance, covariance, etc.) and returns a DataFrameStatFunctions object.
   * The code below reports values of the name column whose frequency is above 30%.
   * For example, for a column containing "a, b, a, c, d, b", both a and b have frequency 2/6, which is greater than 0.3.
   *
   * @param df df
   */
  def stat(df: DataFrame): Unit = {
    val freqDf: DataFrame = df.stat.freqItems(Seq("name"), 0.3)
    val headers: String = freqDf.schema.names.mkString(",") // name_freqItems
    val row: Row = freqDf.head()
    // [WrappedArray(zhangsan, zhaoliu)]
  }

  /**
   * join is one of the most heavily used operations in SQL, and DataFrame offers the same capability
   * through six overloaded join methods.
   *
   */
  def join(df: DataFrame): Unit = {
    /**
     * Cartesian product
     */
    df.join(df).show(1000, false)

    /**
     * Join using a single column
     */
    df.join(df, "name").show(1000, false)

    /**
     * Join using two columns
     */
    df.join(df, Seq("name", "id")).show(1000, false)

    /**
     * Joins between two DataFrames support inner, outer, left_outer, right_outer, and leftsemi.
     * With the using form above, a third String argument can be passed to specify the join type.
     */
    df.join(df, Seq("name"), "inner").show(1000, false)

    /**
     * Join with Column expressions,
     * for specifying join columns flexibly without the using form
     */
    df.join(df, df.col("id") === df.col("id"), "inner").show(1000, false)

    /**
     * +---+--------+---+---+--------+---+
     * |id |name    |age|id |name    |age|
     * +---+--------+---+---+--------+---+
     * |1  |zhangsan|20 |1  |zhangsan|20 |
     * |3  |wangwu  |25 |3  |wangwu  |25 |
     * |5  |zhangsan|35 |5  |zhangsan|35 |
     * |4  |zhaoliu |30 |4  |zhaoliu |30 |
     * |2  |lisi    |29 |2  |lisi    |29 |
     * +---+--------+---+---+--------+---+
     *
     */
  }

  /**
   * union / unionAll: combines two DataFrames,
   * similar to UNION ALL in SQL.
   *
   * @param df df
   */
  def union(df: DataFrame): Unit = {
    df.union(df.limit(1)).show()
  }

  /**
   * Aggregations use the agg method, which has several overloads and is usually combined with groupBy.
   * The example below shows the simplest, most direct form: the max of id and the sum of age.
   *
   * @param df df
   */
  def agg(df: DataFrame): Unit = {
    df.agg("id" -> "max", "age" -> "sum").show(false)
  }

  /**
   * Deduplicate by the given fields; similar to select distinct a, b
   *
   */
  def dropDuplicates(df: DataFrame): Unit = {
    df.dropDuplicates(Seq("name")).show()
  }

  /**
   * Returns the distinct Rows of the DataFrame; equivalent to dropDuplicates() without specifying fields.
   *
   * @param df df
   */
  def distinct(df: DataFrame): Unit = {
    df.distinct().show()
  }

  /**
   * groupBy: group by the given fields.
   * groupBy can be called with String field names or with Column objects.
   * max(colNames: String*) returns the maximum of the specified (or all) numeric fields in each group; numeric fields only
   * min(colNames: String*) returns the minimum of the specified (or all) numeric fields in each group; numeric fields only
   * mean(colNames: String*) returns the average of the specified (or all) numeric fields in each group; numeric fields only
   * sum(colNames: String*) returns the sum of the specified (or all) numeric fields in each group; numeric fields only
   * count() returns the number of elements in each group
   *
   * @param df df
   */
  def group_by(df: DataFrame): Unit = {
    val group_by_df: DataFrame = df.groupBy(df.col("name")).count()
    group_by_df.select(group_by_df.col("name"), group_by_df.col("count").as("name_counts")).show()
  }

  /**
   * orderBy and sort: sort by the given fields, ascending by default.
   * Prefix a column with - (or use .desc) for descending order. sort is used the same way as orderBy.
   *
   * @param df df
   */
  def order_by(df: DataFrame): Unit = {
    /**
     * Variant 1
     */
    df.orderBy(df.col("age").desc).show(false)

    /**
     * Variant 2
     */
    df.orderBy(-df.col("age")).show(false)
  }

  /**
   * limit returns the first n rows as a new DataFrame. Unlike take and head, limit is not an action.
   *
   * @param df df
   */
  def limit(df: DataFrame): Unit = {
    df.limit(3).show(false)
  }

  /**
   * Returns a new DataFrame without the dropped field; only one field can be dropped at a time.
   * The source DataFrame is not modified.
   *
   * @param df df
   */
  def drop(df: DataFrame): Unit = {
    val d_df: DataFrame = df.drop("id")
    d_df.show()
    df.show()
  }

  /**
   * Retrieves a single field; the result is a Column
   *
   * @param df df
   */
  def apply(df: DataFrame): Unit = {
    df.apply("id").as("person_id")
    df.select(df.apply("id").as("person_id"),
      df.apply("name").as("person_name"),
      df.apply("age").as("person_age")).show()
  }

  /**
   * Get the specified field.
   * Only one field at a time; the result is a Column
   *
   * @param df df
   */
  def col(df: DataFrame): Unit = {
    val person_id = df.col("id").as("person_id")
    val person_age = df.col("age").+(1000).as("person_age")
    val person_name = df.col("name").as("person_name")
    df.select(person_id, person_name, person_age).show()
  }

  /**
   * Selects the given String field names and returns the result as a DataFrame
   *
   * @param df df
   */
  def select(df: DataFrame): Unit = {
    df.select("id", "name").show()
  }

  /**
   * Allows special handling of the selected fields,
   * e.g. calling UDFs on them or assigning aliases. Takes String expressions and returns a DataFrame
   *
   */
  def selectExpr(df: DataFrame): Unit = {
    df.selectExpr("id as person_id", "name as person_name", "age as person_age").show()
  }

  /**
   * Equivalent to the condition after the where keyword in SQL.
   * Takes a filter expression (and / or are allowed) and returns a DataFrame
   *
   */
  def where(df: DataFrame): Unit = {
    df.where("id = 1 or name = 'lisi'").show()

    /**
     * Filter by the given fields:
     * takes a filter expression and returns a DataFrame; conditions are written the same way as in where
     */
    df.filter("id = 1 or name = 'lisi'").show()
  }

}

10 SparkSQL Exercises: Student Scores

package com.spark学习.SQL.jdbc.SparkSQL练习题_学生成绩

import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.jdbc.utils.WriteUtils
import com.spark学习.SparkUtils
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object demo extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkUtils.getSparkSession("demo")
    val courseDf: DataFrame = WriteUtils.getLocalJDBC(spark, "tb_Course")
    val studentDf: DataFrame = WriteUtils.getLocalJDBC(spark, "tb_Student")
    val teacherDf: DataFrame = WriteUtils.getLocalJDBC(spark, "tb_Teacher")
    val scoreDf: DataFrame = WriteUtils.getLocalJDBC(spark, "tb_Score")
    fun21(courseDf, scoreDf, teacherDf, studentDf, spark)
    //    fun9(scoreDf, spark)
  }

  /**
   * Exercise 20: top-3 score records per course (ties share a rank and leave gaps)
   *
   */
  def fun21(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.select($"s_courseId", $"s_stuId", $"s_score",
      rank().over(Window.partitionBy($"s_courseId")
        .orderBy($"s_score".desc)).as("rank"))
      .filter($"rank".leq(3))
      .show()

    /**
     * +----------+-------+-------+----+
     * |s_courseId|s_stuId|s_score|rank|
     * +----------+-------+-------+----+
     * |         1|      1|     80|   1|
     * |         1|      3|     80|   1|
     * |         1|      5|     76|   3|
     * |         3|      1|     99|   1|
     * |         3|      7|     98|   2|
     * |         3|      2|     80|   3|
     * |         3|      3|     80|   3|
     * |         2|      1|     90|   1|
     * |         2|      7|     89|   2|
     * |         2|      5|     87|   3|
     * +----------+-------+-------+----+
     */
  }

  /**
   * Exercise 19: top-3 score records per course (ties share a rank, no gaps)
   *
   */
  def fun20(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.select($"s_courseId", $"s_stuId", $"s_score",
      dense_rank().over(Window.partitionBy($"s_courseId")
        .orderBy($"s_score".desc)).as("rank"))
      .filter($"rank".leq(3))
      .show()

    /**
     * +----------+-------+-------+----+
     * |s_courseId|s_stuId|s_score|rank|
     * +----------+-------+-------+----+
     * |         1|      1|     80|   1|
     * |         1|      3|     80|   1|
     * |         1|      5|     76|   2|
     * |         1|      2|     70|   3|
     * |         3|      1|     99|   1|
     * |         3|      7|     98|   2|
     * |         3|      2|     80|   3|
     * |         3|      3|     80|   3|
     * |         2|      1|     90|   1|
     * |         2|      7|     89|   2|
     * |         2|      5|     87|   3|
     * +----------+-------+-------+----+
     *
     */
  }

  /**
   * Exercise 18: top-3 score records per course (ties not considered)
   *
   */
  def fun19(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.select($"s_courseId", $"s_stuId", $"s_score",
      row_number().over(Window.partitionBy($"s_courseId")
        .orderBy($"s_score".desc)).as("rank"))
      .filter($"rank".leq(3))
      .show()

    /**
     * +----------+-------+-------+----+
     * |s_courseId|s_stuId|s_score|rank|
     * +----------+-------+-------+----+
     * |         1|      1|     80|   1|
     * |         1|      3|     80|   2|
     * |         1|      5|     76|   3|
     * |         3|      1|     99|   1|
     * |         3|      7|     98|   2|
     * |         3|      2|     80|   3|
     * |         2|      1|     90|   1|
     * |         2|      7|     89|   2|
     * |         2|      5|     87|   3|
     * +----------+-------+-------+----+
     */

  }

  /**
   * Exercise 17: compute each student's total score and rank the students, leaving rank gaps when totals tie
   *
   */
  def fun18(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.groupBy('s_stuId).agg(sum('s_score).as("totalScore"))
      .select('s_stuId, 'totalScore,
        rank().over(Window.orderBy('totalScore.desc)).alias("rank")).show()

  }

  /**
   * Exercise 16: sort scores within each course and show the rank; tied scores share a rank and leave gaps
   * rank().over(Window.partitionBy('s_courseId)
   *
   */
  def fun17(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.select('s_courseId, 's_stuId, 's_score,
      rank().over(Window.partitionBy('s_courseId)
        .orderBy('s_score.desc)).alias("rank")).show(1000, false)
  }

  /**
   * Exercise 15: sort scores within each course and show the rank; tied scores share a rank and ranks stay consecutive
   * dense_rank().over(Window.partitionBy('s_courseId)
   *
   */
  def fun16(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.select('s_courseId, 's_stuId, 's_score,
      dense_rank().over(Window.partitionBy('s_courseId)
        .orderBy('s_score.desc)).alias("rank")).show(1000, false)
  }

  /**
   * Exercise 14: output each course id and its enrollment count, sorted by count descending and by course id ascending on ties
   *
   */
  def fun15(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.groupBy("s_courseId").agg(count('s_courseId).alias("courseIdCounts"))
      .orderBy('courseIdCounts.desc, 's_courseId).show()
  }

  /**
   * Exercise 13: highest, lowest, and average score of each course
   *
   */
  def fun14(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    scoreDf.groupBy("s_courseId").agg(max("s_score"), min("s_score"), mean("s_score")).show()
  }

  /**
   * Exercise 12: show every student's scores for all courses together with their average score, ordered by average score from high to low
   *
   */
  def fun13(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.groupBy("s_stuId").agg(mean('s_score).alias("score_mean"))
      .join(scoreDf, "s_stuId")
      .orderBy('score_mean.desc)
      .select("s_stuId", "s_courseId", "s_score", "score_mean")
      .show()
  }

  /**
   * Exercise 11: students whose score in course "01" is below 60, ordered by score descending
   *
   */
  def fun12(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.where("s_courseId = 1 and s_score < 60").dropDuplicates("s_stuId")
      .join(studentDf, 'stu_stuId === 's_stuId)
      .orderBy('s_score.desc)
      .select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender", "s_score").show(1000, false)

  }

  /**
   * Exercise 10: student id, name, and average score of students who failed two or more courses
   *
   */
  def fun11(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.filter('s_score < 60)
      .groupBy("s_stuId")
      .agg(count('s_score).alias("score_counts"), mean('s_score).alias("score_mean"))
      .filter('score_counts >= 2)
      .join(studentDf, 's_stuId === 'stu_stuId)
      .select("stu_stuId", "stu_stuName", "score_mean").show(1000, false)
  }

  /**
   * Exercise 9: names of students who have never taken any course taught by teacher "zhangsan"
   *
   */
  def fun10(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    val courseIdStr: String = teacherDf.where("t_teacherName = 'zhangsan'")
      .join(courseDf, 'c_teacherId === 't_teacherId).select("c_courseId").collect()
      .map(item => item.mkString(",")).mkString(",")

    val stuIdStr: String = scoreDf.where(s"s_courseId in ($courseIdStr)")
      .dropDuplicates("s_stuId").select("s_stuId").collect()
      .map(item => item.mkString(",")).mkString(",")

    scoreDf.where(s"s_stuId not in ($stuIdStr)").dropDuplicates("s_stuId")
      .join(studentDf, 'stu_stuId === 's_stuId)
      .select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender").show(1000, false)


  }

  /**
   * Bonus: pivot rows to columns (missing values are left as null, not filled)
   *
   */
  def fun9(scoreDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.show()
    scoreDf.groupBy("s_stuId").pivot($"s_courseId").sum("s_score").orderBy("s_stuId").show()

    /**
     * +-------+----------+-------+
     * |s_stuId|s_courseId|s_score|
     * +-------+----------+-------+
     * |      1|         1|     80|
     * |      1|         2|     90|
     * |      1|         3|     99|
     * |      2|         1|     70|
     * |      2|         2|     60|
     * |      2|         3|     80|
     * ...   ...  ...
     * +-------+----------+-------+
     *
     */
    /**
     * +-------+----+----+----+
     * |s_stuId|   1|   2|   3|
     * +-------+----+----+----+
     * |      1|  80|  90|  99|
     * ...  ...  ...
     * |      7|null|  89|  98|
     * +-------+----+----+----+
     *
     */
  }

  /**
   * Exercise 8: other students who take exactly the same courses as student "01"
   *
   */
  def fun8(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    val courseIdCountArray: Array[String] = scoreDf.filter('s_stuId === 1).select("s_courseId").collect().map(item => item.mkString(","))
    scoreDf.filter(s"s_courseId in (${courseIdCountArray.mkString(",")}) and s_stuId != 1")
      .groupBy("s_stuId").agg(count("s_courseId").alias("sc_courseIdCounts"))
      .filter(s"sc_courseIdCounts = ${courseIdCountArray.length}")
      .select("s_stuId").distinct()
      .join(studentDf, 'stu_stuId === 's_stuId)
      .select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender")
      .orderBy('stu_stuId)
      .show(1000, false)
  }

  /**
   * Exercise 7: students who share at least one course with student "01"
   *
   */
  def fun7(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.where(s"s_courseId in (${scoreDf.filter('s_stuId === 1).select("s_courseId").collect().map(item => item.mkString(",")).mkString(",")}) and s_stuId != 1")
      .select("s_stuId").distinct()
      .join(studentDf, 'stu_stuId === 's_stuId)
      .select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender")
      .orderBy('stu_stuId)
      .show(1000, false)
  }

  /**
   * Exercise 6: students who have not taken all of the courses
   *
   */
  def fun6(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.groupBy("s_stuId").agg(count("s_courseId").alias("s_courseIdCounts"))
      .filter('s_courseIdCounts < courseDf.select("c_courseId").distinct().count())
      .join(studentDf, 'stu_stuId === 's_stuId)
      .select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender").orderBy('stu_stuId)
      .show(1000, false)


  }

  /**
   * Exercise 5: students who have taken a course taught by teacher "zhangsan"
   *
   */
  def fun5(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    fun5_2(courseDf, scoreDf, teacherDf, studentDf, spark)
  }

  def fun5_2(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    teacherDf.where("t_teacherName='zhangsan'")
      .join(courseDf, 't_teacherId === 'c_teacherId)
      .join(scoreDf, 's_courseId === 'c_courseId)
      .join(studentDf, 's_stuId === 'stu_stuId)
      .select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender").distinct()
      .orderBy(studentDf.col("stu_stuId")).show(1000, false)

  }

  def fun5_1(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
    val teacher_id = teacherDf.select("t_teacherId").where("t_teacherName='zhangsan'").head().mkString(",")
    val c_courseId = courseDf.where(s"c_teacherId=$teacher_id").select("c_courseId").head().mkString(",")
    val rows: Array[Row] = scoreDf.where(s"s_courseId = $c_courseId").select("s_stuId").collect()
    val stuIds: String = rows.map(item => item.mkString(",")).mkString(",")
    studentDf.where(s"stu_stuId in ($stuIds)").show()
  }

  /**
   * Exercise 4: count the teachers whose family name is "Li" (李)
   *
   */
  def fun4(teacherDf: DataFrame): Unit = {
    teacherDf.where("t_teacherName like '李%%'")
      .groupBy("t_teacherId")
      .agg(count("t_teacherId").alias("counts")).select("*").show()
  }

  /**
   * Exercise 3: every student's id, name, number of courses selected, and total score across all courses (null when there are no scores)
   *
   */
  def fun3(studentDf: DataFrame, scoreDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.groupBy("s_stuId")
      .agg(count("s_score").alias("counts"), sum("s_score").alias("sum_score"))
      .join(studentDf, 's_stuId === 'stu_stuId, "right")
      .select("stu_stuId", "stu_stuName", "counts", "sum_score")
      .orderBy("stu_stuId")
      .show(1000, false)
  }

  /**
   * Exercise 2: student id, name, and average score of students whose average score is at least 60 (missing scores are excluded from the average)
   *
   */
  def fun2(studentDf: DataFrame, scoreDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.groupBy('s_stuId).agg(avg('s_score).alias("avgScore"))
      .filter('avgScore >= 60)
      .join(studentDf, 's_stuId === 'stu_stuId, "inner")
      .select("s_stuId", "stu_stuName", "avgScore")
      .orderBy(desc("avgScore"))
      .show()
  }

  /**
   * Exercise 1: students whose score in course "01" is higher than in course "02" (students missing either score are not compared)
   *
   * @param studentDf student DataFrame
   * @param scoreDf   score DataFrame
   * @param spark
   */
  def fun(studentDf: DataFrame, scoreDf: DataFrame, spark: SparkSession): Unit = {
    import spark.implicits._
    scoreDf.filter('s_courseId === 1)
      .join(scoreDf.filter('s_courseId === 2).select('s_stuId.alias("_s_stuId"),
        's_score.alias("_s_score")), 's_stuId === '_s_stuId, "inner"
      ).filter('s_score > '_s_score)
      .join(studentDf, 's_stuId === 'stu_stuId, "inner").select('stu_stuId, 'stu_stuName)
      .show(1000, false)
  }

}

10.1 SQL

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for tb_Course
-- ----------------------------
DROP TABLE IF EXISTS `tb_Course`;
CREATE TABLE `tb_Course` (
  `c_courseId` int(11) DEFAULT NULL,
  `c_course` varchar(255) DEFAULT NULL,
  `c_teacherId` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of tb_Course
-- ----------------------------
BEGIN;
INSERT INTO `tb_Course` (`c_courseId`, `c_course`, `c_teacherId`) VALUES (1, '语文', 2);
INSERT INTO `tb_Course` (`c_courseId`, `c_course`, `c_teacherId`) VALUES (2, '数学', 1);
INSERT INTO `tb_Course` (`c_courseId`, `c_course`, `c_teacherId`) VALUES (3, '英语', 3);
COMMIT;

SET FOREIGN_KEY_CHECKS = 1;

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for tb_Score
-- ----------------------------
DROP TABLE IF EXISTS `tb_Score`;
CREATE TABLE `tb_Score` (
  `s_stuId` int(11) DEFAULT NULL,
  `s_courseId` int(11) DEFAULT NULL,
  `s_score` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of tb_Score
-- ----------------------------
BEGIN;
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (1, 1, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (1, 2, 90);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (1, 3, 99);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (2, 1, 70);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (2, 2, 60);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (2, 3, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (3, 1, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (3, 2, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (3, 3, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (4, 1, 50);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (4, 2, 30);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (4, 3, 20);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (5, 1, 76);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (5, 2, 87);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (6, 1, 31);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (6, 3, 34);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (7, 2, 89);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (7, 3, 98);
COMMIT;

SET FOREIGN_KEY_CHECKS = 1;

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for tb_Student
-- ----------------------------
DROP TABLE IF EXISTS `tb_Student`;
CREATE TABLE `tb_Student` (
  `stu_stuId` int(11) DEFAULT NULL,
  `stu_stuName` varchar(255) DEFAULT NULL,
  `stu_birthday` varchar(255) DEFAULT NULL,
  `stu_gender` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of tb_Student
-- ----------------------------
BEGIN;
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (1, '赵雷', '1990/1/1', '男');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (2, '钱电', '1990/12/21', '男');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (3, '孙风', '1990/5/20', '男');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (4, '李云', '1990/8/6', '男');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (5, '周梅', '1991/12/1', '女');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (6, '吴兰', '1992/3/1', '女');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (7, '郑竹', '1989/7/1', '女');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (8, '王菊', '1990/1/20', '女');
COMMIT;

SET FOREIGN_KEY_CHECKS = 1;

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for tb_Teacher
-- ----------------------------
DROP TABLE IF EXISTS `tb_Teacher`;
CREATE TABLE `tb_Teacher` (
  `t_teacherId` int(11) DEFAULT NULL,
  `t_teacherName` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;

-- ----------------------------
-- Records of tb_Teacher
-- ----------------------------
BEGIN;
INSERT INTO `tb_Teacher` (`t_teacherId`, `t_teacherName`) VALUES (1, 'zhangsan');
INSERT INTO `tb_Teacher` (`t_teacherId`, `t_teacherName`) VALUES (2, 'lisi');
INSERT INTO `tb_Teacher` (`t_teacherId`, `t_teacherName`) VALUES (3, 'wangwu');
COMMIT;

SET FOREIGN_KEY_CHECKS = 1;

11 Learning UDFs

package com.spark学习

import com.baidu.utils.LoggerTrait
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.DataTypes
import org.apache.spark.sql.{DataFrame, SparkSession}

object UDF的学习 extends LoggerTrait {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkUtils.getSparkSession("UDF的学习")
    // Load data
    val df: DataFrame =
      spark.read.text("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
    df.show(1000, false)
    fun_2(spark, df)

  }

  def fun_1(spark: SparkSession, df: DataFrame): Unit = {
    registerCommonUdfToSpark(spark)
    // Register a UDF from a plain Scala function
    spark.udf.register("small2big", (value: String) => {
      value.toUpperCase()
    })
    df.createOrReplaceTempView("tb_person")
    spark.sql(
      s"""
         |select small2big(value) from tb_person
         |""".stripMargin).show(1000, false)

    import spark.implicits._
    val small3big = udf((value: String) => {
      value.toUpperCase()
    })
    df.select('value, small3big('value)).show(1000, false)
  }

  def fun_2(spark: SparkSession, df: DataFrame): Unit = {
    registerCommonUdfToSpark(spark)
    df.createOrReplaceTempView("tb_person")
    spark.sql(
      s"""
         |select small2big(value) from tb_person
         |""".stripMargin).show(1000, false)
  }

  private def registerCommonUdfToSpark(spark: SparkSession): Unit = {
    /**
     * UDF1[String, String]: the first type parameter is the input type (the field or value passed in from the SQL),
     * and the second is the type returned by the UDF, i.e. the type of its return value.
     */
    spark.udf.register("small2big", new UDF1[String, String] {
      override def call(t1: String): String = {
        t1.toUpperCase()
      }
    }, DataTypes.StringType)
  }
}

12 sparkOnHive

https://blog.csdn.net/qq_43659234/article/details/117298401
https://blog.csdn.net/weixin_44911081/article/details/121858327

SparkOnHive: only Hive's metadata is used (databases, tables, fields, storage locations, etc.); everything else comes from SparkSQL: the execution engine, SQL parsing, physical planning, and SQL optimization.
First start the metastore: hive --service metastore

package com.baidu.sparkcodetest.hive

import com.baidu.utils.LoggerTrait
import org.apache.spark.sql.SparkSession

object Demo_hive extends LoggerTrait{
  def main(args: Array[String]): Unit = {
    // Create the environment (requires: hive --service metastore)
    val spark = SparkSession.builder().appName("Demo_flatMap_map").master("local[*]")
      .config("spark.sql.shuffle.partitions", "4")
      .config("spark.sql.warehouse.dir", "hdfs://localhost:8020/user/hive/warehouse") // specify the HDFS warehouse location
      .config("hive.metastore.uris", "thrift://localhost:9083")
      .enableHiveSupport()
      .getOrCreate()

    import spark.implicits._
    spark.sql("show databases").show(false)
    spark.sql("show tables").show(false)
    spark.sql("create table person(id int, name string, age int) row format delimited fields terminated by ' '")
    spark.sql("load data local inpath 'file:///Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/resources/person.txt'" +
      " into table person")
    spark.sql("select * from person").show(false)
  }
}