1 SparkSQL Data Abstractions
DataFrame and DataSet are SparkSQL's data abstractions; under the hood, both are built on RDDs.
DataFrame = RDD - generics + Schema (field names and types) + SQL operations + optimization
A DataFrame is a further encapsulation on top of RDD that supports SQL operations.
A DataFrame is, in effect, a distributed table.
DataSet = DataFrame + generics
DataSet = RDD + Schema constraint (field names and types) + SQL operations + optimization
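A minimal sketch of the three abstractions side by side (assuming a local SparkSession; the case class here is a hypothetical stand-in for the Person class used later in these notes):
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

case class Person(id: Int, name: String, age: Int)

object Abstractions {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("abstractions").master("local[*]").getOrCreate()
    import spark.implicits._
    val rdd = spark.sparkContext.parallelize(Seq(Person(1, "zhangsan", 20))) // RDD[Person]: generics, no schema
    val df: DataFrame = rdd.toDF() // DataFrame = Dataset[Row]: has a schema, but fields are untyped at compile time
    val ds: Dataset[Person] = rdd.toDS() // Dataset[Person]: schema + generics (compile-time typed)
    df.printSchema() // id, name, age with inferred types
    ds.map(_.age + 1).show() // typed field access; on df this would need row.getAs[Int]("age")
  }
}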
2 RDD Shared Variables
package com.spark学习.RDD
import com.baidu.utils.LoggerTrait
import com.spark学习.SparkUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.util.LongAccumulator
import java.lang
/**
* By default, when Spark runs a function in parallel as multiple tasks on different cluster nodes, it ships a separate copy of every variable the function uses to each task.
* Sometimes, however, a variable needs to be shared across tasks, or between the tasks and the driver program.
* To support this, Spark provides two kinds of shared variables:
*
* 1. Broadcast variables
* A broadcast variable shares a read-only value across nodes by caching one copy per machine, rather than shipping a copy with every task on that machine;
* 2. Accumulators
* Accumulators support aggregation (such as counting or summing) across all nodes;
*
* Without a broadcast variable, the value is shipped to every task.
* With a broadcast variable, the value is shipped once to every node.
*
* @author zhaoshuai11
* @date 2022 /10/03
*/
object RDD共享变量 extends LoggerTrait {
def main(args: Array[String]): Unit = {
// 1. Prepare the SparkContext execution environment
val spark = SparkUtils.getSparkSession("RDD的创建")
val sc = SparkUtils.getSparkContext(spark)
val lines: RDD[String] =
sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/resources/helloworld.txt")
val myCounter: LongAccumulator = sc.longAccumulator("myCounter")
// Define the list of special characters
val ruleList: List[String] = List(",", ".", "@", "!", "*", "%", "$", "#")
// Broadcast the list as a broadcast variable to every node
val broadcast: Broadcast[List[String]] = sc.broadcast(ruleList)
accumulators_1(lines, myCounter, broadcast)
}
/**
* Accumulators
* Accumulators support aggregation (such as counting or summing) across all nodes.
*/
def accumulators_1(lines: RDD[String], myCounter: LongAccumulator, broadcast: Broadcast[List[String]]): Unit = {
val result: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
.flatMap(_.split("\\s+"))
.filter(ch => {
// Read the broadcast value
val list: List[String] = broadcast.value
if (list.contains(ch)) {
myCounter.add(1)
false
} else {
true
}
}).map((_, 1))
.reduceByKey(_ + _)
result.foreach(println)
println("***************")
val value: lang.Long = myCounter.value
println(value)
}
def accumulators_2(lines: RDD[String], myCounter: LongAccumulator, ruleList: List[String]): Unit = {
val result: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
.flatMap(_.split("\\s+"))
.filter(ch => {
if (ruleList.contains(ch)) {
myCounter.add(1)
false
} else {
true
}
}).map((_, 1))
.reduceByKey(_ + _)
result.foreach(println)
println("***************")
val value: lang.Long = myCounter.value
println(value)
}
}
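One caveat worth noting (standard Spark semantics, not specific to the code above): accumulator updates made inside transformations such as filter may be applied more than once if a task is retried or a stage is recomputed; Spark only guarantees exactly-once accumulator updates inside actions. A minimal sketch of the action-side pattern, assuming the same sc and lines as above:
// Sketch: update the accumulator inside an action (foreach) for exactly-once semantics.
val blankLines = sc.longAccumulator("blankLines")
lines.foreach(line => if (line.trim.isEmpty) blankLines.add(1))
println(blankLines.value)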
3 Basic RDD Operator API
package com.spark学习.RDD
import com.baidu.utils.LoggerTrait
import com.spark学习.SparkUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
object RDD的基本算子 extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark = SparkUtils.getSparkSession("RDD的基本算子")
val sc = SparkUtils.getSparkContext(spark)
reduceByKey(sc)
}
def flatMap(sc: SparkContext): Unit = {
val lines: RDD[String] =
sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
val result: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
.flatMap(_.split(" "))
.map((_, 1)) // operates on each individual record
.reduceByKey(_ + _)
result.foreach(println)
/**
* (don't,2)
* (Say,2)
* (goodbye,2)
* ... ... ...
*/
result.saveAsTextFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld_count")
}
def mapPartitions(sc: SparkContext): Unit = {
val lines =
sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
println(lines.getNumPartitions) // 2
val result: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
.flatMap(_.split(" "))
.mapPartitions(iter => { // operates on one whole partition at a time
// open a connection here -- runs once per partition
iter.map((_, 1)) // applied to each record within the partition
// close the connection here
}).reduceByKey(_ + _)
result.foreach(println)
}
def reduceAPI(sc: SparkContext): Unit = {
val rdd1: RDD[Int] = sc.parallelize(1 to 10)
reduce_add_API(rdd1)
}
def reduce_add_API(rdd1: RDD[Int]): Unit = {
// sum of the elements of rdd1
println(rdd1.sum()) // 55
println(rdd1.reduce(_ + _))
println(rdd1.fold(0)(_ + _))
println(rdd1.aggregate(0)(_ + _, _ + _))
}
def groupByKey(sc: SparkContext): Unit = {
val lines: RDD[String] =
sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
val wordAndOne: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
.flatMap(_.split(" "))
.map((_, 1))
val grouped: RDD[(String, Iterable[Int])] = wordAndOne.groupByKey()
val result: RDD[(String, Int)] = grouped.mapValues(_.sum)
result.foreach(println)
}
def reduceByKey(sc: SparkContext): Unit = {
val lines: RDD[String] =
sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
val wordAndOne: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
.flatMap(_.split(" "))
.map((_, 1))
val result2: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)
result2.foreach(println)
}
}
4 RDD Persistence and Caching
What problem does caching solve? It addresses the efficiency of frequently accessing hot data. In Spark development, computing or transforming certain RDDs can be expensive; if those RDDs will be used often later on, they can be persisted/cached so that subsequent uses skip the recomputation, which improves the efficiency of the program.
package com.spark学习.RDD
import com.baidu.utils.LoggerTrait
import com.spark学习.SparkUtils
import org.apache.commons.lang3.StringUtils
import org.apache.spark.rdd.RDD
object RDD_cache extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark = SparkUtils.getSparkSession("RDD_cache")
val sc = SparkUtils.getSparkContext(spark)
val lines: RDD[String] =
sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/resources/helloworld.txt")
val result: RDD[(String, Int)] = lines.filter(StringUtils.isNotBlank(_))
.flatMap(_.split(" "))
.map((_, 1))
.reduceByKey(_ + _)
// result will be reused frequently and is relatively expensive to compute, so cache it to speed up later access
result.cache()
// result.persist()
// result.persist(StorageLevel.MEMORY_AND_DISK)
/**
* cache is implemented on top of persist:
* def cache(): this.type = persist()
* Persist this RDD with the default storage level (MEMORY_ONLY). persist defaults to memory only.
* rdd.persist(StorageLevel.MEMORY_AND_DISK) // in practice use memory + disk (disk is only used when memory runs out!)
*
*/
result.unpersist() // release the cache once the RDD is no longer needed
}
}
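A short sketch of the persist variant mentioned in the comment above (assuming the same result RDD; StorageLevel lives in org.apache.spark.storage):
import org.apache.spark.storage.StorageLevel

result.persist(StorageLevel.MEMORY_AND_DISK) // memory first, spilling to disk only when memory is insufficient
result.count() // caching is lazy; the first action materializes the cache
result.unpersist() // release it once the RDD is no longer needed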
5 Loading Data into a DataFrame
package com.spark学习.RDD.RDDDFDS
import com.baidu.utils.LoggerTrait
import org.apache.spark.sql.{DataFrame, SparkSession}
object 加载数据 extends LoggerTrait {
def main(args: Array[String]): Unit = {
// Create the environment
val spark = SparkSession.builder().appName("Demo").master("local[*]").getOrCreate()
// Load the data
val dfText: DataFrame =
spark.read.text("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt")
val dfJson: DataFrame =
spark.read.json("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/people.json")
val dfCSV: DataFrame =
spark.read.csv("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/country.csv")
dfText.show(10000, false)
dfJson.show(10000, false)
dfCSV.show(10000, false)
}
}
package com.spark学习
import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.jdbc.utils.WriteUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import java.util.Properties
object Data_load_upload extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark = SparkUtils.getSparkSession("Data_load_upload")
save(spark)
}
def save(spark: SparkSession): Unit = {
appendToLocalDb(spark)
}
def appendToLocalDb(spark: SparkSession): Unit = {
import spark.implicits._
val sc = spark.sparkContext
val rdd: RDD[String] = sc.makeRDD(Seq("10002;{\"ID\":121,\"message\":{\"name\":\"Asher\",\"location\":[{\"county\":\"浦东\",\"city\":\"上海\"},{\"county\":\"西直门\",\"city\":\"北京\"}]}}; {\"fieldFill\":[{\"field\":\"productTitle\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":0,\"reference\":[],\"fieldType\":1},{\"field\":\"tags\",\"score\":1,\"problem\":\"卖点标签数量较少\",\"solution\":\"您可以通过卖点标签将产品特点优势用短语直观呈现,吸引用户关注,建议提交3个\",\"type\":0,\"reference\":[],\"fieldType\":1},{\"field\":\"desc\",\"score\":1,\"problem\":\"产品简介字数较少\",\"solution\":\"产品简介有利于帮助用户快速获取产品信息,建议字符数≥20字符,当前字符数18\",\"type\":0,\"reference\":[],\"fieldType\":1},{\"field\":\"content\",\"score\":1,\"problem\":\"产品详情无图片\",\"solution\":\"产品详情是对产品详细信息和特点的整体描述,图文并茂的产品详情页对用户转化有重要影响,建议字符数≥100字符,并至少提交1张详情图片,当前字符数283,图片数0\",\"type\":0,\"reference\":[],\"fieldType\":1}],\"fieldRepeat\":[{\"field\":\"desc\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":1,\"reference\":[],\"fieldType\":1},{\"field\":\"content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":1,\"reference\":[],\"fieldType\":1},{\"field\":\"image\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":1,\"reference\":[],\"fieldType\":2},{\"field\":\"productTitle\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":2,\"reference\":[],\"fieldType\":1},{\"field\":\"desc\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":2,\"reference\":[],\"fieldType\":1},{\"field\":\"content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":2,\"reference\":[],\"fieldType\":1},{\"field\":\"image\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":2,\"reference\":[],\"fieldType\":2}],\"pictureVision\":[{\"field\":\"\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":3,\"reference\":[],\"fieldType\":2},{\"field\":\"\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":4,\"reference\":[],\"fieldType\":2},{\"field\":\"content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":5,\"reference\":[],\"fieldType\":2}],\"textQuality\":[{\"field\":\"productTitle,desc,content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":6,\"reference\":[],\"fieldType\":1},{\"field\":\"productTitle,desc,content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":7,\"reference\":[],\"fieldType\":1},{\"field\":\"content\",\"score\":2,\"problem\":\"\",\"solution\":\"\",\"type\":7,\"reference\":[],\"fieldType\":1}]}"))
val value: RDD[(String, String, String)] = rdd.map(item => {
val strings: Array[String] = item.split(";")
(strings(0), strings(1), strings(2))
})
val df: DataFrame = value.toDF("product_id", "image", "quality_detail")
df.show(1000, false)
WriteUtils.appendToLocalDb(spark, df, "tb_product")
}
def load(spark: SparkSession): Unit = {
jdbc(spark)
}
def json(spark: SparkSession): Unit = {
val path = "file:Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/people.json"
val df = spark.read.json(path) // 常用写法
df.show(10000, false)
}
/**
* Loading plain text parses into a single column (named value)
*
* @param spark
*/
def txt(spark: SparkSession): Unit = {
val path = "file:Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/helloworld.txt"
val df = spark.read.text(path)
df.show(10000, false)
}
def csv(spark: SparkSession): Unit = {
val path = "file:Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/country.csv"
val df = spark.read.csv(path).toDF("id", "name", "simple_name")
df.show(10000, false)
}
def jdbc(spark: SparkSession): Unit = {
val url = "jdbc:mysql://localhost:3306/wzg"
val table = "tb_product"
val props = new Properties()
props.put("user", "root");
props.put("password", "root");
val df = spark.read.jdbc(url, table, props)
df.show(10000, false)
}
}
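The csv helper above relies on positional columns (_c0, _c1, ...) and renames them with toDF. A hedged alternative sketch using standard DataFrameReader options (header, sep, inferSchema); the path is the same country.csv used above:
val dfWithOptions = spark.read
  .option("header", "false") // set to "true" if the first line holds column names
  .option("sep", ",") // field separator
  .option("inferSchema", "true") // sample the data to infer column types (costs an extra pass)
  .csv("file:///Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/country.csv")
  .toDF("id", "name", "simple_name")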
6 Creating a DataFrame from an RDD
package com.spark学习.RDD.RDDDFDS
import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.Person
import com.spark学习.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession};
object RDD的生成 extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkUtils.getSparkSession("RDD的生成")
val sc: SparkContext = SparkUtils.getSparkContext(spark)
val rdd: RDD[Person] = generateRDD_CaseClass(sc)
import spark.implicits._
rdd.toDF()
val rowRDD: RDD[Row] = generateRDD_defineScheme(sc)
val scheme: StructType = StructType(
List(StructField("id", IntegerType, false),
StructField("name", StringType, false),
StructField("age", IntegerType, false)))
val df: DataFrame = spark.createDataFrame(rowRDD, scheme)
val classRDD: RDD[(Int, String, Int)] = generateRDD_class(sc)
classRDD.toDF("id", "name", "age") // name the tuple columns; a bare toDF() would yield _1/_2/_3
}
/**
* Tuple RDD with explicit element types
*
* @param sc sc
* @return RDD[(Int, String, Int)]
*/
def generateRDD_class(sc: SparkContext): RDD[(Int, String, Int)] = {
val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
val classRDD: RDD[(Int, String, Int)] = lines.map(line => {
val arr: Array[String] = line.split(" ")
(arr(0).toInt, arr(1), arr(2).toInt)
})
classRDD
}
/**
* Custom schema
*
* @param sc sc
* @return RDD[Row]
*/
def generateRDD_defineScheme(sc: SparkContext): RDD[Row] = {
val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
val rowRDD: RDD[Row] = lines.map(line => {
val arr = line.split(" ")
Row(arr(0).toInt, arr(1), arr(2).toInt)
})
rowRDD
}
/**
* Case class
*
* @param sc sc
* @return RDD[Person]
*/
def generateRDD_CaseClass(sc: SparkContext): RDD[Person] = {
val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
val personRDD: RDD[Person] = lines.map(line => {
val personArray: Array[String] = line.split(" ")
Person(personArray(0).toInt, personArray(1), personArray(2).toInt)
})
personRDD
}
}
7 Converting between RDD, DataFrame, and Dataset
package com.spark学习.RDD.RDDDFDS
import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.Person
import com.spark学习.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
object RDD_DF_DS extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkUtils.getSparkSession("RDD_DF_DS")
val sc: SparkContext = SparkUtils.getSparkContext(spark)
val personRDD: RDD[Person] = generateRDD(sc)
val ds: Dataset[Person] = RDD2DS(personRDD, spark)
val df: DataFrame = RDD2DF(personRDD, spark)
DF2DS(df, spark)
DS2DS(ds)
}
def DS2DS(ds: Dataset[Person]): DataFrame = {
val df: DataFrame = ds.toDF()
df
}
def DF2DS(df: DataFrame, spark: SparkSession): Dataset[Person] = {
import spark.implicits._
val ds: Dataset[Person] = df.as[Person]
ds
}
def DS2RDD(ds: Dataset[Person]): Unit = {
val rdd: RDD[Person] = ds.rdd
rdd.foreach(println)
/**
* Person(1,zhangsan,20)
* Person(4,zhaoliu,30)
* Person(5,tianqi,35)
* Person(5,kobe,40)
* Person(2,lisi,29)
* Person(3,wangwu,25)
*/
}
def DF2RDD(personDf: DataFrame): Unit = {
val rdd: RDD[Row] = personDf.rdd
rdd.foreach(println)
/**
* [1,zhangsan,20]
* [4,zhaoliu,30]
* [2,lisi,29]
* [5,tianqi,35]
* [3,wangwu,25]
* [5,kobe,40]
*/
}
def RDD2DS(personRDD: RDD[Person], spark: SparkSession): Dataset[Person] = {
import spark.implicits._
val ds: Dataset[Person] = personRDD.toDS()
ds
}
def RDD2DF(personRDD: RDD[Person], spark: SparkSession): DataFrame = {
import spark.implicits._
val df: DataFrame = personRDD.toDF()
df
}
def generateRDD(sc: SparkContext): RDD[Person] = {
val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
val personRDD: RDD[Person] = lines.map(line => {
val personArray: Array[String] = line.split(" ")
Person(personArray(0).toInt, personArray(1), personArray(2).toInt)
})
personRDD
}
}
8 wordCount
package com.baidu.sparkcodetest.sql
import com.baidu.utils.LoggerTrait
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Row, SparkSession}
object Demo_spark_wordcount extends LoggerTrait {
def main(args: Array[String]): Unit = {
// Create the environment
val spark = SparkSession.builder().appName("Demo_spark_wordcount").master("local[*]").getOrCreate()
// Load the data
val ds: Dataset[String] =
spark.read.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/resources/helloworld.txt")
ds.show()
/**
* +------------------+
* | value|
* +------------------+
* |hello me you her !|
* | hello me you|
* | hello me|
* | hello|
* | #|
* | #|
* | #|
* | |
* +------------------+
*/
import spark.implicits._
val words: Dataset[String] = ds.flatMap(_.split(" "))
words.createOrReplaceTempView("tb_words")
spark.sql(
"""
|select value, count(*) as counts from tb_words group by value order by counts desc
|""".stripMargin).show()
words.groupBy('value)
.count()
.orderBy('count.desc).show()
}
}
9 JDBC
9.1 JDBC Utility Class
package com.spark学习.SQL.jdbc.utils
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import java.sql.DriverManager
import java.util.Properties
object WriteUtils {
val wzgUser = "root"
val wzgPassword = "root"
val wzgUrl = "jdbc:mysql://localhost:3306/wzg"
/**
* Get a JDBC connection to the local database
*
* @return
*/
def getLocalJDBC(spark: SparkSession, table: String) = {
val props = new Properties()
props.put("user", wzgUser);
props.put("password", wzgPassword);
val df = spark.read.jdbc(wzgUrl, table, props)
df
}
/**
* Append output to the local db
*/
def appendToLocalDb(spark: SparkSession, df: DataFrame, tableName: String): Unit = {
println("start output .")
df.repartition(1)
.write.mode(SaveMode.Append)
.format("jdbc")
.option(JDBCOptions.JDBC_DRIVER_CLASS, "com.mysql.jdbc.Driver")
.option(JDBCOptions.JDBC_URL, wzgUrl)
.option(JDBCOptions.JDBC_TXN_ISOLATION_LEVEL, "NONE")
.option(JDBCOptions.JDBC_TABLE_NAME, tableName)
.option("user", wzgUser)
.option("password", wzgPassword)
.save()
}
/**
* Generic overwrite-mode write to an HDFS CSV file, with \t as the separator
*/
def writeCommonCsvToHdfs(df: DataFrame, numPartitions: Int, header: Boolean, hdfsPath: String): Unit = {
writeCommonCsvToHdfs(df, numPartitions, header, hdfsPath, "\t", "csv")
}
/**
* Generic overwrite-mode write to an HDFS CSV file, with separator as the delimiter
*/
def writeCommonCsvToHdfs(df: DataFrame, numPartitions: Int, header: Boolean, hdfsPath: String, separator: String): Unit = {
writeCommonCsvToHdfs(df, numPartitions, header, hdfsPath, separator, "csv")
}
/**
* Generic overwrite-mode write to an HDFS CSV file, with separator as the delimiter and format as the output format
*/
def writeCommonCsvToHdfs(df: DataFrame, numPartitions: Int, header: Boolean, hdfsPath: String, separator: String, format: String): Unit = {
df.repartition(numPartitions)
.write.mode("Overwrite")
.format(format).option("delimiter", separator).option("sep", separator)
.option("header", if (header) "true" else "false")
.save(hdfsPath)
}
/**
* Execute a DELETE statement
*
* @param spark spark
* @param sql sql
*/
def deleteFromLocalDb(spark: SparkSession, sql: String): Unit = {
println("start delete . " + sql)
val conn = DriverManager.getConnection(wzgUrl, wzgUser, wzgPassword)
val statement = conn.createStatement()
statement.execute(sql)
statement.close()
conn.close()
}
/**
* Execute an UPDATE statement
*
* @param spark spark
* @param sql sql
*/
def updateFromFcDiDb(spark: SparkSession, sql: String): Unit = {
println("start update . " + sql)
val conn = DriverManager.getConnection(wzgUrl, wzgUser, wzgPassword)
val statement = conn.createStatement()
statement.execute(sql)
statement.close()
conn.close()
}
}
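The delete/update helpers above build SQL by string concatenation. A hedged sketch of a parameterized variant using PreparedStatement (plain JDBC; deleteByIdFromLocalDb is a hypothetical extra helper for WriteUtils, reusing the credentials defined above):
import java.sql.DriverManager

// Sketch: bind parameters instead of concatenating them into the SQL text (avoids SQL injection).
def deleteByIdFromLocalDb(id: Int): Unit = {
  val conn = DriverManager.getConnection(wzgUrl, wzgUser, wzgPassword)
  try {
    val stmt = conn.prepareStatement("delete from tb_person where id = ?")
    stmt.setInt(1, id) // bind the id value
    stmt.executeUpdate()
    stmt.close()
  } finally {
    conn.close()
  }
}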
9.2 JdbcOption Operations
package com.spark学习.SQL.jdbc
import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.Person
import com.spark学习.SQL.jdbc.utils.WriteUtils
import com.spark学习.SparkUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
object JdbcOption extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkUtils.getSparkSession("JdbcOption")
}
/**
* delete
*
* @param spark
*/
def deleteFromDb(spark: SparkSession): Unit = {
val sql = "delete from tb_person where id=5"
WriteUtils.deleteFromLocalDb(spark, sql)
}
/**
* insert
*
* @param spark
*/
def addDfToDb(spark: SparkSession): Unit = {
val df: DataFrame = generateDF(spark)
WriteUtils.appendToLocalDb(spark, df, "tb_person")
}
/**
* select
*
* @param spark
*/
def getFromDB(spark: SparkSession): DataFrame = {
val table = "tb_person"
val df: DataFrame = WriteUtils.getLocalJDBC(spark, table)
df
}
def generateDF(spark: SparkSession): DataFrame = {
val sc = spark.sparkContext
val lines: RDD[String] = sc.textFile("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
val personRDD: RDD[Person] = lines.map(line => {
val personArray: Array[String] = line.split(" ")
Person(personArray(0).toInt, personArray(1).toString, personArray(2).toInt)
})
import spark.implicits._
val df: DataFrame = personRDD.toDF()
df
}
}
9.3 crud: Actions on a DataFrame
package com.spark学习.SQL.jdbc
import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.jdbc.JdbcOption.generateDF
import com.spark学习.SparkUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import java.util
object crud_DataFrame对象上Action操作 extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkUtils.getSparkSession("crud")
val df: DataFrame = generateDF(spark)
first_head_take_takeAsList(df)
}
/**
* Fetch a number of rows
* (1) first: get the first row
* (2) head: get the first row; head(n: Int) gets the first n rows
* (3) take(n: Int): get the first n rows
* (4) takeAsList(n: Int): get the first n rows as a List
* take and takeAsList return the fetched data to the driver, so mind the data volume when using them to avoid an OutOfMemoryError on the driver
*
* @param df df
*/
def first_head_take_takeAsList(df: DataFrame): Unit = {
val firstData: String = df.first().mkString(",")
// 1,zhangsan,20
val headRows: Array[Row] = df.head(2)
val headArrayString: String = headRows.mkString(",")
// [1,zhangsan,20],[2,lisi,29]
val headString: String = headRows.map(row => row.mkString(",")).mkString(",")
// 1,zhangsan,20,2,lisi,29
}
/**
* Get summary statistics for the specified fields
*
* This method takes one or more String field names and returns a DataFrame of statistics over the numeric fields, such as count, mean, stddev, min, and max.
*
* @param df df
*/
def describe(df: DataFrame): Unit = {
val summaryDf: DataFrame = df.describe("age")
summaryDf.show(1, false)
/**
* +-------+---+
* |summary|age|
* +-------+---+
* |count |6 |
* +-------+---+
*/
summaryDf.select("age").where("summary = 'count'").show()
/**
* +---+
* |age|
* +---+
* | 6|
* +---+
*/
}
/**
* Like collect, except the result is returned as a java.util.List instead of an Array
*
* @param df df
*/
def collectAsList(df: DataFrame): Unit = {
val rows: util.List[Row] = df.collectAsList()
rows.forEach(row => {
println(row)
// [1,zhangsan,20]
val headers: String = row.schema.mkString(",")
// StructField(id,IntegerType,false),StructField(name,StringType,true),StructField(age,IntegerType,false)
val datas: String = row.mkString(",")
// 1,zhangsan,20
})
}
/**
* collect fetches all of the DataFrame's data and returns it as an Array
*
* @param df df
*/
def collect(df: DataFrame): Unit = {
val rows: Array[Row] = df.collect()
for (elem <- rows) {
val schema: StructType = elem.schema
// StructType(StructField(id,IntegerType,false), StructField(name,StringType,true), StructField(age,IntegerType,false))
val names: Array[String] = schema.names
val headers: String = names.mkString(",")
// id,name,age
val datas: String = elem.mkString(",")
// 1,zhangsan,20
}
}
/**
* Display the DataFrame's data as a table, similar to select * from spark_sql_test
*
* @param df df
*/
def show(df: DataFrame): Unit = {
/**
* show(n, truncate): n is the number of rows to display; truncate controls whether strings longer than 20 characters are cut off (default true)
*/
df.show(2, false)
}
}
9.4 crud: Conditional Queries and Joins on a DataFrame
package com.spark学习.SQL.jdbc
import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.jdbc.JdbcOption.getFromDB
import com.spark学习.SparkUtils
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
object crud_DataFrame对象上的条件查询和join extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkUtils.getSparkSession("crud")
val df: DataFrame = getFromDB(spark)
explode_fun(df)
}
/**
* Explode: split one row into multiple rows, i.e. turn a column into rows
*
* @param df df
*/
def explode_fun(df: DataFrame): Unit = {
df.explode("name", "name_") {
item: String => item.split(" ")
}.select("name_").show(1000, false)
}
/**
* withColumn(colName: String, col: Column) adds a column named colName to the DataFrame; if colName already exists, the existing column is overwritten.
*
* @param df df
*/
def withColumn(df: DataFrame): Unit = {
df.withColumn("new_id", df.col("id")).show()
}
/**
* withColumnRenamed: rename the specified field of the DataFrame
* If the field does not exist, nothing happens. The example below renames the id field to person_id.
*
* @param df df
*/
def withColumnRenamed(df: DataFrame): Unit = {
df.withColumnRenamed("id", "person_id")
.withColumnRenamed("name", "person_name")
.withColumnRenamed("age", "person_age").show(1000, false)
}
/**
* Get the records present in one DataFrame but absent from another
*
* @param df df
*/
def except(df: DataFrame): Unit = {
df.except(df.limit(1)).show(1000, false)
}
/**
* intersect computes the records that two DataFrames have in common
*
* @param df df
*/
def intersect(df: DataFrame): Unit = {
df.intersect(df.limit(1)).show(1000, false)
}
/**
* stat computes statistics on or between the specified fields, such as variance and covariance, and returns a DataFrameStatFunctions object.
* The code below finds, for the c1 field, the values whose frequency is above 30%.
* In the original example c1 contains "a, b, a, c, d, b"; a and b each occur with frequency 2/6, which exceeds 0.3
*
* @param df df
*/
def stat(df: DataFrame): Unit = {
val freqDf: DataFrame = df.stat.freqItems(Seq("name"), 0.3)
val headers: String = freqDf.schema.names.mkString(",") // name_freqItems
val row: Row = freqDf.head()
// [WrappedArray(zhangsan, zhaoliu)]
}
/**
* join is heavily used in SQL, and DataFrame provides the same capability.
* DataFrame offers six overloaded join methods.
*
*/
def join(df: DataFrame): Unit = {
/**
* Cartesian product
*/
df.join(df).show(1000, false)
/**
* using a single field
*/
df.join(df, "name").show(1000, false)
/**
* using two fields
*/
df.join(df, Seq("name", "id")).show(1000, false)
/**
* A join between two DataFrames can be inner, outer, left_outer, right_outer, or leftsemi.
* With the using-multiple-fields form above, a third String parameter can specify the join type
*/
df.join(df, Seq("name"), "inner").show(1000, false)
/**
* Join with Column expressions
* Use this instead of the using form when the join fields need to be specified flexibly
*/
df.join(df, df.col("id") === df.col("id"), "inner").show(1000, false)
/**
* +---+--------+---+---+--------+---+
* |id |name |age|id |name |age|
* +---+--------+---+---+--------+---+
* |1 |zhangsan|20 |1 |zhangsan|20 |
* |3 |wangwu |25 |3 |wangwu |25 |
* |5 |zhangsan|35 |5 |zhangsan|35 |
* |4 |zhaoliu |30 |4 |zhaoliu |30 |
* |2 |lisi |29 |2 |lisi |29 |
* +---+--------+---+---+--------+---+
*
*/
}
/**
* union: combine two DataFrames
* Similar to UNION ALL in SQL.
*
* @param df df
*/
def union(df: DataFrame): Unit = {
df.union(df.limit(1)).show()
}
/**
* Aggregation uses the agg method, which has several call forms and is usually combined with groupBy.
* The example below shows the simplest, most direct form: max of the id field and sum of the age field
*
* @param df df
*/
def agg(df: DataFrame): Unit = {
df.agg("id" -> "max", "age" -> "sum").show(false)
}
/**
* Deduplicate by the specified fields, similar to select distinct a, b
*
*/
def dropDuplicates(df: DataFrame): Unit = {
df.dropDuplicates(Seq("name")).show()
}
/**
* Return the distinct Rows of the DataFrame. Equivalent to dropDuplicates() with no field arguments.
*
* @param df df
*/
def distinct(df: DataFrame): Unit = {
df.distinct().show()
}
/**
* groupBy: group by the given fields
* groupBy can take either String field names or Column objects
* max(colNames: String*): maximum of the given (or all) numeric fields within each group; numeric fields only
* min(colNames: String*): minimum of the given (or all) numeric fields within each group; numeric fields only
* mean(colNames: String*): mean of the given (or all) numeric fields within each group; numeric fields only
* sum(colNames: String*): sum of the given (or all) numeric fields within each group; numeric fields only
* count(): number of elements in each group
*
* @param df df
*/
def group_by(df: DataFrame): Unit = {
val group_by_df: DataFrame = df.groupBy(df.col("name")).count()
group_by_df.select(group_by_df.col("name"), group_by_df.col("count").as("name_counts")).show()
}
/**
* orderBy and sort: sort by the given fields, ascending by default
* Example 1: sort by the given field; prefix with - (or call .desc) for descending order. sort and orderBy are used the same way
*
* @param df df
*/
def order_by(df: DataFrame): Unit = {
/**
* Option 1
*/
df.orderBy(df.col("age").desc).show(false)
/**
* Option 2
*/
df.orderBy(-df.col("age")).show(false)
}
/**
* limit returns the first n rows of the DataFrame as a new DataFrame. Unlike take and head, limit is not an action.
*
* @param df df
*/
def limit(df: DataFrame): Unit = {
df.limit(3).show(false)
}
/**
* Returns a new DataFrame without the removed field; only one field can be dropped per call
* Does not modify the source data
*
* @param df df
*/
def drop(df: DataFrame): Unit = {
val d_df: DataFrame = df.drop("id")
d_df.show()
df.show()
}
/**
* Fetches a single field; the return type is Column
*
* @param df df
*/
def apply(df: DataFrame): Unit = {
df.apply("id").as("person_id")
df.select(df.apply("id").as("person_id"),
df.apply("name").as("person_name"),
df.apply("age").as("person_age")).show()
}
/**
* Fetch the specified field
* Only one field per call; the return type is Column
*
* @param df df
*/
def col(df: DataFrame): Unit = {
val person_id = df.col("id").as("person_id")
val person_age = df.col("age").+(1000).as("person_age")
val person_name = df.col("name").as("person_name")
df.select(person_id, person_name, person_age).show()
}
/**
* Select the given String field names and return their values as a DataFrame
*
* @param df df
*/
def select(df: DataFrame): Unit = {
df.select("id", "name").show()
}
/**
* Allows expressions over the specified fields
* UDFs can be applied or aliases assigned directly; takes String parameters and returns a DataFrame
*
*/
def selectExpr(df: DataFrame): Unit = {
df.selectExpr("id as person_id", "name as person_name", "age as person_age").show()
}
/**
* The equivalent of the condition after SQL's WHERE keyword
* Takes a filter expression (and/or allowed) and returns a DataFrame
*
*/
def where(df: DataFrame): Unit = {
df.where("id = 1 or name = 'lisi'").show()
/**
* Filter by fields
* Takes a filter expression and returns a DataFrame. Accepts the same conditions as where
*/
df.filter("id = 1 or name = 'lisi'").show()
}
}
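DataFrame.explode, used in explode_fun above, has been deprecated since Spark 2.0. A hedged sketch of the replacement using functions.explode with split (same name column as above, assuming a df in scope):
import org.apache.spark.sql.functions.{explode, split}

// Sketch: the non-deprecated column-to-rows pattern.
df.withColumn("name_", explode(split(df.col("name"), " ")))
  .select("name_")
  .show(1000, false)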
10 SparkSQL Exercises: Student Scores
package com.spark学习.SQL.jdbc.SparkSQL练习题_学生成绩
import com.baidu.utils.LoggerTrait
import com.spark学习.SQL.jdbc.utils.WriteUtils
import com.spark学习.SparkUtils
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
object demo extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkUtils.getSparkSession("demo")
val courseDf: DataFrame = WriteUtils.getLocalJDBC(spark, "tb_Course")
val studentDf: DataFrame = WriteUtils.getLocalJDBC(spark, "tb_Student")
val teacherDf: DataFrame = WriteUtils.getLocalJDBC(spark, "tb_Teacher")
val scoreDf: DataFrame = WriteUtils.getLocalJDBC(spark, "tb_Score")
fun21(courseDf, scoreDf, teacherDf, studentDf, spark)
// fun9(scoreDf, spark)
}
/**
* Exercise 20: top-3 records per course (ties share a rank; subsequent ranks are skipped)
*
*/
def fun21(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.select($"s_courseId", $"s_stuId", $"s_score",
rank().over(Window.partitionBy($"s_courseId")
.orderBy($"s_score".desc)).as("rank"))
.filter($"rank".leq(3))
.show()
/**
* +----------+-------+-------+----+
* |s_courseId|s_stuId|s_score|rank|
* +----------+-------+-------+----+
* | 1| 1| 80| 1|
* | 1| 3| 80| 1|
* | 1| 5| 76| 3|
* | 3| 1| 99| 1|
* | 3| 7| 98| 2|
* | 3| 2| 80| 3|
* | 3| 3| 80| 3|
* | 2| 1| 90| 1|
* | 2| 7| 89| 2|
* | 2| 5| 87| 3|
* +----------+-------+-------+----+
*/
}
/**
* Exercise 19: top-3 records per course (ties share a rank; no rank gaps)
*
*/
def fun20(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.select($"s_courseId", $"s_stuId", $"s_score",
dense_rank().over(Window.partitionBy($"s_courseId")
.orderBy($"s_score".desc)).as("rank"))
.filter($"rank".leq(3))
.show()
/**
* +----------+-------+-------+----+
* |s_courseId|s_stuId|s_score|rank|
* +----------+-------+-------+----+
* | 1| 1| 80| 1|
* | 1| 3| 80| 1|
* | 1| 5| 76| 2|
* | 1| 2| 70| 3|
* | 3| 1| 99| 1|
* | 3| 7| 98| 2|
* | 3| 2| 80| 3|
* | 3| 3| 80| 3|
* | 2| 1| 90| 1|
* | 2| 7| 89| 2|
* | 2| 5| 87| 3|
* +----------+-------+-------+----+
*
*/
}
/**
* Exercise 18: top-3 records per course (ignoring ties)
*
*/
def fun19(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.select($"s_courseId", $"s_stuId", $"s_score",
row_number().over(Window.partitionBy($"s_courseId")
.orderBy($"s_score".desc)).as("rank"))
.filter($"rank".leq(3))
.show()
/**
* +----------+-------+-------+----+
* |s_courseId|s_stuId|s_score|rank|
* +----------+-------+-------+----+
* | 1| 1| 80| 1|
* | 1| 3| 80| 2|
* | 1| 5| 76| 3|
* | 3| 1| 99| 1|
* | 3| 7| 98| 2|
* | 3| 2| 80| 3|
* | 2| 1| 90| 1|
* | 2| 7| 89| 2|
* | 2| 5| 87| 3|
* +----------+-------+-------+----+
*/
}
/**
* Exercise 17: total score per student, ranked; tied totals leave rank gaps
*
*/
def fun18(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.groupBy('s_stuId).agg(sum('s_score).as("totalScore"))
.select('s_stuId, 'totalScore,
rank().over(Window.orderBy('totalScore.desc)).alias("rank")).show()
}
/**
* Exercise 16: sort scores within each course and show the rank; equal scores tie, and tied ranks leave gaps
* rank().over(Window.partitionBy('s_courseId)
*
*/
def fun17(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.select('s_courseId, 's_stuId, 's_score,
rank().over(Window.partitionBy('s_courseId)
.orderBy('s_score.desc)).alias("rank")).show(1000, false)
}
/**
* Exercise 15: sort scores within each course and show the rank; equal scores tie, and ranks stay consecutive
* dense_rank().over(Window.partitionBy('s_courseId)
*
*/
def fun16(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.select('s_courseId, 's_stuId, 's_score,
dense_rank().over(Window.partitionBy('s_courseId)
.orderBy('s_score.desc)).alias("rank")).show(1000, false)
}
/**
* Exercise 14: output course id and enrollment count, sorted by count descending; ties sorted by course id ascending
*
*/
def fun15(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.groupBy("s_courseId").agg(count('s_courseId).alias("courseIdCounts"))
.orderBy('courseIdCounts.desc, 's_courseId).show()
}
/**
* Exercise 13: highest, lowest, and average score per course
*
*/
def fun14(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
scoreDf.groupBy("s_courseId").agg(max("s_score"), min("s_score"), mean("s_score")).show()
}
/**
* Exercise 12: every student's course scores together with their average score, sorted by average score descending
*
*/
def fun13(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.groupBy("s_stuId").agg(mean('s_score).alias("score_mean"))
.join(scoreDf, "s_stuId")
.orderBy('score_mean.desc)
.select("s_stuId", "s_courseId", "s_score", "score_mean")
.show()
}
/**
* Exercise 11: students scoring below 60 in course "01", sorted by score descending
*
*/
def fun12(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.where("s_courseId = 1 and s_score < 60").dropDuplicates("s_stuId")
.join(studentDf, 'stu_stuId === 's_stuId)
.orderBy('s_score.desc)
.select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender", "s_score").show(1000, false)
}
/**
* Exercise 10: student id, name, and average score of students failing two or more courses
*
*/
def fun11(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.filter('s_score < 60)
.groupBy("s_stuId")
.agg(count('s_score).alias("score_counts"), mean('s_score).alias("score_mean"))
.filter('score_counts >= 2)
.join(studentDf, 's_stuId === 'stu_stuId)
.select("stu_stuId", "stu_stuName", "score_mean").show(1000, false)
}
/**
* Exercise 9: names of students who never took any course taught by teacher "zhangsan"
*
*/
def fun10(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
val courseIdStr: String = teacherDf.where("t_teacherName = 'zhangsan'")
.join(courseDf, 'c_teacherId === 't_teacherId).select("c_courseId").collect()
.map(item => item.mkString(",")).mkString(",")
val stuIdStr: String = scoreDf.where(s"s_courseId in ($courseIdStr)")
.dropDuplicates("s_stuId").select("s_stuId").collect()
.map(item => item.mkString(",")).mkString(",")
scoreDf.where(s"s_stuId not in ($stuIdStr)").dropDuplicates("s_stuId")
.join(studentDf, 'stu_stuId === 's_stuId)
.select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender").show(1000, false)
}
/**
* Bonus: rows-to-columns pivot; missing combinations are left null
*
*/
def fun9(scoreDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.show()
scoreDf.groupBy("s_stuId").pivot($"s_courseId").sum("s_score").orderBy("s_stuId").show()
/**
* +-------+----------+-------+
* |s_stuId|s_courseId|s_score|
* +-------+----------+-------+
* | 1| 1| 80|
* | 1| 2| 90|
* | 1| 3| 99|
* | 2| 1| 70|
* | 2| 2| 60|
* | 2| 3| 80|
* ... ... ...
* +-------+----------+-------+
*
*/
/**
* +-------+----+----+----+
* |s_stuId| 1| 2| 3|
* +-------+----+----+----+
* | 1| 80| 90| 99|
* ... ... ...
* | 7|null| 89| 98|
* +-------+----+----+----+
*
*/
}
/**
* Exercise 8: students who take exactly the same set of courses as student "01"
*
*/
def fun8(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
val courseIdCountArray: Array[String] = scoreDf.filter('s_stuId === 1).select("s_courseId").collect().map(item => item.mkString(","))
scoreDf.filter(s"s_courseId in (${courseIdCountArray.mkString(",")}) and s_stuId != 1")
.groupBy("s_stuId").agg(count("s_courseId").alias("sc_courseIdCounts"))
.filter(s"sc_courseIdCounts = ${courseIdCountArray.length}")
.select("s_stuId").distinct()
.join(studentDf, 'stu_stuId === 's_stuId)
.select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender")
.orderBy('stu_stuId)
.show(1000, false)
}
/**
* Exercise 7: students who share at least one course with student "01"
*
*/
def fun7(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.where(s"s_courseId in (${scoreDf.filter('s_stuId === 1).select("s_courseId").collect().map(item => item.mkString(",")).mkString(",")}) and s_stuId != 1")
.select("s_stuId").distinct()
.join(studentDf, 'stu_stuId === 's_stuId)
.select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender")
.orderBy('stu_stuId)
.show(1000, false)
}
/**
* Exercise 6: students who have not taken every course
*
*/
def fun6(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.groupBy("s_stuId").agg(count("s_courseId").alias("s_courseIdCounts"))
.filter('s_courseIdCounts < courseDf.select("c_courseId").distinct().count())
.join(studentDf, 'stu_stuId === 's_stuId)
.select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender").orderBy('stu_stuId)
.show(1000, false)
}
/**
* Exercise 5: students who took a course taught by teacher "zhangsan"
*
*/
def fun5(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
fun5_2(courseDf, scoreDf, teacherDf, studentDf, spark)
}
def fun5_2(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
teacherDf.where("t_teacherName='zhangsan'")
.join(courseDf, 't_teacherId === 'c_teacherId)
.join(scoreDf, 's_courseId === 'c_courseId)
.join(studentDf, 's_stuId === 'stu_stuId)
.select("stu_stuId", "stu_stuName", "stu_birthday", "stu_gender").distinct()
.orderBy(studentDf.col("stu_stuId")).show(1000, false)
}
def fun5_1(courseDf: DataFrame, scoreDf: DataFrame, teacherDf: DataFrame, studentDf: DataFrame, spark: SparkSession): Unit = {
val teacher_id = teacherDf.select("t_teacherId").where("t_teacherName='zhangsan'").head().mkString(",")
val c_courseId = courseDf.where(s"c_teacherId=$teacher_id").select("c_courseId").head().mkString(",")
val rows: Array[Row] = scoreDf.where(s"s_courseId = $c_courseId").select("s_stuId").collect()
val stuIds: String = rows.map(item => item.mkString(",")).mkString(",")
studentDf.where(s"stu_stuId in ($stuIds)").show()
}
/**
* Exercise 4: count the teachers surnamed "李" (Li)
*
*/
def fun4(teacherDf: DataFrame): Unit = {
teacherDf.where("t_teacherName like '李%%'")
.groupBy("t_teacherId")
.agg(count("t_teacherId").alias("counts")).select("*").show()
}
/**
* Exercise 3: every student's id, name, number of courses taken, and total score across all courses (null when there are no scores)
*
*/
def fun3(studentDf: DataFrame, scoreDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.groupBy("s_stuId")
.agg(count("s_score").alias("counts"), sum("s_score").alias("sum_score"))
.join(studentDf, 's_stuId === 'stu_stuId, "right")
.select("stu_stuId", "stu_stuName", "counts", "sum_score")
.orderBy("stu_stuId")
.show(1000, false)
}
/**
* Exercise 2: id, name, and average score of students whose average is at least 60 (missing scores are excluded from the average)
*
*/
def fun2(studentDf: DataFrame, scoreDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.groupBy('s_stuId).agg(avg('s_score).alias("avgScore"))
.filter('avgScore >= 60)
.join(studentDf, 's_stuId === 'stu_stuId, "inner")
.select("s_stuId", "stu_stuName", "avgScore")
.orderBy(desc("avgScore"))
.show()
}
/**
* Exercise 1: students whose course "01" score is higher than their course "02" score (missing scores are not compared)
*
* @param studentDf student df
* @param scoreDf score df
* @param spark
*/
def fun(studentDf: DataFrame, scoreDf: DataFrame, spark: SparkSession): Unit = {
import spark.implicits._
scoreDf.filter('s_courseId === 1)
.join(scoreDf.filter('s_courseId === 2).select('s_stuId.alias("_s_stuId"),
's_score.alias("_s_score")), 's_stuId === '_s_stuId, "inner"
).filter('s_score > '_s_score)
.join(studentDf, 's_stuId === 'stu_stuId, "inner").select('stu_stuId, 'stu_stuName)
.show(1000, false)
}
}
10.1 SQL
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for tb_Course
-- ----------------------------
DROP TABLE IF EXISTS `tb_Course`;
CREATE TABLE `tb_Course` (
`c_courseId` int(11) DEFAULT NULL,
`c_course` varchar(255) DEFAULT NULL,
`c_teacherId` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Records of tb_Course
-- ----------------------------
BEGIN;
INSERT INTO `tb_Course` (`c_courseId`, `c_course`, `c_teacherId`) VALUES (1, '语文', 2);
INSERT INTO `tb_Course` (`c_courseId`, `c_course`, `c_teacherId`) VALUES (2, '数学', 1);
INSERT INTO `tb_Course` (`c_courseId`, `c_course`, `c_teacherId`) VALUES (3, '英语', 3);
COMMIT;
SET FOREIGN_KEY_CHECKS = 1;
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for tb_Score
-- ----------------------------
DROP TABLE IF EXISTS `tb_Score`;
CREATE TABLE `tb_Score` (
`s_stuId` int(11) DEFAULT NULL,
`s_courseId` int(11) DEFAULT NULL,
`s_score` int(11) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Records of tb_Score
-- ----------------------------
BEGIN;
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (1, 1, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (1, 2, 90);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (1, 3, 99);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (2, 1, 70);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (2, 2, 60);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (2, 3, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (3, 1, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (3, 2, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (3, 3, 80);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (4, 1, 50);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (4, 2, 30);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (4, 3, 20);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (5, 1, 76);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (5, 2, 87);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (6, 1, 31);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (6, 3, 34);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (7, 2, 89);
INSERT INTO `tb_Score` (`s_stuId`, `s_courseId`, `s_score`) VALUES (7, 3, 98);
COMMIT;
SET FOREIGN_KEY_CHECKS = 1;
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for tb_Student
-- ----------------------------
DROP TABLE IF EXISTS `tb_Student`;
CREATE TABLE `tb_Student` (
`stu_stuId` int(11) DEFAULT NULL,
`stu_stuName` varchar(255) DEFAULT NULL,
`stu_birthday` varchar(255) DEFAULT NULL,
`stu_gender` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Records of tb_Student
-- ----------------------------
BEGIN;
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (1, '赵雷', '1990/1/1', '男');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (2, '钱电', '1990/12/21', '男');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (3, '孙风', '1990/5/20', '男');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (4, '李云', '1990/8/6', '男');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (5, '周梅', '1991/12/1', '女');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (6, '吴兰', '1992/3/1', '女');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (7, '郑竹', '1989/7/1', '女');
INSERT INTO `tb_Student` (`stu_stuId`, `stu_stuName`, `stu_birthday`, `stu_gender`) VALUES (8, '王菊', '1990/1/20', '女');
COMMIT;
SET FOREIGN_KEY_CHECKS = 1;
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for tb_Teacher
-- ----------------------------
DROP TABLE IF EXISTS `tb_Teacher`;
CREATE TABLE `tb_Teacher` (
`t_teacherId` int(11) DEFAULT NULL,
`t_teacherName` varchar(255) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
-- ----------------------------
-- Records of tb_Teacher
-- ----------------------------
BEGIN;
INSERT INTO `tb_Teacher` (`t_teacherId`, `t_teacherName`) VALUES (1, 'zhangsan');
INSERT INTO `tb_Teacher` (`t_teacherId`, `t_teacherName`) VALUES (2, 'lisi');
INSERT INTO `tb_Teacher` (`t_teacherId`, `t_teacherName`) VALUES (3, 'wangwu');
COMMIT;
SET FOREIGN_KEY_CHECKS = 1;
11 UDFs
package com.spark学习
import com.baidu.utils.LoggerTrait
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.DataTypes
import org.apache.spark.sql.{DataFrame, SparkSession}
object UDF的学习 extends LoggerTrait {
def main(args: Array[String]): Unit = {
val spark: SparkSession = SparkUtils.getSparkSession("UDF的学习")
// Load the data
val df: DataFrame =
spark.read.text("/Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/java/com/spark学习/file/person.txt")
df.show(1000, false)
fun_2(spark, df)
}
def fun_1(spark: SparkSession, df: DataFrame): Unit = {
registerCommonUdfToSpark(spark)
// Register a UDF
spark.udf.register("small2big", (value: String) => {
value.toUpperCase()
})
df.createOrReplaceTempView("tb_person")
spark.sql(
s"""
|select small2big(value) from tb_person
|""".stripMargin).show(1000, false)
import spark.implicits._
val small3big = udf((value: String) => {
value.toUpperCase()
})
df.select('value, small3big('value)).show(1000, false)
}
def fun_2(spark: SparkSession, df: DataFrame): Unit = {
registerCommonUdfToSpark(spark)
df.createOrReplaceTempView("tb_person")
spark.sql(
s"""
|select small2big(value) from tb_person
|""".stripMargin).show(1000, false)
}
private def registerCommonUdfToSpark(spark: SparkSession): Unit = {
/**
* UDF1[String, String]: the first type parameter is the input type (the column or value passed in from the SQL),
* and the last is the return type of the UDF.
*/
spark.udf.register("small2big", new UDF1[String, String] {
override def call(t1: String): String = {
t1.toUpperCase()
}
}, DataTypes.StringType)
}
}
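A registered UDF can also be invoked from the DataFrame DSL without writing SQL. A minimal sketch using functions.callUDF (assuming the df loaded in main above and that small2big has been registered; newer Spark versions offer call_udf as the non-deprecated name):
import org.apache.spark.sql.functions.{callUDF, col}

// Sketch: call the SQL-registered UDF directly from the DSL.
df.select(col("value"), callUDF("small2big", col("value")).as("upper_value")).show(1000, false)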
12 sparkOnHive
https://blog.csdn.net/qq_43659234/article/details/117298401
https://blog.csdn.net/weixin_44911081/article/details/121858327
SparkOnHive: only Hive's metadata (database/table/column/location information) is used; everything else comes from SparkSQL, e.g. the execution engine, SQL parsing, physical planning, and SQL optimization.
Start the metastore first: hive --service metastore
package com.baidu.sparkcodetest.hive
import com.baidu.utils.LoggerTrait
import org.apache.spark.sql.SparkSession
object Demo_hive extends LoggerTrait{
def main(args: Array[String]): Unit = {
// Create the environment (requires: hive --service metastore)
val spark = SparkSession.builder().appName("Demo_flatMap_map").master("local[*]")
.config("spark.sql.shuffle.partitions", "4")
.config("spark.sql.warehouse.dir", "hdfs://localhost:8020/user/hive/warehouse") // 指定hdfs位置
.config("hive.metastore.uris", "thrift://localhost:9083")
.enableHiveSupport()
.getOrCreate()
import spark.implicits._
spark.sql("show databases").show(false)
spark.sql("show tables").show(false)
spark.sql("create table person(id int, name string, age int) row format delimited fields terminated by ' '")
spark.sql("load data local inpath 'file:///Users/zhaoshuai11/Desktop/baidu/ebiz/stu-scala/src/main/resources/person.txt'" +
" into table person")
spark.sql("select * from person").show(false)
}
}