SparkSQL_JDBC
package com.chen

import org.apache.spark.SparkConf
import org.apache.spark.sql._

import java.util.Properties

object SparkSQL_JDBC {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration object
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession
    val spark = SparkSession.builder().config(conf).getOrCreate()

    // Read data
    // // Option 1: read with the generic load method
    // spark.read.format("jdbc")
    //   .option("url", "jdbc:mysql://192.168.56.100:3306/maxwell")
    //   .option("driver", "com.mysql.jdbc.Driver")
    //   .option("user", "root")
    //   .option("password", "root123456")
    //   .option("dbtable", "test")
    //   .load().show
    // // Option 2: generic load method, passing the parameters as a Map
    // spark.read.format("jdbc")
    //   .options(Map(
    //     "url" -> "jdbc:mysql://192.168.56.100:3306/maxwell?user=root&password=root123456",
    //     "dbtable" -> "test",
    //     "driver" -> "com.mysql.jdbc.Driver"))
    //   .load().show
    // Option 3: read with the jdbc method
    val props: Properties = new Properties()
    props.setProperty("user", "root")
    props.setProperty("password", "root123456")
    val df: DataFrame = spark.read.jdbc("jdbc:mysql://192.168.56.100:3306/maxwell", "test", props)
    df.show
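    // For large tables the same read can be parallelized by adding Spark's JDBC
    // partitioning options, so one query is issued per partition. A minimal
    // sketch, not in the original: the "id" column, bounds, and partition count
    // are assumptions about the test table.
    // spark.read.format("jdbc")
    //   .option("url", "jdbc:mysql://192.168.56.100:3306/maxwell")
    //   .option("driver", "com.mysql.jdbc.Driver")
    //   .option("user", "root")
    //   .option("password", "root123456")
    //   .option("dbtable", "test")
    //   .option("partitionColumn", "id")
    //   .option("lowerBound", "1")
    //   .option("upperBound", "10000")
    //   .option("numPartitions", "4")
    //   .load().show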
    // Write data
    // Option 1: generic write, with format specifying the output type
    df.write
      .format("jdbc")
      .option("url", "jdbc:mysql://192.168.56.100:3306/maxwell")
      .option("driver", "com.mysql.jdbc.Driver")
      .option("user", "root")
      .option("password", "root123456")
      .option("dbtable", "test2")
      .mode(SaveMode.Append)
      .save()
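    // Option 2: write with the jdbc shorthand method, mirroring the read above.
    // A minimal sketch reusing props; commented out so only one write runs.
    // df.write.mode(SaveMode.Append)
    //   .jdbc("jdbc:mysql://192.168.56.100:3306/maxwell", "test2", props)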
    // Release resources
    spark.stop()
  }
}
SparkSQL
package com.chen

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object SparkSQL {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration object
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession
    val spark = SparkSession.builder().config(conf).getOrCreate()

    // Converting RDD => DataFrame => DataSet needs the implicit conversion rules
    // imported below, otherwise the conversions will not compile.
    // Note: "spark" here is not a package name but the SparkSession object.
    // Read a JSON file to create a DataFrame, e.g. {"username": "lisi","age": 18}
    //val df: DataFrame = spark.read.json("datas/user.json")
    //df.show()
    // // DataFrame => SQL
    // df.createOrReplaceTempView("user")
    // spark.sql("select avg(age) from user").show
    // // DataFrame => DSL (transformations require the implicit conversions)
    import spark.implicits._
    // df.select("age", "username").show
    // df.select($"age" + 1).show
    // df.select('age + 1).show
    // // DataSet (a DataFrame is just a Dataset with a particular type, Row)
    // val seq = Seq(1, 2, 3, 4)
    // val ds: Dataset[Int] = seq.toDS()
    // ds.show()

    // RDD <=> DataFrame
    val rdd: RDD[(Int, String, Int)] = spark.sparkContext.makeRDD(List((1, "zhangsan", 30), (2, "lisi", 28), (3, "wangwu", 20)))
    val df: DataFrame = rdd.toDF("id", "name", "age")
    val rowRDD: RDD[Row] = df.rdd

    // DataFrame <=> DataSet
    val ds: Dataset[User] = df.as[User]
    val df1: DataFrame = ds.toDF()

    // RDD <=> DataSet
    val ds1: Dataset[User] = rdd.map {
      case (id, name, age) => User(id, name, age)
    }.toDS()
    val userRDD: RDD[User] = ds1.rdd

    // // SQL-style syntax
    // df.createOrReplaceTempView("user")
    // //spark.sql("select avg(age) from user").show
    // // DSL-style syntax
    // //df.select("username","age").show()
    // // ***** RDD => DataFrame => DataSet *****
    // // RDD
    // val rdd1: RDD[(Int, String, Int)] = spark.sparkContext.makeRDD(List((1, "zhangsan", 30), (2, "lisi", 28), (3, "wangwu", 20)))
    // // DataFrame
    // val df1: DataFrame = rdd1.toDF("id", "name", "age")
    // //df1.show()
    // // DataSet
    // val ds1: Dataset[User] = df1.as[User]
    // //ds1.show()
    // // ***** DataSet => DataFrame => RDD *****
    // // DataFrame
    // val df2: DataFrame = ds1.toDF()
    // // RDD: the recovered RDD has element type Row; its getXXX methods read
    // // field values, much like a JDBC result set, but the index starts at 0.
    // val rdd2: RDD[Row] = df2.rdd
    // //rdd2.foreach(a=>println(a.getString(1)))
    // // ***** RDD => DataSet *****
    // rdd1.map {
    //   case (id, name, age) => User(id, name, age)
    // }.toDS()
    // // ***** DataSet => RDD *****
    // ds1.rdd

    // Release resources
    spark.stop()
  }
}

case class User(id: Int, name: String, age: Int)
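Going the other way, the RDD[Row] obtained from df.rdd carries no compile-time
schema, so turning it back into a DataFrame requires supplying one explicitly.
A minimal sketch of that step, assuming the same three columns as above and
running inside the main method (the StructType mirrors the toDF call):

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Rebuild a DataFrame from RDD[Row] by declaring the schema by hand.
val schema = StructType(Array(
  StructField("id", IntegerType),
  StructField("name", StringType),
  StructField("age", IntegerType)
))
val dfFromRows: DataFrame = spark.createDataFrame(rowRDD, schema)
dfFromRows.show()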
SparkSQL_UDAF
package com.chen

import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object SparkSQL_UDAF {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration object
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession
    val spark = SparkSession.builder().config(conf).getOrCreate()

    // Read a JSON file to create a DataFrame, e.g. {"username": "lisi","age": 18}
    val df: DataFrame = spark.read.json("datas/user.json")
    df.createOrReplaceTempView("user")
    spark.udf.register("ageAvg", new MyAvgUDAF())
    spark.sql("select ageAvg(age) from user").show
    // Release resources
    spark.close()
  }

  /*
   Custom aggregate function: compute the average age.
   1. Extend UserDefinedAggregateFunction (deprecated since Spark 3.0 in favor
      of Aggregator, shown in SparkSQL_UDAF1 below).
   2. Override its eight methods.
  */
  class MyAvgUDAF extends UserDefinedAggregateFunction {
    // Structure of the input data
    override def inputSchema: StructType = {
      StructType(
        Array(
          StructField("age", LongType)
        )
      )
    }

    // Structure of the aggregation buffer
    override def bufferSchema: StructType = {
      StructType(
        Array(
          StructField("total", LongType),
          StructField("count", LongType)
        )
      )
    }

    // Result type of the function (out)
    override def dataType: DataType = LongType

    // Whether the function is deterministic
    override def deterministic: Boolean = true

    // Initialize the buffer
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      // buffer(0) = 0L
      // buffer(1) = 0L
      buffer.update(0, 0L)
      buffer.update(1, 0L)
    }

    // Fold one input row into the buffer
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer.update(0, buffer.getLong(0) + input.getLong(0))
      buffer.update(1, buffer.getLong(1) + 1)
    }

    // Merge two buffers
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0))
      buffer1.update(1, buffer1.getLong(1) + buffer2.getLong(1))
    }

    // Compute the average; note this is Long division, so the result truncates
    override def evaluate(buffer: Row): Any = {
      buffer.getLong(0) / buffer.getLong(1)
    }
  }
}
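The registered aggregate can also be invoked from the DataFrame DSL rather than
a SQL string, via functions.callUDF, which looks up a registered function by
name. A minimal sketch, assuming the same df as in main above:

import org.apache.spark.sql.functions.callUDF

// Call the registered UDAF by name from the DSL instead of SQL.
df.select(callUDF("ageAvg", df("age"))).show()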
SparkSQL_UDAF1
package com.chen

import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{DataFrame, Encoder, Encoders, SparkSession, functions}

object SparkSQL_UDAF1 {
  def main(args: Array[String]): Unit = {
    // Create the Spark configuration object
    val conf = new SparkConf().setMaster("local[*]").setAppName("sparkSQL")
    // Create the SparkSession
    val spark = SparkSession.builder().config(conf).getOrCreate()

    // Read a JSON file to create a DataFrame, e.g. {"username": "lisi","age": 18}
    val df: DataFrame = spark.read.json("datas/user.json")
    df.createOrReplaceTempView("user")
    spark.udf.register("ageAvg", functions.udaf(new MyAvgUDAF1()))
    spark.sql("select ageAvg(age) from user").show
    // Release resources
    spark.close()
  }

  /*
   Custom aggregate function: compute the average age.
   1. Extend org.apache.spark.sql.expressions.Aggregator with type parameters:
      IN:  input type   Long
      BUF: buffer type  Buff
      OUT: output type  Long
   2. Override its six methods.
  */
  case class Buff(var total: Long, var count: Long)

  class MyAvgUDAF1 extends Aggregator[Long, Buff, Long] {
    // Zero value: initialize the buffer
    override def zero: Buff = {
      Buff(0L, 0L)
    }

    // Fold one input value into the buffer
    override def reduce(buff: Buff, in: Long): Buff = {
      buff.total = buff.total + in
      buff.count = buff.count + 1
      buff
    }

    // Merge two buffers
    override def merge(buff1: Buff, buff2: Buff): Buff = {
      buff1.total = buff1.total + buff2.total
      buff1.count = buff1.count + buff2.count
      buff1
    }

    // Compute the result; note this is Long division, so the result truncates
    override def finish(buff: Buff): Long = {
      buff.total / buff.count
    }

    // Encoder for the buffer
    override def bufferEncoder: Encoder[Buff] = Encoders.product

    // Encoder for the output
    override def outputEncoder: Encoder[Long] = Encoders.scalaLong
  }
}
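An Aggregator can also be applied without registering it at all, as a strongly
typed column on a Dataset. A minimal sketch, assuming the ages are first
selected into a Dataset[Long] (the imports and value names are illustrative):

import org.apache.spark.sql.Dataset
import spark.implicits._

// Use the Aggregator directly as a typed column via toColumn.
val ages: Dataset[Long] = df.select($"age").as[Long]
val avgAge = new MyAvgUDAF1().toColumn.name("ageAvg")
ages.select(avgAge).show()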