1. SparkSession
The entry point to all Spark SQL functionality is the SparkSession; creating one takes nothing more than SparkSession.builder():
import org.apache.spark.sql.SparkSession
val spark = SparkSession
  .builder()
  .appName("Spark SQL basic example")
  .config("spark.some.config.option", "some-value")
  .getOrCreate()
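Most of the later examples also rely on the session's implicit conversions (Encoders for Dataset.map, toDF() on RDDs, and the $"col" column syntax), which come from a single import:
import spark.implicits._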
2. Creating DataFrames
val df1 = spark.read.json("file:///opt/data/sample_example.json")
Note:
sample_example.json
{"name":"Michael", "age":25,"myScore":[{"score1":19,"score2":23},{"score1":58,"score2":50}]}
{"name":"Andy", "age":30,"myScore":[{"score1":29,"score2":33},{"score1":38,"score2":52},{"score1":88,"score2":71}]}
{"name":"Justin", "age":19,"myScore":[{"score1":39,"score2":43},{"score1":28,"score2":53}]}
Reading a JSON file from HDFS works the same way: spark.read.json("hdfs://lee:8020/json/sample_example.json")
df1.show()
+---+--------------------+-------+
|age| myScore| name|
+---+--------------------+-------+
| 25|[[19, 23], [58, 50]]|Michael|
| 30|[[29, 33], [38, 5...| Andy|
| 19|[[39, 43], [28, 53]]| Justin|
+---+--------------------+-------+
df1.printSchema()
root
|-- age: long (nullable = true)
|-- myScore: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- score1: long (nullable = true)
| | |-- score2: long (nullable = true)
|-- name: string (nullable = true)
myScore is an array; use explode (from org.apache.spark.sql.functions) to flatten it into one row per element:
import org.apache.spark.sql.functions
val df2 = df1.select(df1("name"), functions.explode(df1("myScore"))).toDF("name", "myScore")
df2.show()
+-------+--------+
| name| myScore|
+-------+--------+
|Michael|[19, 23]|
|Michael|[58, 50]|
| Andy|[29, 33]|
| Andy|[38, 52]|
| Andy|[88, 71]|
| Justin|[39, 43]|
| Justin|[28, 53]|
+-------+--------+
df2.printSchema()
root
|-- name: string (nullable = true)
|-- myScore: struct (nullable = true)
| |-- score1: long (nullable = true)
| |-- score2: long (nullable = true)
df2.select("myScore.score1").show()
df2.select(df2("name"),df2("myScore.score2") + 1).show()
+-------+--------------------------------+
| name|(myScore.score2 AS `score2` + 1)|
+-------+--------------------------------+
|Michael| 24|
|Michael| 51|
| Andy| 34|
| Andy| 53|
| Andy| 72|
| Justin| 44|
| Justin| 54|
+-------+--------------------------------+
df2.filter(df2("myScore.score2") > 60).show()
+----+--------+
|name| myScore|
+----+--------+
|Andy|[88, 71]|
+----+--------+
df2.groupBy("name").count().show()
+-------+-----+
| name|count|
+-------+-----+
|Michael| 2|
| Andy| 3|
| Justin| 2|
+-------+-----+
3. Temporary Views
df2.createOrReplaceTempView("score")
val df3 = spark.sql("select * from score")
df3.show()
+-------+--------+
| name| myScore|
+-------+--------+
|Michael|[19, 23]|
|Michael|[58, 50]|
| Andy|[29, 33]|
| Andy|[38, 52]|
| Andy|[88, 71]|
| Justin|[39, 43]|
| Justin|[28, 53]|
+-------+--------+
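createOrReplaceTempView is session-scoped: the view vanishes when the SparkSession ends. For a view shared across sessions of the same application, Spark also offers global temporary views, which live in the reserved global_temp database; a minimal sketch:
df2.createGlobalTempView("score_g")
spark.sql("select * from global_temp.score_g").show()
// Still visible from a fresh session of the same application:
spark.newSession().sql("select * from global_temp.score_g").show()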
4. Converting Between DataFrames, Datasets, and RDDs
1. DataFrame => Dataset
A DataFrame becomes a Dataset through a case class; sample.json holds data in the following format:
{"umengmac":"f4:f5:db:0a:14:20","os":"2","ot":"中国移动","imsi":"460001991339696","ua":"Redmi Note 4X","mac":"f4:f5:db:0a:14:20","openudid":"a88df41c-ab47-489e-b8d9-464d2237be6e","manufacturer":"Xiaomi","sdkversion":24,"mac_real":"f4:f5:db:0a:14:20","serial":"469bf4b20304","buildv":"2.5.3","imei":"866334030818203","osversion":"7.0","umengid":"e6e32197d5ca339b","romversion":"","androidid":"e6e32197d5ca339b","apn":"4","uid":87642693,"source":"SaleChannelFragment:MainActivity->SpecialOnlyGoodsListFragment:SpecialTabCategoryActivity","channelid":"11","v":"2.5.3"}
{"umengmac":"38:29:5a:9d:32:5f","os":"2","ot":"中国移动","imsi":"460078359353359","ua":"OPPO R9s Plus","mac":"38:29:5a:9d:32:5f","openudid":"90921e49-c28d-4e89-954a-092e2ab394b0","manufacturer":"OPPO","sdkversion":23,"mac_real":"38:29:5a:9d:32:5f","serial":"5249b4d1","buildv":"6.5.4","imei":"864251038839530","osversion":"6.0.1","umengid":"8e5e46efa66d1280","romversion":"V3.0","androidid":"8e5e46efa66d1280","apn":"4","uid":203059434,"dna":"ac8e7d5f021fbd57a1708403ef32272f918cdb4c7f68b9a679b3ea4202409963","source":"SpecialOnlyGoodsListFragment:SpecialTabCategoryActivity->EcoNewRecommendDetailFragment:EcoNewRecommendDetailActivity","channelid":"12","v":"6.5.4","utdid":"Vgaq9amW41wDADE9IbAknwPC"}
import spark.implicits._
case class PageView(umengmac: String, os: String, ot: String, imsi: String,
                    ua: String, mac: String, openudid: String, manufacturer: String,
                    sdkversion: String, mac_real: String, serial: String, buildv: String,
                    imei: String, osversion: String, umengid: String, romversion: String,
                    androidid: String, apn: String, uid: String, source: String,
                    channelid: String, v: String)
// Fields absent from the case class (dna, utdid) are dropped from the typed view,
// but show() still prints the underlying rows with the full inferred schema:
val df4 = spark.read.json("/opt/data/sample.json").as[PageView]
df4.show()
+----------------+---+------+---------+--------------------+---------------+---------------+-----------------+-----------------+------------+--------------------+---+---------+----+----------+----------+------------+--------------------+-------------+---------+----------------+-----------------+--------------------+-----+
| androidid|apn|buildv|channelid| dna| imei| imsi| mac| mac_real|manufacturer| openudid| os|osversion| ot|romversion|sdkversion| serial| source| ua| uid| umengid| umengmac| utdid| v|
+----------------+---+------+---------+--------------------+---------------+---------------+-----------------+-----------------+------------+--------------------+---+---------+----+----------+----------+------------+--------------------+-------------+---------+----------------+-----------------+--------------------+-----+
|e6e32197d5ca339b| 4| 2.5.3| 11| null|866334030818203|460001991339696|f4:f5:db:0a:14:20|f4:f5:db:0a:14:20| Xiaomi|a88df41c-ab47-489...| 2| 7.0|中国移动| | 24|469bf4b20304|SaleChannelFragme...|Redmi Note 4X| 87642693|e6e32197d5ca339b|f4:f5:db:0a:14:20| null|2.5.3|
|8e5e46efa66d1280| 4| 6.5.4| 12|ac8e7d5f021fbd57a...|864251038839530|460078359353359|38:29:5a:9d:32:5f|38:29:5a:9d:32:5f| OPPO|90921e49-c28d-4e8...| 2| 6.0.1|中国移动| V3.0| 23| 5249b4d1|SpecialOnlyGoodsL...|OPPO R9s Plus|203059434|8e5e46efa66d1280|38:29:5a:9d:32:5f|Vgaq9amW41wDADE9I...|6.5.4|
+----------------+---+------+---------+--------------------+---------------+---------------+-----------------+-----------------+------------+--------------------+---+---------+----+----------+----------+------------+--------------------+-------------+---------+----------------+-----------------+--------------------+-----+
// Schema matching the five CSV columns listed for df7 below:
case class Keyword(keyword: String, app_id: Int, catalog_name: String,
                   keyword_catalog_pv: Double, keyword_catalog_pv_rate: Double)
val df5 = spark
  .sparkContext
  .textFile("/opt/data/keyword_catalog_day.csv")
  .map(_.split(","))
  .map(attributes => Keyword(attributes(0), attributes(1).toInt, attributes(2),
    attributes(3).toDouble, attributes(4).toDouble))
  .toDF()
df5.createOrReplaceTempView("keyword")
val df6 = spark.sql("select keyword,app_id,catalog_name,keyword_catalog_pv,keyword_catalog_pv_rate from keyword where app_id = 1")
df6.map(row => "keyword:" + row(0)).show(3)
+-------------+
| value|
+-------------+
| keyword:孕妇装|
|keyword:哺乳衣外出|
| keyword:孕妇裤|
+-------------+
2. RDD => DataFrame
Inferring the schema via reflection: an RDD whose elements are case classes is converted to a DataFrame automatically, with the case class defining the table schema (this is exactly what the Keyword example above does).
To turn each row into a Map[K,V], note that an implicit org.apache.spark.sql.Encoders.kryo encoder is required, as in the sketch below.
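A minimal sketch of that pattern, run against df2 from section 2 (the encoder line is what the note above refers to):
import org.apache.spark.sql.{Encoder, Encoders}
// No predefined encoder exists for Dataset[Map[K, V]], so define one via kryo:
implicit val mapEncoder: Encoder[Map[String, Any]] = Encoders.kryo[Map[String, Any]]
// Each Row becomes a Map from column name to value:
df2.map(row => row.getValuesMap[Any](List("name", "myScore"))).collect().foreach(println)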
When a case class cannot be defined in advance, specify the schema programmatically:
- convert the original RDD to an RDD of Rows;
- build a StructType schema that matches the Row structure;
- apply the schema from step 2 to the Row RDD from step 1 via SparkSession's createDataFrame.
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
val schemaString = "keyword,app_id,catalog_name,keyword_catalog_pv,keyword_catalog_pv_rate"
val fields = schemaString.split(",").map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)
val rd1 = spark.sparkContext.textFile("/opt/data/keyword_catalog_day.csv").map(_.split(","))
  .map(row => Row(row(0), row(1), row(2), row(3), row(4)))
val df7 = spark.createDataFrame(rd1, schema)
df7.createOrReplaceTempView("keyword_catalog_day")
val df8 = spark.sql("select * from keyword_catalog_day")
df8.show()
+--------------------+------+------------+------------------+-----------------------+
| keyword|app_id|catalog_name|keyword_catalog_pv|keyword_catalog_pv_rate|
+--------------------+------+------------+------------------+-----------------------+
| 睡袋| 2| 母婴| 43| 0.00401982|
| 吸奶器| 2| 母婴| 25| 0.00233710|
| 纸巾| 2| 居家日用| 22| 0.00205665|
+--------------------+------+------------+------------------+-----------------------+
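The reverse direction needs no ceremony: .rdd on any DataFrame yields an RDD[Row]; a short sketch using df8 from above:
val rowRDD = df8.rdd
// Each element is an org.apache.spark.sql.Row; fields are accessed by position:
rowRDD.map(r => r.getString(0)).take(3).foreach(println)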
5. Generic Load and Save
Generic loading covers json, parquet, jdbc, orc, libsvm, csv, and text sources; load() with no explicit format uses the default, parquet:
val df9 = spark.read.load("/opt/data/users.parquet")
df9.select("name","favorite_color").write.save("nameAndFavoriteColors.parquet")
Save modes:
- SaveMode.ErrorIfExists (the default): throw an exception if data already exists at the target.
- SaveMode.Append: append to the existing data.
- SaveMode.Overwrite: replace the existing data.
- SaveMode.Ignore: if data already exists, save nothing and leave the existing contents untouched, analogous to CREATE TABLE IF NOT EXISTS in SQL.
import org.apache.spark.sql.SaveMode
df9.select("name","favorite_color").write.mode(SaveMode.Append).save("nameAndFavoriteColors.parquet")
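Formats other than the default are selected with format(...); a minimal sketch reading the CSV file used earlier (the options are illustrative; the file has no header row):
val csvDF = spark.read
  .format("csv")
  .option("sep", ",")
  .option("inferSchema", "true")
  .load("/opt/data/keyword_catalog_day.csv")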
6. Bucketing, Sorting, and Partitioning
1. With file-based data sources, the output can be partitioned, bucketed, and sorted; bucketing and sorting apply only to persistent tables (saveAsTable):
df5.write
.mode(SaveMode.Overwrite)
.partitionBy("app_id")
.bucketBy(10,"keyword")
.sortBy("keyword_catalog_pv_rate")
.saveAsTable("keyword_bucket")
val df10 = spark.read.table("keyword_bucket")
df10.show()
df10.printSchema()
+-------+------------+------------------+-----------------------+------+
|keyword|catalog_name|keyword_catalog_pv|keyword_catalog_pv_rate|app_id|
+-------+------------+------------------+-----------------------+------+
| 风扇| 79| 8.1931E-4| 2.0180502E7| 1|
| 男童凉鞋| 79| 8.1931E-4| 2.0180502E7| 1|
| 隔离霜| 50| 5.1855E-4| 2.0180502E7| 1|
+-------+------------+------------------+-----------------------+------+
root
|-- keyword: string (nullable = true)
|-- catalog_name: string (nullable = true)
|-- keyword_catalog_pv: double (nullable = true)
|-- keyword_catalog_pv_rate: double (nullable = true)
|-- app_id: integer (nullable = true)
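Since the table is partitioned by app_id, a filter on that column prunes entire directories instead of scanning the whole table; one way to verify (the physical plan should list app_id under PartitionFilters):
spark.read.table("keyword_bucket").filter($"app_id" === 1).explain()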
7. Schema Merging
Parquet supports schema evolution: users can start with a simple schema and gradually add more columns as needed. When reading such Parquet files, either set the data source option mergeSchema to true, or set the global SQL option spark.sql.parquet.mergeSchema to true.
val squareDF = spark.sparkContext
.makeRDD(1 to 5)
.map(i =>(i, i * i))
.toDF("value","square")
squareDF.write.mode(SaveMode.Overwrite).parquet("data/test_table/key=1")
val cubeDF = spark.sparkContext
.makeRDD(6 to 10)
.map(i=>(i, i * i * i))
.toDF("value","cube")
cubeDF.write.mode(SaveMode.Overwrite).parquet("data/test_table/key=2")
val mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table")
mergedDF.printSchema()
root
|-- value: integer (nullable = true)
|-- square: integer (nullable = true)
|-- cube: integer (nullable = true)
|-- key: integer (nullable = true)
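Merging can also be switched on globally rather than per read, via the session configuration; a short sketch:
spark.conf.set("spark.sql.parquet.mergeSchema", "true")
val mergedDF2 = spark.read.parquet("data/test_table")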
8. Loading Hive Tables
When creating a Hive table through Spark SQL, a custom storage handler cannot be specified; such tables can be created on the Hive side instead. Two ways to declare the storage format (see the SparkSession sketch below for the required Hive support):
Example 1: create table keywords(keyword string, catalog_name string) using hive options(fileFormat 'textfile', fieldDelim ',')
Example 2: create table keywords(keyword string, catalog_name string) row format delimited fields terminated by ','
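Running these statements from Spark requires a SparkSession built with Hive support enabled; a minimal sketch:
val spark = SparkSession.builder()
  .appName("Spark Hive example")
  .enableHiveSupport()  // connects to the Hive metastore and enables Hive SerDes
  .getOrCreate()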
spark.sql("drop table if exists keywords")
spark.sql("create table if not exists keywords (keyword String,app_id Int,catalog_name String,keyword_catalog_pv Double,keyword_catalog_pv_rate Double) row format delimited fields terminated by ','")
spark.sql("load data local inpath '/opt/data/keyword_catalog_day.csv' into table keywords")
spark.sql("select * from keywords").show()
+--------------------+------+------------+------------------+-----------------------+
| keyword|app_id|catalog_name|keyword_catalog_pv|keyword_catalog_pv_rate|
+--------------------+------+------------+------------------+-----------------------+
| 睡袋| 2| 母婴| 43.0| 0.00401982|
| 吸奶器| 2| 母婴| 25.0| 0.0023371|
| 纸巾| 2| 居家日用| 22.0| 0.00205665|
+--------------------+------+------------+------------------+-----------------------+
9. Connecting to Other Databases via JDBC
Two ways to read:
(1) spark.read.format(..).options(..).load()
(2) spark.read.jdbc(..)
Useful options:
- numPartitions: maximum number of partitions used for read/write parallelism
- fetchsize: (read) how many rows are fetched per round trip
- batchsize: (write) how many rows are inserted per round trip
Note: starting with MySQL Connector/J 8.0.11, the driver class changed from com.mysql.jdbc.Driver to com.mysql.cj.jdbc.Driver.
val url = "jdbc:mysql://lee:3306/meiyou?createDatabaseIfNotExist=true&useSSL=false"
val tableName = "user_pro"
// Each predicate becomes one partition of the read:
val predicates = Array("age < 20", "age >= 20 and age < 26", "age >= 26")
val prop = new java.util.Properties()
prop.setProperty("user", "root")
prop.setProperty("password", "EOSspark123")
prop.setProperty("driver", "com.mysql.cj.jdbc.Driver")
val df11 = spark.read.jdbc(url, tableName, predicates, prop)
Equivalently, with the generic format-based reader:
val df12 = spark.read
.format("jdbc")
.option("url", "jdbc:mysql://lee:3306/meiyou?createDatabaseIfNotExist=true&useSSL=false")
.option("driver","com.mysql.cj.jdbc.Driver")
.option("dbtable","user_pro")
.option("user","root")
.option("password","EOSspark123")
.load()
df11.show()
df11.printSchema()
+---------+-----------+---------+---+----------+------+----------+------------------------+---------------+---------+------+--------+-----+---------+-----------+
| uid| ceil| qq|age| birthday|height|is_married|duration_of_menstruation|menstrual_cycle|star_sign|weight|province| city|recipient| recip_ceil|
+---------+-----------+---------+---+----------+------+----------+------------------------+---------------+---------+------+--------+-----+---------+-----------+
|2***1|1***6| | 19|12-21| 16| false| 5| 128| 11| 5.7| 江苏省| ***市| 朱*|1***6|
| 7***6|1***0| | 18|01-04| 11| false| 5| 228| 12| 5.0| 浙江省| ***市| 郑*|1***0|
| 7***3|1***3| | 19|10-18| 16| false| 7| 630| 9| 4.6| 湖南省| ***市| 张*|1***3|
+---------+-----------+---------+---+----------+------+----------+------------------------+---------------+---------+------+--------+-----+---------+-----------+
root
|-- uid: integer (nullable = true)
|-- ceil: string (nullable = true)
|-- qq: string (nullable = true)
|-- age: integer (nullable = true)
|-- birthday: string (nullable = true)
|-- height: double (nullable = true)
|-- is_married: boolean (nullable = true)
|-- duration_of_menstruation: integer (nullable = true)
|-- menstrual_cycle: integer (nullable = true)
|-- star_sign: integer (nullable = true)
|-- weight: double (nullable = true)
|-- province: string (nullable = true)
|-- city: string (nullable = true)
|-- recipient: string (nullable = true)
|-- recip_ceil: string (nullable = true)
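The parallelism and round-trip options listed at the start of this section are passed the same way; a sketch with illustrative values (a partitioned read needs partitionColumn, lowerBound and upperBound alongside numPartitions; user_pro_copy is a hypothetical target table):
val tunedDF = spark.read
  .format("jdbc")
  .option("url", url)
  .option("driver", "com.mysql.cj.jdbc.Driver")
  .option("dbtable", "user_pro")
  .option("user", "root")
  .option("password", "EOSspark123")
  .option("partitionColumn", "age")
  .option("lowerBound", "10")
  .option("upperBound", "60")
  .option("numPartitions", "4")  // at most 4 parallel read partitions
  .option("fetchsize", "1000")   // rows fetched per round trip
  .load()
tunedDF.write
  .mode(SaveMode.Append)
  .option("batchsize", "1000")   // rows inserted per round trip
  .jdbc(url, "user_pro_copy", prop)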
10. User-Defined Functions: UDF, UDAF, and UDTF
/**
 * UDF: a single-row transformation.
 * height, weight => BMI = weight (kg) / height (m) squared
 */
spark.udf.register("get_bmi", (weight: Double, height: Double) => {
  val ht = if (height > 100) height / 100 else height  // treat values above 100 as centimeters
  weight / Math.pow(ht, 2)
})
df11.createOrReplaceTempView("user_pro")
val df13 = spark.sql("select *, get_bmi(weight, height) bmi from user_pro")
df13.show()
df13.printSchema()
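The same function can be applied through the DataFrame API rather than SQL, using org.apache.spark.sql.functions.udf; a sketch:
import org.apache.spark.sql.functions.udf
val bmi = udf((weight: Double, height: Double) => {
  val ht = if (height > 100) height / 100 else height
  weight / Math.pow(ht, 2)
})
df11.withColumn("bmi", bmi($"weight", $"height")).show()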
UDAFs:
Untyped: extend UserDefinedAggregateFunction and implement its methods; the more flexible option.
Strongly typed: extend Aggregator and implement its methods; suited to typed Datasets (e.g., data loaded into case classes from JSON).
spark.udf.register("get_average", new AverageAge1)
val df14 = spark.sql("select province, get_average(age) age1 from user_pro group by province")
df14.show()
df14.printSchema()
+--------+------------------+
|province| age1|
+--------+------------------+
| 西藏自治区|23.444444444444443|
| 北京市| 23.21028334925183|
| 辽宁省|23.036282771535582|
+--------+------------------+
root
|-- province: string (nullable = true)
|-- age1: double (nullable = true)
val df15 = spark.sql("select * from user_pro").as[UserPor]
val aver = new Average2().toColumn.name("age2")
val df16 = df15.select(aver)
df16.show()
df16.printSchema()
+----------------+
| age2|
+----------------+
|22.6388599958394|
+----------------+
root
|-- age2: double (nullable = false)
/**
 * Untyped UDAF
 */
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, DoubleType, IntegerType, StructField, StructType}
class AverageAge1 extends UserDefinedAggregateFunction {
  // Input schema
  override def inputSchema: StructType = StructType(StructField("age", IntegerType) :: Nil)
  // Buffer schema: the per-partition shared variables
  override def bufferSchema: StructType = StructType(StructField("sum", IntegerType) :: StructField("count", IntegerType) :: Nil)
  // Output type of the UDAF
  override def dataType: DataType = DoubleType
  // Whether the same input always produces the same output
  override def deterministic: Boolean = true
  // Initialize the buffer for each partition
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0
    buffer(1) = 0
  }
  // Called for every input row within a partition
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    // Add this row's age to the running sum
    buffer(0) = buffer.getInt(0) + input.getInt(0)
    // Increment the row count
    buffer(1) = buffer.getInt(1) + 1
  }
  // Merge the per-partition buffers into the final buffer
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getInt(0) + buffer2.getInt(0)
    buffer1(1) = buffer1.getInt(1) + buffer2.getInt(1)
  }
  // Produce the result: total age / number of rows
  override def evaluate(buffer: Row): Any = buffer.getInt(0).toDouble / buffer.getInt(1)
}
/**
 * Strongly typed UDAF
 */
import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
class Average2 extends Aggregator[UserPor, Aver, Double] {
  // Initialize the buffer (the per-partition shared variable)
  override def zero: Aver = Aver(0, 0)
  // Called for every input element within a partition
  override def reduce(b: Aver, a: UserPor): Aver = {
    b.sum = b.sum + a.age
    b.count = b.count + 1
    b
  }
  // Merge the per-partition buffers into the final buffer
  override def merge(b1: Aver, b2: Aver): Aver = {
    b1.sum = b1.sum + b2.sum
    b1.count = b1.count + b2.count
    b1
  }
  // Produce the result
  override def finish(reduction: Aver): Double = reduction.sum.toDouble / reduction.count
  // Encoder for the buffer type
  override def bufferEncoder: Encoder[Aver] = Encoders.product
  // Encoder for the output type
  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}
case class UserPor(uid: Int, ceil: String, qq: String, age: Int, birthday: String,
                   height: Double, is_married: Boolean, duration_of_menstruation: Int,
                   menstrual_cycle: Int, star_sign: Int, weight: Double, province: String,
                   city: String, recipient: String, recip_ceil: String)
case class Aver(var sum: Int, var count: Int)
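The section title also mentions UDTFs (one input row producing multiple output rows). Spark SQL's Scala API has no direct register call for them; the usual substitutes are built-in generators such as explode (used in section 2) or flatMap on a typed Dataset. A sketch against df4 from section 4:
// Each PageView's source string expands into several rows, UDTF-style:
val sources = df4.flatMap(p => p.source.split("->"))
sources.show(truncate = false)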