Conversions among RDD, DataSet, and DataFrame:
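A minimal conversion sketch, assuming a SparkSession named spark (with spark.implicits._ imported) and a simple User case class:
import spark.implicits._
case class User(id: Long, name: String, age: Long)
val rdd = spark.sparkContext.makeRDD(List(User(1L, "zhangsan", 20L)))
val df   = rdd.toDF()    // RDD -> DataFrame
val ds   = df.as[User]   // DataFrame -> Dataset
val ds2  = rdd.toDS()    // RDD -> Dataset
val rdd2 = ds.rdd        // Dataset (or DataFrame) -> RDD
val df2  = ds2.toDF()    // Dataset -> DataFrame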
UDF
//register the custom function's name and implementation with spark.udf
spark.udf.register("prefixName",(name: String) => {
"Name:" + name
})
spark.sql("select id,prefixName(name),age from t_user").show()
UDAF (Spark 3.0+, strongly typed)
spark.udf.register("avgAge",functions.udaf(new MyAvgAgeUDAF()))
spark.sql("select avgAge(age) from t_user").show()
case class AvgBuffer(var sum: Long, var cnt: Long)
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders, functions}
class MyAvgAgeUDAF extends Aggregator[Long,AvgBuffer,Long]{
override def zero: AvgBuffer = {
AvgBuffer(0L, 0L)
}
override def reduce(buffer: AvgBuffer, age: Long): AvgBuffer = {
buffer.sum += age
buffer.cnt += 1
buffer
}
override def merge(buffer1: AvgBuffer, buffer2: AvgBuffer): AvgBuffer = {
buffer1.sum += buffer2.sum
buffer1.cnt += buffer2.cnt
buffer1
}
override def finish(reduction: AvgBuffer): Long = {
reduction.sum / reduction.cnt   // integer division truncates; see the Double variant below
}
override def bufferEncoder: Encoder[AvgBuffer] = Encoders.product
override def outputEncoder: Encoder[Long] = Encoders.scalaLong
}
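Since finish uses Long division, the fractional part of the average is lost. A variant that returns a Double instead (a sketch; AvgBuffer is reused from above and only the output type changes):
class MyAvgAgeDoubleUDAF extends Aggregator[Long, AvgBuffer, Double]{
override def zero: AvgBuffer = AvgBuffer(0L, 0L)
override def reduce(b: AvgBuffer, age: Long): AvgBuffer = { b.sum += age; b.cnt += 1; b }
override def merge(b1: AvgBuffer, b2: AvgBuffer): AvgBuffer = { b1.sum += b2.sum; b1.cnt += b2.cnt; b1 }
override def finish(r: AvgBuffer): Double = r.sum.toDouble / r.cnt
override def bufferEncoder: Encoder[AvgBuffer] = Encoders.product
override def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}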
UDAF (pre-3.0, strongly typed)
//the pre-3.0 Aggregator cannot be registered for SQL use, so the strongly typed aggregation is applied to a Dataset
val ds = df.as[User]
val udaf = new MyAvgAgeUDAF
//DSL style: the aggregator is applied as a typed column
ds.select(udaf.toColumn).show
case class User(id: Long,name: String, age: Long)
case class AvgBuffer(var sum: Long, var cnt: Long)
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
class MyAvgAgeUDAF extends Aggregator[User,AvgBuffer,Long]{
override def zero: AvgBuffer = {
AvgBuffer(0L, 0L)
}
override def reduce(b: AvgBuffer, user: User): AvgBuffer = {
b.sum += user.age
b.cnt += 1
b
}
override def merge(b1: AvgBuffer, b2: AvgBuffer): AvgBuffer = {
b1.sum += b2.sum
b1.cnt += b2.cnt
b1
}
override def finish(reduction: AvgBuffer): Long = {
reduction.sum / reduction.cnt
}
override def bufferEncoder: Encoder[AvgBuffer] = Encoders.product
override def outputEncoder: Encoder[Long] = Encoders.scalaLong
}
UDAF (pre-3.0, weakly typed; UserDefinedAggregateFunction is deprecated since Spark 3.0 in favor of functions.udaf)
spark.udf.register("avgAge",new MyavgAgeUDAF)
spark.sql("select avgAge(age) from t_user").show()
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, LongType, StructField, StructType}
class MyAvgAgeUDAF extends UserDefinedAggregateFunction{
//input schema: a single Long column
override def inputSchema: StructType = {
StructType(Array(StructField("age", LongType)))
}
//buffer schema: running sum and count
//(parentheses, not braces: a {...} block would keep only its last expression and drop "sum")
override def bufferSchema: StructType = {
StructType(Array(
StructField("sum", LongType),
StructField("cnt", LongType)
))
}
//result type
override def dataType: DataType = LongType
override def deterministic: Boolean = true
override def initialize(buffer: MutableAggregationBuffer): Unit = {
buffer.update(0, 0L)
buffer.update(1, 0L)
}
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
buffer.update(0,buffer.getLong(0) + input.getLong(0))
buffer.update(1,buffer.getLong(1) + 1)
}
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
buffer1.update(0,buffer1.getLong(0)+buffer2.getLong(0))
buffer1.update(1,buffer1.getLong(1)+buffer2.getLong(1))
}
override def evaluate(buffer: Row): Any = {
buffer.getLong(0) / buffer.getLong(1)   // integer division
}
}
Generic data loading and saving:
SparkSQL provides a generic API for saving and loading data. By default, SparkSQL reads and writes files in Parquet format.
spark.read.format("json").load("data/user.json") is equivalent to spark.read.json("data/user.json")
save() is the generic write method; it does not know what format the data should be written in, so it defaults to Parquet unless format(...) is specified.
To prevent data from being accidentally overwritten, writing fails if the output path already exists.
If you are sure this is not a problem, change the save mode ("error"/"errorifexists" is the default; "append", "overwrite", and "ignore" are the alternatives):
df.write.mode("append").format("json").save("output")
SparkSQL reads files the Hadoop way, i.e., line by line:
sparksql -> sparkcore -> hadoop -> read line by line -> each line must be a complete JSON object
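A minimal round-trip sketch of the generic API (both paths are assumptions). Note that the expected JSON input is one complete object per line (JSON Lines), not a pretty-printed array:
//data/user.json:
//{"id":1,"name":"zhangsan","age":20}
//{"id":2,"name":"lisi","age":30}
val df = spark.read.format("json").load("data/user.json")
df.write.mode("overwrite").format("parquet").save("output/user_parquet")
val back = spark.read.load("output/user_parquet")   // no format(): Parquet is the default
back.show()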
Connecting Spark to an externally deployed Hive (a session sketch follows the steps):
1. Copy hive-site.xml into Spark's conf directory
2. Copy the MySQL driver jar into Spark's jars directory
3. If HDFS cannot be accessed, also copy core-site.xml and hdfs-site.xml into Spark's conf directory
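In code, Hive support must also be enabled on the SparkSession; a minimal sketch (master and app name are assumptions):
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder()
.master("local[*]")
.appName("hive-demo")
.enableHiveSupport()   // required for Hive metastore access
.getOrCreate()
spark.sql("show tables").show()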