Spark SQL Custom Functions
Spark custom functions follow much the same pattern as Hive's, so Hive UDF code can serve as a reference when writing them.
UDF: one row in, one row out
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}

case class Hobbies(name: String, hobbies: String)

object UDFDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("UDFFunction")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    import spark.implicits._

    val rdd: RDD[String] = sc.textFile("in/hobbies.txt")
    val hobbyDF: DataFrame = rdd.map(x => x.split(" ")).map(x => Hobbies(x(0), x(1))).toDF()
    hobbyDF.createOrReplaceTempView("hobby")
    // register a UDF that counts the comma-separated hobbies in a string
    spark.udf.register("hobby_name", (x: String) => x.split(",").size)
    // show(false) keeps long column values from being truncated
    spark.sql("select name,hobbies,hobby_name(hobbies) as hobbynum from hobby").show(false)
  }
}
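For comparison, the same function can be applied through the DataFrame API instead of SQL by wrapping the lambda with the built-in udf() helper. A minimal sketch, assuming the hobbyDF and spark.implicits._ import from the example above (hobbyNum is a name introduced here for illustration):

import org.apache.spark.sql.functions.udf
// wrap the lambda as a column expression usable in select/withColumn
val hobbyNum = udf((s: String) => s.split(",").size)
hobbyDF.withColumn("hobbynum", hobbyNum($"hobbies")).show(false)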
UDTF: one row in, multiple rows out
package nj.zb.kb11.fuction
import java.util
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory
import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorFactory, StructObjectInspector}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
object UDTFDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("udtfdemo").setMaster("local[*]")
    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .config("hive.metastore.uris", "thrift://192.168.153.141:9083")
      .enableHiveSupport()
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    import spark.implicits._

    val rdd: RDD[String] = sc.textFile("in/UDTF.txt")
    val frame: DataFrame = rdd.map(x => x.split("//"))
      .filter(x => x(1).equals("ls"))
      .map(x => (x(0), x(1), x(2)))
      .toDF("id", "name", "class")
    frame.createOrReplaceTempView("udtftable")
    // frame.printSchema()
    // frame.show()
    // spark.udf.register("myudtf", new MyUDTF()) // a Hive GenericUDTF cannot be registered this way
    // register the Hive UDTF through Hive SQL instead; this requires enableHiveSupport()
    spark.sql("CREATE TEMPORARY FUNCTION Myudtf AS 'nj.zb.kb11.fuction.MyUDTF'")
    spark.sql("select Myudtf(class) from udtftable").show()
  }
}
// Hive-style UDTF implementation
class MyUDTF extends GenericUDTF {
  // Hive invokes initialize(StructObjectInspector); the Array[ObjectInspector] overload is
  // deprecated, so a single override of the current signature is sufficient
  override def initialize(argOIs: StructObjectInspector): StructObjectInspector = {
    val fieldName = new java.util.ArrayList[String]
    val fieldOIS = new util.ArrayList[ObjectInspector]
    // define the name and type of the output column
    fieldName.add("type")
    fieldOIS.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector)
    ObjectInspectorFactory.getStandardStructObjectInspector(fieldName, fieldOIS)
  }
  // input: "Hadoop scala spark hive hbase"
  /* output column "type" (String), one row per word:
     Hadoop
     scala
     spark
     hive
     hbase
  */
  override def process(objects: Array[AnyRef]): Unit = {
    // split the input string into individual words
    val strings: Array[String] = objects(0).toString.split(" ")
    for (str <- strings) {
      val temp: Array[String] = new Array[String](1)
      temp(0) = str
      // emit one output row per word
      forward(temp)
    }
  }

  override def close(): Unit = {}
}
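A hypothetical in/UDTF.txt consistent with the split("//") parsing and the filter on name == "ls" (the rows below are illustrative only, not the original data set):

01//zs//Hadoop scala spark hive hbase
02//ls//Hadoop scala kafka hive hbase
03//ww//spark hive flink

For the matching "ls" row, Myudtf(class) explodes the space-separated class string into one output row per word.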
UDAF: multiple rows in, one row out
package nj.zb.kb11

import org.apache.spark.SparkConf
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

case class Student(id: Int, name: String, gender: String, age: Int)

object UDAFDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("UDAFFunction")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    import spark.implicits._

    val students = Seq(
      Student(1, "zhangsan", "F", 20),
      Student(2, "lisi", "F", 30),
      Student(3, "wangwu", "F", 25),
      Student(4, "zhaoliu", "F", 26),
      Student(5, "songba", "M", 44),
      Student(6, "qianjiu", "F", 55),
      Student(7, "zhoushi", "M", 33),
      Student(8, "fengshiyi", "M", 22),
      Student(9, "chenshier", "M", 66),
      Student(10, "weishisan", "F", 77)
    )
    val frame: DataFrame = students.toDF()
    frame.createOrReplaceTempView("students")
    // equivalent to: select avg(age) from students group by gender;
    val myUDAF = new MyAgeAvgFunction
    spark.udf.register("myAvg", myUDAF)
    val resultDF: DataFrame = spark.sql("select gender, myAvg(age) as avgage from students group by gender")
    resultDF.printSchema()
    resultDF.show()
  }
}
class MyAgeAvgFunction extends UserDefinedAggregateFunction {
  // schema of the aggregate function's input
  override def inputSchema: StructType = {
    new StructType().add("age", LongType)
    // StructType(StructField("age", LongType) :: Nil) // either form works
  }
  // schema of the aggregation buffer
  override def bufferSchema: StructType = {
    new StructType().add("sum", LongType).add("count", LongType)
    // StructType(StructField("sum", LongType) :: StructField("count", LongType) :: Nil) // either form works
  }
  // type of the aggregate function's return value
  override def dataType: DataType = DoubleType
  // whether the same input always yields the same output, i.e. the function is deterministic
  override def deterministic: Boolean = true
  // initialize the buffer
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L // running sum of all ages seen
    buffer(1) = 0L // running count of rows seen
  }
  // fold one new input row into the buffer
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getLong(0) + input.getLong(0)
    buffer(1) = buffer.getLong(1) + 1
  }
  // merge buffers from different partitions
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0) // total age
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1) // total row count
  }
  // compute the final result
  override def evaluate(buffer: Row): Any = {
    buffer.getLong(0).toDouble / buffer.getLong(1)
  }
}
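Note that UserDefinedAggregateFunction is deprecated as of Spark 3.0 in favor of the typed Aggregator API registered via functions.udaf. A minimal sketch of the same average under that API, assuming Spark 3.0+ (AvgBuffer and MyAvgAggregator are names introduced here for illustration):

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.functions

// mutable buffer holding the running sum and count
case class AvgBuffer(var sum: Long, var count: Long)

object MyAvgAggregator extends Aggregator[Long, AvgBuffer, Double] {
  def zero: AvgBuffer = AvgBuffer(0L, 0L) // initial buffer
  def reduce(b: AvgBuffer, a: Long): AvgBuffer = { // fold in one input value
    b.sum += a; b.count += 1; b
  }
  def merge(b1: AvgBuffer, b2: AvgBuffer): AvgBuffer = { // combine partition buffers
    b1.sum += b2.sum; b1.count += b2.count; b1
  }
  def finish(b: AvgBuffer): Double = b.sum.toDouble / b.count // final result
  def bufferEncoder: Encoder[AvgBuffer] = Encoders.product
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

// registered the same way as before, so the SQL above is unchanged:
// spark.udf.register("myAvg", functions.udaf(MyAvgAggregator))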