1. UDF (one row in, one result out): a user-defined (scalar) function that operates on a single row at a time.
import org.apache.spark.sql.SparkSession

// case class describing one line of hobbies.txt: a name and a comma-separated hobby list
// (define it outside the method so toDF can find its encoder)
case class Hobbies(name: String, hobbies: String)

val spark = SparkSession.builder()
.master("local[*]").appName("1").getOrCreate()
import spark.implicits._
val sc = spark.sparkContext
val rdd = sc.textFile("in/hobbies.txt")
val df = rdd.map(x => x.split(" ")).map(x => Hobbies(x(0), x(1)))
.toDF()
df.printSchema()
df.show()
df.createOrReplaceTempView("t1")
spark.udf.register("hobby_num",(x:String)=>x.split(",").size)
val frame = spark.sql("select name,hobbies,hobby_num(hobbies) as hobby_num from t1")
frame.show()
// hobby_num(hobbies) counts the comma-separated hobbies in the hobbies column for each row
// "as hobby_num" is a column alias (not a table alias)
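The same logic can also be applied through the DataFrame API without registering it for SQL. A minimal sketch, assuming the df defined above (hobbyNum is an illustrative name):

import org.apache.spark.sql.functions.udf

// wrap the lambda as a column function instead of a SQL-registered UDF
val hobbyNum = udf((x: String) => x.split(",").length)
df.withColumn("hobby_num", hobbyNum($"hobbies")).show()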
2. UDAF (many rows in, one result out): User-Defined Aggregation Function; it operates across multiple rows, just like the built-in SQL aggregates SUM() and AVG().
val spark = SparkSession.builder().master("local[*]").appName("1").getOrCreate()
import spark.implicits._
val sc = spark.sparkContext
val df = spark.read.json("in/user.json")
df.printSchema()
df.show()
// create and register the custom UDAF
val myudaf = new MyAgeAvgFunction
spark.udf.register("myAvgAge", myudaf)
df.createTempView("biao1")
spark.sql("select myAvgAge(age) as bieming from biao1").show()
}
}
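For reference, spark.read.json expects one JSON object per line. An assumed sample of in/user.json (names and ages are purely illustrative):

{"name": "zs", "age": 20}
{"name": "ls", "age": 30}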
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class MyAgeAvgFunction extends UserDefinedAggregateFunction {
  // schema of the input rows
  override def inputSchema: StructType = {
    new StructType().add("age", LongType)
  }
  // schema of the aggregation buffer: a running sum and a running count
  override def bufferSchema: StructType = {
    new StructType().add("sum", LongType).add("count", LongType)
  }
  // data type of the final result
  override def dataType: DataType = DoubleType
  // whether the function is deterministic, i.e. the same input always produces the same output
  override def deterministic: Boolean = true
  // set the buffer's initial values
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L // running sum
    buffer(1) = 0L // running count
  }
  // fold one new input row into the buffer
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getLong(0) + input.getLong(0)
    buffer(1) = buffer.getLong(1) + 1
  }
  // merge two aggregation buffers (e.g. partial results from different partitions)
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    // total of the ages
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
    // total number of rows
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }
  // compute the final result
  override def evaluate(buffer: Row): Any = {
    buffer.getLong(0).toDouble / buffer.getLong(1)
  }
}
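Note that UserDefinedAggregateFunction is deprecated since Spark 3.0 in favor of the typed Aggregator API. A minimal sketch of the same average under that API (AvgBuffer and MyAvgAggregator are illustrative names, assuming Spark 3.0+):

import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders, functions}

case class AvgBuffer(sum: Long, count: Long)

object MyAvgAggregator extends Aggregator[Long, AvgBuffer, Double] {
  // empty buffer
  def zero: AvgBuffer = AvgBuffer(0L, 0L)
  // fold one input value into the buffer
  def reduce(b: AvgBuffer, a: Long): AvgBuffer = AvgBuffer(b.sum + a, b.count + 1)
  // merge partial buffers
  def merge(b1: AvgBuffer, b2: AvgBuffer): AvgBuffer = AvgBuffer(b1.sum + b2.sum, b1.count + b2.count)
  // final result
  def finish(b: AvgBuffer): Double = b.sum.toDouble / b.count
  def bufferEncoder: Encoder[AvgBuffer] = Encoders.product[AvgBuffer]
  def outputEncoder: Encoder[Double] = Encoders.scalaDouble
}

// registered for SQL use the same way:
// spark.udf.register("myAvgAge", functions.udaf(MyAvgAggregator))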
3. UDTF (one row in, many rows out): User-Defined Table-Generating Function; it turns a single input row into multiple output rows.
// Extend GenericUDTF and override three methods: initialize (declares the output row schema:
// number of columns and their types), process, and close
// enableHiveSupport() must be called before getOrCreate()
// import spark.implicits._, otherwise toDF is unavailable
// the fully qualified class name of f1 (here 'UDF.f1') must be used when registering it
val spark = SparkSession.builder().appName("1").master("local[*]").enableHiveSupport().getOrCreate()
val sc = spark.sparkContext
val rdd = sc.textFile("in/udtf.txt")
rdd.collect().foreach(println)
// import the implicits, otherwise toDF is unavailable
import spark.implicits._
val df = rdd.map(x=>x.split("//")).filter(x=>x(1).equals("ls")).map(x=>(x(0),x(1),x(2))).toDF("id","name","banji")
df.show()
df.createTempView("t1")
spark.sql("create temporary function f1 as 'UDF.f1'")
val frame = spark.sql("select f1(banji) from t1")
frame.show()
}
}
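For reference, the code above assumes that in/udtf.txt uses // as the field delimiter and that the third field is space-separated. An assumed sample (values are purely illustrative):

01//zs//Hadoop scala spark hive hbase
02//ls//Hadoop scala kafka hive hbase Oozie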
import java.util
import org.apache.hadoop.hive.ql.exec.UDFArgumentException
import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF
import org.apache.hadoop.hive.serde2.objectinspector.{ObjectInspector, ObjectInspectorFactory, StructObjectInspector}
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory

class f1 extends GenericUDTF {
  // declare the output row schema: a single string column named "type"
  override def initialize(argOIs: Array[ObjectInspector]): StructObjectInspector = {
    if (argOIs.length != 1) {
      throw new UDFArgumentException("exactly one argument must be passed in")
    }
    if (argOIs(0).getCategory != ObjectInspector.Category.PRIMITIVE) {
      throw new UDFArgumentException("argument type mismatch")
    }
    val fieldNames = new util.ArrayList[String]
    val fieldOIs = new util.ArrayList[ObjectInspector]
    fieldNames.add("type")
    fieldOIs.add(PrimitiveObjectInspectorFactory.javaStringObjectInspector)
    ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldOIs)
  }
  // split the single input value on spaces and forward one output row per token
  override def process(objects: Array[AnyRef]): Unit = {
    val strings = objects(0).toString.split(" ")
    println(strings.mkString(",")) // debug output
    for (str <- strings) {
      val tmp = new Array[String](1)
      tmp(0) = str
      forward(tmp)
    }
  }
  // nothing to clean up
  override def close(): Unit = {
  }
}
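Without Hive support, the same one-row-to-many-rows expansion can be done with built-in DataFrame functions. A minimal sketch, assuming the df with the banji column from above:

import org.apache.spark.sql.functions.{explode, split}

// split banji on spaces and emit one output row per element
df.select($"id", $"name", explode(split($"banji", " ")).as("type")).show()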