import org.apache.spark.SparkContext
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
 * User-defined functions in Spark SQL:
 * UDF:  User-Defined Function — operates on a single row's value; one row in, one value out
 * UDAF: User-Defined Aggregation Function — aggregates over multiple rows (e.g. sum(), avg()); many in, one out
 * UDTF: User-Defined Table-Generating Function — one input row yields multiple output rows; one in, many out
 *
 */
/** One parsed input row: a person's name plus their comma-separated hobby list (e.g. "alice read,swim"). */
case class Hobbies(name:String,hobbies:String)
/**
 * Demonstrates registering and applying a UDF in two ways:
 * via SQL (`spark.udf.register` + temp view) and via the DataFrame API
 * (`functions.udf` + `withColumn`). The UDF counts comma-separated hobbies.
 */
object UDF {
  def main(args: Array[String]): Unit = {
    val sparkSession: SparkSession = SparkSession.builder().appName("UDF").master("local[*]").getOrCreate()
    val sc: SparkContext = sparkSession.sparkContext
    import sparkSession.implicits._

    // Each input line is "name hobby1,hobby2,...": split on the first space-delimited pair.
    val hobbyDF: DataFrame = sc.textFile("in/hobbies.txt")
      .map(_.split(" "))
      .map(x => Hobbies(x(0), x(1)))
      .toDF()
    hobbyDF.printSchema()
    hobbyDF.show(false) // truncate = false: display full cell contents

    // --- SQL style: register the UDF under a name, then call it from a SQL query ---
    hobbyDF.createOrReplaceTempView("hobby")
    sparkSession.udf.register("hobby_num", (x: String) => x.split(",").length)
    sparkSession.sql("select name,hobbies,hobby_num(hobbies)as hobbyNum from hobby")
      .show(false)
    println("----------------------------------")

    // --- DataFrame style: wrap the function with functions.udf and apply it via withColumn ---
    import org.apache.spark.sql.functions
    val hobby_num: UserDefinedFunction = functions.udf((x: String) => x.split(",").length)
    val newHobbyDF: DataFrame = hobbyDF.withColumn("hobbyNum", hobby_num($"hobbies"))
    newHobbyDF.printSchema()
    newHobbyDF.show(false)

    // Release driver/executor resources; the original version leaked the SparkSession.
    sparkSession.stop()
  }
}
// Spark_UDF
// (blog-scrape residue; original page note: latest recommended article published 2024-09-15 11:31:49)