### 一、Spark自定义函数UDF
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
object UDFDemo {
  // Models one line of the input file: a user name plus a comma-separated hobby list.
  case class Hobbies(name: String, hobbies: String)

  def main(args: Array[String]): Unit = {
    // Build the Spark session (local mode, all cores — this is a demo).
    val conf = new SparkConf().setAppName("Function").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    val sc = spark.sparkContext
    // Implicits are required so an RDD of case classes can be converted with toDF().
    import spark.implicits._

    // Load the raw text file; each line looks like "name hobby1,hobby2,...".
    val rdd = sc.textFile("in/hobbies.txt")
    // Split each line on the space separating the name from the hobby list,
    // then wrap the two fields in the case class and convert to a DataFrame.
    val hobbyDF = rdd.map(_.split(" "))
      .map(x => Hobbies(x(0), x(1)))
      .toDF()

    // Expose the DataFrame to Spark SQL under the view name "hobby".
    hobbyDF.createOrReplaceTempView("hobby")
    // Register a UDF that counts the comma-separated hobbies in a string.
    // .length is the direct Array accessor (.size goes through an implicit wrapper).
    spark.udf.register("hobby_num", (x: String) => x.split(",").length)
    // show(false) prints the full column contents without truncation.
    spark.sql("select name,hobbies,hobby_num(hobbies) as hobbyNum from hobby").show(false)

    // Release the SparkSession/SparkContext — the original demo leaked them.
    spark.stop()
  }
}
运行结果如下
+-----+----------------------+--------+
|name |hobbies |hobbyNum|
+-----+----------------------+--------+
|alice|jogging,Coding,cooking|3 |
|lina |travel,dance |2 |
+-----+----------------------+--------+
文件 hobbies.txt 的内容如下:
alice jogging,Coding,cooking
lina travel,dance
### 二、Spark自定义函数UDAF