callUDF 用于将通过 SQL 注册的 UDF 转为对外部 DataFrame API 可见(可在 DataFrame 操作中直接调用)
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
object TestDemo {

  /**
   * Demo: register a UDF by name with `spark.udf.register`, then invoke it
   * from the DataFrame API via `callUDF`.
   *
   * Fix vs. original: the SparkSession was never stopped, leaking the local
   * Spark context when run repeatedly (e.g. from a test runner).
   */
  def main(args: Array[String]): Unit = {
    // Local-mode session, all cores.
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("test-demo")
      .getOrCreate()
    import spark.implicits._

    val df = Seq(("id1", 1, 100), ("id2", 4, 300), ("id3", 5, 800)).toDF("id", "value", "cnt")
    df.printSchema()

    // Register a SQL-visible UDF: sum of squares of the two arguments.
    spark.udf.register("simpleUDF", (v: Int, w: Int) => v * v + w * w)
    // callUDF makes the name-registered UDF usable from the DataFrame API.
    // NOTE: the second output column is the UDF result, renamed to "value" here.
    df.select($"id", callUDF("simpleUDF", $"value", $"cnt")).toDF("id", "value").show()

    // Release local Spark resources (was missing in the original).
    spark.stop()
  }
}
spark udf 调用多次
原因分析:
- spark udf(闭包)默认是确定性的;每个使用 udf 结果字段的 action,都会重新调用一次 udf;
- 一次action操作中,调用一次。
解决方案:
- udf.asNondeterministic() 可以解决 udf 被隐式重复求值的问题,但显式的 action 仍会各触发一次调用
- cache:cache之后的操作,不再调用udf
demo演示
- 每行数据调用2次(2次显式action)
// Demo 1: deterministic (default) UDF, two explicit actions.
// Each show() is a separate job that re-evaluates the plan from scratch,
// so the UDF runs again for every row -> 2 invocations per row total.
@Test
def testUdf(): Unit = {
// Four rows; "temp" starts as a string and is cast to double below.
val dataSource = Seq(
("11111111", "1")
, ("11111111", "-1")
, ("11111111", "-2")
, ("11111111", "-2")
)
val rawDF = spark.createDataFrame(dataSource).toDF("id", "temp")
.withColumn("temp", $"temp".cast("double"))
val testDF = rawDF
.withColumn("test", testUdfM($"temp"))
// Action #1: triggers the UDF once per row.
testDF.show(false)
val testDF2 = testDF.withColumn("test2", concat(lit("aaaa:"), concat($"test")))
testDF2.printSchema()
// Action #2: re-evaluates the whole lineage -> UDF runs again per row.
testDF2.show(false)
}
// UDF under observation: the println makes each invocation visible on the
// driver/executor log, so the call count can be verified; classifies the sign.
val testUdfM = udf(
(temp: Double) => {
println("========>>>>>>>>" + temp)
if (temp > 0 ) {
">0"
} else {
"not >0"
}
}
)
- 每行数据调用1次(如果显式action中包含优化job,则会触发调用多次udf)
// Demo 2: same UDF, but only ONE explicit action (the final show); the
// intermediate show() from demo 1 is removed -> the UDF runs once per row.
def testUdf(): Unit = {
// Four rows; "temp" starts as a string and is cast to double below.
val dataSource = Seq(
("11111111", "1")
, ("11111111", "-1")
, ("11111111", "-2")
, ("11111111", "-2")
)
val rawDF = spark.createDataFrame(dataSource).toDF("id", "temp")
.withColumn("temp", $"temp".cast("double"))
val testDF = rawDF
.withColumn("test", testUdfM($"temp"))
val testDF2 = testDF.withColumn("test2", concat(lit("aaaa:"), concat($"test")))
// printSchema is metadata-only, not an action; no UDF call here.
testDF2.printSchema()
// The single action: UDF evaluated once per row.
testDF2.show(false)
}
// Same observable UDF as in demo 1 (println reveals each invocation).
val testUdfM = udf(
(temp: Double) => {
println("========>>>>>>>>" + temp)
if (temp > 0 ) {
">0"
} else {
"not >0"
}
}
)
- 每行数据调用1次 cache(cache 之后的操作不再调用udf)
// Demo 3: same two actions as demo 1, but the UDF column is cache()d.
// The first show() materializes and caches the result; the second action
// reads from the cache -> the UDF runs only once per row.
@Test
def testUdf(): Unit = {
// Four rows; "temp" starts as a string and is cast to double below.
val dataSource = Seq(
("11111111", "1")
, ("11111111", "-1")
, ("11111111", "-2")
, ("11111111", "-2")
)
val rawDF = spark.createDataFrame(dataSource).toDF("id", "temp")
.withColumn("temp", $"temp".cast("double"))
// cache() marks the UDF output for storage on first materialization.
val testDF = rawDF
.withColumn("test", testUdfM($"temp")).cache()
// Action #1: computes the UDF per row AND populates the cache.
testDF.show(false)
val testDF2 = testDF.withColumn("test2", concat(lit("aaaa:"), concat($"test")))
testDF2.printSchema()
// Action #2: served from the cached data -> no further UDF calls.
testDF2.show(false)
}
// Same observable UDF as in demo 1 (println reveals each invocation).
val testUdfM = udf(
(temp: Double) => {
println("========>>>>>>>>" + temp)
if (temp > 0 ) {
">0"
} else {
"not >0"
}
}
)
- 每行数据调用2次: asNondeterministic(每次显式action调用一次)
// Demo 4: asNondeterministic() stops Catalyst from duplicating/collapsing the
// UDF expression during optimization (no extra hidden calls), but each of the
// two explicit actions still evaluates it -> 2 invocations per row here.
@Test
def testUdf(): Unit = {
// Four rows; "temp" starts as a string and is cast to double below.
val dataSource = Seq(
("11111111", "1")
, ("11111111", "-1")
, ("11111111", "-2")
, ("11111111", "-2")
)
val rawDF = spark.createDataFrame(dataSource).toDF("id", "temp")
.withColumn("temp", $"temp".cast("double"))
val testDF = rawDF
.withColumn("test", testUdfM($"temp"))
// Action #1: UDF evaluated once per row.
testDF.show(false)
val testDF2 = testDF.withColumn("test2", concat(lit("aaaa:"), concat($"test")))
testDF2.printSchema()
// Action #2: plan re-evaluated -> UDF runs again per row.
testDF2.show(false)
}
// Same UDF body as the other demos, but marked nondeterministic so the
// optimizer must not assume repeated evaluation yields identical results.
val testUdfM = udf(
(temp: Double) => {
println("========>>>>>>>>" + temp)
if (temp > 0 ) {
">0"
} else {
"not >0"
}
}
).asNondeterministic()
变长参数udf
// SQL-name registration of the vararg helper. Eta-expanding a vararg method
// (`concatStr2 _`) yields a (String, Seq[String]) => String function value.
spark.udf.register("test_udf_2", StringUtil.concatStr2 _ )
// DataFrame-API usage: pass the separator as a literal and bundle the
// variable columns into an array(...) column, which Spark feeds to the
// Seq[String] parameter of the expanded function.
val testDF = tmpDF
.withColumn("new_fuel", test_udf(lit("_"), array($"aaa", $"bbb")))
/**
 * Joins the variable-length arguments with the given separator.
 *
 * Fix vs. original: `str.toList.reduce(...)` threw
 * UnsupportedOperationException on an empty argument list and built the
 * result with O(n^2) string concatenation; `mkString` returns "" for the
 * empty case and joins in linear time via a StringBuilder.
 *
 * @param sep separator inserted between consecutive values
 * @param str zero or more values to join
 * @return the joined string ("" when no values are given)
 */
def concatStr2(sep: String, str: String*): String =
  str.mkString(sep)
// DataFrame-API UDF built from the same vararg helper: `StringUtil.concatStr2 _`
// eta-expands to (String, Seq[String]) => String, so callers supply a separator
// column plus an array(...) column that maps onto the Seq[String] parameter.
val test_udf = udf(
StringUtil.concatStr2 _
)