自定义udf都是大同小异
udf返回Array
-
udf可以直接返回null只需返回值为null
-
不可变的 val s1 = Set("a","b","c")：s1 ++: "a" 返回结果类型为 Set[Any]；如果需要保持元素类型，则要用 s1 ++: Set("a")
-
udf 返回Array 需要 XXX.toSeq()
// COMBINE_KEYWORD: merge a goods name with its keyword string into a
// deduplicated Seq[String] (Spark maps Seq back to an ARRAY column).
// Null contract (unchanged from the original):
//   - both inputs null            -> null
//   - only keywords, none survive -> null
//   - only goodsName              -> Seq(goodsName)  (NOT trimmed/lowercased, as before)
spark.udf.register("COMBINE_KEYWORD", (goodsName: String, keywords: String) => {
  // Split on any of: backslash, '|', '丨', '、', ';'. Inside a character
  // class '|' is a literal, so the original "[\\\\||丨|、|;]" is equivalent
  // to this simpler class. Keep only tokens longer than one char.
  def splitKeywords(ks: String): Set[String] =
    ks.split("[\\\\|丨、;]").map(_.trim.toLowerCase).filter(_.length > 1).toSet

  (Option(goodsName), Option(keywords)) match {
    case (None, None) =>
      null
    case (None, Some(ks)) =>
      val kw = splitKeywords(ks)
      if (kw.nonEmpty) kw.toSeq else null // empty after filtering still yields null
    case (Some(gn), None) =>
      Seq(gn)
    case (Some(gn), Some(ks)) =>
      // Set union already deduplicates, so the original's explicit
      // contains() branch is unnecessary.
      (splitKeywords(ks) + gn.trim.toLowerCase).toSeq
  }
})
即使用object静态方式传udf函数,还是会提示UserTagsParams未序列化.
/**
 * Configuration for the user-tag computation job: per-signal weights,
 * a date cutoff, the member groups to process, and the partition count.
 *
 * NOTE(review): `weigthTimeDecay`, `weigthPv` and `weigthUv` misspell
 * "weight". Renaming them would break existing callers (and named-argument
 * call sites), so the typo is only flagged here.
 */
case class UserTagsParams(
weightBrand: Double,
weightCategory: Double,
weightConsumeOnLine: Double,
weightConsumeOffLine: Double,
weigthTimeDecay: Double,
weigthPv: Double,
weigthUv: Double,
weightAddCart: Double,
weightRemoveCart: Double,
limitDate: String, // date cutoff — presumably a formatted date string; TODO confirm format
memberGroup: List[String],
numPartitions: Int
)
/**
 * Mutable driver-side holder for the decay-window length consumed by the
 * WEIBULL_DECAY UDF. Not thread-safe; set once before registering UDFs.
 */
object UserTagsParams {
  private var storedLimitDayNum: Int = 0

  /** Store the decay-window length (in days). */
  def setLimitDayNum(limitDayNum: Int) = storedLimitDayNum = limitDayNum

  /** Current decay-window length; 0 until set. */
  def limitDayNum: Int = storedLimitDayNum
}
// WEIBULL_DECAY: exponential time-decay score for an event t days old,
// calibrated so the score is initScore at day 0 and maxScore at day totalNum.
//
// FIX: read the config into a local val BEFORE constructing the lambda.
// The original read UserTagsParams.limitDayNum inside the closure, so the
// closure captured the UserTagsParams$ singleton and Spark failed with
// "Task not serializable / NotSerializableException: UserTagsParams$".
// Capturing a plain Int here makes the closure trivially serializable —
// extending Serializable on the object is not needed.
val decayTotalNum = UserTagsParams.limitDayNum
spark.udf.register("WEIBULL_DECAY", (t: Int) => {
  val totalNum = decayTotalNum
  val initScore = 0.8 // score at day 0
  val maxScore = 0.2  // score at day totalNum
  // Solve exp(-alpha * l) = initScore and exp(-alpha * (totalNum + l)) = maxScore.
  val alpha = math.log(initScore / maxScore) / totalNum
  val l = -math.log(initScore) / alpha
  math.exp(-alpha * (t + l))
})
即使用object静态方式传udf函数,还是会提示UserTagsParams未序列化.
就算 UserTagsParams extends Serializable 还是会有这样的提示——因为真正被序列化的是 object 的单例类 UserTagsParams$（见栈底的 NotSerializableException: UserTagsParams$），闭包捕获的是这个单例的引用。正确做法是在注册 udf 之前先把 limitDayNum 读到一个局部 val，让闭包只捕获一个 Int。
org.apache.spark.SparkException: Task not serializable
at org.apache.spark.util.ClosureCleaner$.ensureSerializable(ClosureCleaner.scala:298)
at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:288)
at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:108)
at org.apache.spark.SparkContext.clean(SparkContext.scala:2094)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:840)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitionsWithIndex$1.apply(RDD.scala:839)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)
at org.apache.spark.rdd.RDD.mapPartitionsWithIndex(RDD.scala:839)
at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:371)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:114)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:135)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:132)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:113)
at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:225)
at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:308)
at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:38)
at org.apache.spark.sql.Dataset$$anonfun$org$apache$spark$sql$Dataset$$execute$1$1.apply(Dataset.scala:2371)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:57)
at org.apache.spark.sql.Dataset.withNewExecutionId(Dataset.scala:2765)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$execute$1(Dataset.scala:2370)
at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$collect(Dataset.scala:2377)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2113)
at org.apache.spark.sql.Dataset$$anonfun$head$1.apply(Dataset.scala:2112)
...
... 56 elided
Caused by: java.io.NotSerializableException: UserTagsParams$
Serialization stack:
- object not serializable (class: UserTagsParams$, value: UserTagsParams$@364dfe1e)
- field (class: $iw, name: UserTagsParams$module, type: class UserTagsParams$)