Spark SQL UDF registration (via sparkSession.udf.register)
package com.wacai.blog
import org.apache.spark.sql.{DataFrame, SparkSession}
object SparkUDF {

  /**
   * Returns whether the given age counts as adult (strictly greater than 18).
   * NOTE(review): age == 18 yields false — confirm whether `>= 18` was intended;
   * all sample ages below are under 18, so the demo output cannot distinguish.
   */
  def isAdult(age: Int): Boolean = age > 18

  /**
   * Demonstrates Spark SQL UDF usage (Spark 2.x style).
   * <1> Anonymous registration: sparkSession.udf.register("strLen", (str: String) => str.length)
   * <2> Named registration: define a regular function first, then register it by reference
   *     (unlike <1>, the function exists independently of the registration).
   *
   * @param sparkSession active SparkSession
   * @param dataFrame    (name, age) pairs used to build the demo DataFrame;
   *                     despite its name this parameter is raw tuple data, not a DataFrame
   */
  def spark2X_SQL_UDF(sparkSession: SparkSession, dataFrame: Array[Tuple2[String, Int]]): Unit = {
    val userDF = sparkSession.createDataFrame(dataFrame).toDF("name", "age")
    // Register as a local temp view. A *global* temp view would instead be visible
    // across SparkSessions of the same application (its lifetime is tied to the
    // Spark application process) and must be queried as global_temp.<viewName>.
    // userDF.createGlobalTempView("user")
    userDF.createTempView("user")
    // <1> Anonymous UDF registration (Spark 2.x style).
    sparkSession.udf.register("strLen", (str: String) => str.length)
    // Spark 1.x used HiveContext/SQLContext; since 2.x SparkSession subsumes both
    // (SparkContext, SQLContext, HiveContext, ...):
    // sqlContext.udf.register("strLen", (str: String) => str.length)
    val new_col = "name_len" // never reassigned, so val rather than var
    // <2> Register an existing (named) function.
    sparkSession.udf.register("isAdult", isAdult _)
    // When querying a global temp view, prefix it with global_temp:
    // val sqlDF = sparkSession.sql(s"select name,strlen(name) as ${new_col},age from global_temp.user")
    // Local view query (Spark SQL function names are case-insensitive, so strlen == strLen).
    val sqlDF = sparkSession.sql(s"select name,strlen(name) as ${new_col},age,isAdult(age) as flag from user")
    sqlDF.show()
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .config("spark.driver.memory", "1g")
      .config("spark.executor.memory", "1g")
      .getOrCreate()
    //import spark.implicits._
    val userData = Array(("leo", 15), ("julia", 12), ("jack", 15), ("romeo", 12))
    // Spark SQL UDF demo.
    // BUG FIX: the original called spark2x_DF_UDF, which is not defined in this
    // object (copy-paste from the DataFrame variant); the captured output below
    // shows the SQL variant is what actually ran.
    spark2X_SQL_UDF(spark, userData)
  }
}
21/05/17 23:27:27 INFO CodeGenerator: Code generated in 14.875294 ms
+-----+--------+---+-----+
| name|name_len|age| flag|
+-----+--------+---+-----+
| leo| 3| 15|false|
|julia| 5| 12|false|
| jack| 4| 15|false|
|romeo| 5| 12|false|
+-----+--------+---+-----+
Spark DataFrame UDF (via org.apache.spark.sql.functions.udf)
package com.wacai.blog
import org.apache.spark.sql.{DataFrame, SparkSession}
object SparkUDF {

  /**
   * Returns whether the given age counts as adult (strictly greater than 18).
   * NOTE(review): age == 18 yields false — confirm whether `>= 18` was intended;
   * all sample ages below are under 18, so the demo output cannot distinguish.
   */
  def isAdult(age: Int): Boolean = age > 18

  /**
   * Same demo as the Spark SQL variant, but wraps functions as Column
   * expressions with org.apache.spark.sql.functions.udf instead of registering
   * them by name for SQL.
   *
   * @param spark    active SparkSession
   * @param userData (name, age) pairs used to build the demo DataFrame
   */
  def spark2x_DF_UDF(spark: SparkSession, userData: Array[(String, Int)]): Unit = {
    val df = spark.createDataFrame(userData).toDF("name", "age")
    // registerTempTable has been deprecated since Spark 2.0 in favor of
    // createOrReplaceTempView. (The view is never queried in this method —
    // kept only to mirror the SQL variant.)
    df.createOrReplaceTempView("user")
    // Bring the udf/col helpers into scope.
    import org.apache.spark.sql.functions._
    // <1> Wrap an anonymous function as a UDF Column expression.
    val strLen = udf((str: String) => str.length)
    // <2> Wrap an existing (named) function.
    val udf_isAdult = udf(isAdult _)
    // withColumn (Spark 2.0+) adds a column, or replaces it when the name
    // already exists; for adding several columns, a single select performs
    // better than chained withColumn calls.
    df.withColumn("name_len", strLen(col("name")))
      .withColumn("isAdult", udf_isAdult(col("age")))
      .show()
    // BUG FIX: the alias belongs OUTSIDE the UDF call. The original wrote
    // udf_isAdult(col("age") as "isAdult"), aliasing the *input* column, which
    // produced the garbled output column name `UDF(age AS `isAdult`)` seen in
    // the captured output below.
    df.select(col("*"), strLen(col("name")) as "name_len", udf_isAdult(col("age")) as "isAdult").show()
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .config("spark.driver.memory", "1g")
      .config("spark.executor.memory", "1g")
      .getOrCreate()
    //import spark.implicits._
    val userData = Array(("leo", 15), ("julia", 12), ("jack", 15), ("romeo", 12))
    // Spark SQL UDF demo (disabled):
    // spark2X_SQL_UDF(spark, userData)
    // DataFrame UDF demo:
    spark2x_DF_UDF(spark, userData)
  }
}
+-----+---+--------+-------+
| name|age|name_len|isAdult|
+-----+---+--------+-------+
| leo| 15| 3| false|
|julia| 12| 5| false|
| jack| 15| 4| false|
|romeo| 12| 5| false|
+-----+---+--------+-------+
+-----+---+--------+---------------------+
| name|age|name_len|UDF(age AS `isAdult`)|
+-----+---+--------+---------------------+
| leo| 15| 3| false|
|julia| 12| 5| false|
| jack| 15| 4| false|
|romeo| 12| 5| false|
+-----+---+--------+---------------------+