自定义函数的原因
Spark SQL 自带的函数在某些情况下无法满足需求,而有些操作又需要反复执行。例如,矿井下测得的数据需要先分离、解析出具体字段,再存入表中;如果每次都直接调用 Spark 的 API 来完成这些操作,代码会非常繁琐。这时可以考虑注册自定义函数(UDF)来解析数据。下面是一个简单的代码示例。
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.udf
/**
 * Demonstrates two ways of deriving extra columns from the raw `name` column
 * with a user-defined function:
 *
 *  - [[functionApp]] registers a UDF by name and calls it from a SQL query;
 *  - [[withColumu]] wraps a Scala function with `functions.udf` and applies it
 *    through `DataFrame.withColumn`.
 */
object DataFramewithColumnApp {

  /** Location of the tab-separated sample input read by both demos. */
  private val InputPath = "file:///C:\\Users\\HJ\\Desktop/parse.txt"

  /**
   * Entry point: builds a local two-thread session and runs the `withColumn` demo.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder()
      .appName("DataFramewithColumnApp")
      .master("local[2]")
      .getOrCreate()
    // Stop the session even when the demo throws, so it is always released.
    try {
      //functionApp(sparkSession)
      withColumu(sparkSession)
    } finally {
      sparkSession.stop()
    }
  }

  /**
   * Reads the input file and builds the two-column (`name`, `number`) DataFrame
   * shared by both demos.
   *
   * Each line is split on tabs; field 0 becomes `name` and field 2 is parsed as
   * a Long into `number`. Field 1 is not used.
   *
   * @param sparkSession the active session
   * @return a DataFrame with columns `name: String` and `number: Long`
   */
  private def loadInfos(sparkSession: SparkSession): org.apache.spark.sql.DataFrame = {
    val rows = sparkSession.sparkContext
      .textFile(InputPath)
      .map(_.split("\t"))
      .map(fields => Row(fields(0), fields(2).toLong))
    val schema = StructType(Array(
      StructField("name", StringType, true),
      StructField("number", LongType, false)
    ))
    sparkSession.createDataFrame(rows, schema)
  }

  /**
   * Adds derived columns through SQL using a UDF registered under the name
   * `parsename`, then prints the result.
   *
   * @param sparkSession the active session
   */
  def functionApp(sparkSession: SparkSession): Unit = {
    val infos = loadInfos(sparkSession)
    // Register the parser so it can be called by name inside SQL text.
    sparkSession.udf.register("parsename", (name: String, num: Int) =>
      DFwithColumnUtils.parseUtils(name, num))
    infos.createOrReplaceTempView("infos")
    sparkSession.sql("select name,parsename(name,0)as fruit,parsename(name,1) from infos").show()
  }

  /**
   * Adds derived columns with `withColumn` and a function wrapped by
   * `org.apache.spark.sql.functions.udf`, then prints the result.
   *
   * @param sparkSession the active session
   */
  def withColumu(sparkSession: SparkSession): Unit = {
    val infos = loadInfos(sparkSession)
    // A udf may declare several parameters, but every argument passed at the call
    // site must be a Column, so a constant index would have to be wrapped as a
    // literal column first. Here the UDF returns the whole split array instead,
    // and the wanted element is selected by indexing the resulting Column.
    val splitName = udf((name: String) => name.split("/"))
    val parts = splitName(infos("name"))
    infos
      // NOTE: "friut" (sic) is the emitted column name and is kept unchanged.
      .withColumn("friut", parts(0))
      .withColumn("color", parts(1))
      .show()
  }
}
/** Parsing helpers used by the UDFs of the DataFrame demos. */
object DFwithColumnUtils {

  /**
   * Splits `name` on `/` and returns the segment at position `num`.
   *
   * The function is called from inside a Spark UDF, where an exception would
   * abort the whole job, so unusable input yields `null` (SQL NULL) instead of
   * throwing.
   *
   * @param name the raw value, e.g. `"apple/red"`; may be null
   * @param num  zero-based index of the wanted segment
   * @return the segment at `num`, or `null` when `name` is null or `num` is
   *         outside the range of available segments
   */
  def parseUtils(name: String, num: Int): String = {
    if (name == null) {
      null
    } else {
      val segments = name.split("/")
      if (num >= 0 && num < segments.length) segments(num) else null
    }
  }
}
parse.txt
apple/red 2018-12-12 6
orange/yellow 2018-03-12 7
banana/yellow 2018-07-12 3
pear/white 2018-05-12 10
结果
使用withColumn
+-------------+------+------+------+
| name|number| friut| color|
+-------------+------+------+------+
| apple/red| 6| apple| red|
|orange/yellow| 7|orange|yellow|
|banana/yellow| 3|banana|yellow|
| pear/white| 10| pear| white|
+-------------+------+------+------+
注册为临时表,使用sql
sql
+-------------+------+----------------------+
| name| fruit|UDF:parsename(name, 1)|
+-------------+------+----------------------+
| apple/red| apple| red|
|orange/yellow|orange| yellow|
|banana/yellow|banana| yellow|
| pear/white| pear| white|
+-------------+------+----------------------+