1、内置函数
需要导包:import org.apache.spark.sql.functions._
例题一:
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
object InnerFunctionDemo {
  def main(args: Array[String]): Unit = {
    // A single local SparkSession is enough for this demo.
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("InnerFunctionDemo")
      .getOrCreate()

    // Raw access log: each entry is "date,userId".
    val logLines = Array(
      "2016-12-27,001",
      "2016-12-27,001",
      "2016-12-27,002",
      "2016-12-28,003",
      "2016-12-28,004",
      "2016-12-28,002",
      "2016-12-28,002",
      "2016-12-28,001"
    )

    // Explicit schema; the third argument `true` means the column is nullable.
    val logSchema = StructType(Array(
      StructField("day", StringType, true),
      StructField("userId", IntegerType, true)
    ))

    // Parse each line into a Row matching the schema above.
    val rowRdd = spark.sparkContext
      .parallelize(logLines)
      .map(_.split(","))
      .map(fields => Row(fields(0), fields(1).toInt))

    // Build the DataFrame from the rows plus the schema.
    val df = spark.createDataFrame(rowRdd, logSchema)
    df.printSchema
    df.show()

    // Bring Spark SQL's built-in functions (count, countDistinct, ...) into scope.
    import org.apache.spark.sql.functions._

    // Total page views (pv) per day.
    df.groupBy(df("day")).agg(count(df("userId")).as("pv")).collect().foreach(println)
    // Distinct visitors per day.
    df.groupBy(df("day")).agg(countDistinct(df("userId")).as("pv")).show()
  }
}
例题二(结合样例类):
import org.apache.spark.sql.{DataFrame, SparkSession}
object InnerFunctionDemo2 {

  /** Sample record used to build the demo DataFrame via a case class. */
  case class Student(id: Int, name: String, gender: String, age: Int)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("InnerFunctionDemo2")
      .getOrCreate()
    // Required for the Seq[Student] -> DataFrame conversion (toDF) below.
    import spark.implicits._

    val students = Seq(
      Student(1001, "zhangsan", "F", 20),
      Student(1002, "lisi", "M", 16),
      Student(1003, "wangwu", "M", 21),
      Student(1004, "zhaoliu", "F", 21),
      Student(1005, "zhouqi", "M", 22),
      Student(1006, "qianba", "M", 19),
      Student(1007, "liuliu", "F", 23)
    )
    val stuDF: DataFrame = students.toDF()

    stuDF.printSchema()
    stuDF.show()

    // Built-in aggregates (count, max, min) applied per gender group.
    import org.apache.spark.sql.functions._
    val byGender = stuDF.groupBy(stuDF("gender"))
    byGender.agg(count(stuDF("age"))).show()
    byGender.agg(max(stuDF("age"))).show()
    byGender.agg(min(stuDF("age"))).show()
    println("--------------------------")
    // Several aggregations at once, given as (column -> aggregate name) pairs.
    byGender.agg(
      "age" -> "max",
      "age" -> "min",
      "age" -> "avg",
      "id" -> "count"
    ).show()
    // Equivalent SQL: select gender, age, count(*) from table group by gender, age
    stuDF.groupBy("gender", "age").count().show()
  }
}
2、自定义函数
1、定义函数
2、注册函数
①SparkSession.udf.register():只在sql()中有效
②functions.udf():对DataFrame API均有效
3、函数调用
示例:用户行为喜好个数统计
import org.apache.spark.sql.SparkSession
object SparkUDFDemo {

  /** One input record: a user name plus a comma-separated hobby list. */
  case class Hobbies(name: String, hobbies: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]")
      .appName("SparkUDFDemo")
      .getOrCreate()
    // Needed for the implicit RDD -> DataFrame conversion (toDF) below.
    import spark.implicits._
    val sc = spark.sparkContext

    // NOTE(review): hard-coded local path — consider passing it via args.
    val rdd = sc.textFile("D:\\test\\t\\hobbies.txt")
    // Split with limit 2 so only the FIRST comma separates name from hobbies;
    // a plain split(",") would truncate "alice,reading,swimming" to hobby "reading"
    // and the hoby_num UDF below would always count 1.
    val df = rdd.map(x => x.split(",", 2)).map(x => Hobbies(x(0), x(1))).toDF()
    df.printSchema()
    df.show()

    // registerTempTable is deprecated since Spark 2.0; use createOrReplaceTempView.
    df.createOrReplaceTempView("hobbies")

    // Register the UDF (an anonymous function) that counts comma-separated hobbies.
    // Functions registered this way are usable from spark.sql();
    // for the DataFrame API use org.apache.spark.sql.functions.udf instead.
    spark.udf.register("hoby_num", (s: String) => s.split(",").length)

    val frame = spark.sql("select name,hobbies,hoby_num(hobbies) as hobnum from hobbies")
    frame.show()
  }
}