内置函数(org.apache.spark.sql.functions.scala)
内置函数的使用
模拟用户访问日志信息,accessLog.txt内容如下
2016-12-27,001
2016-12-27,001
2016-12-27,002
2016-12-28,003
2016-12-28,004
2016-12-28,002
2016-12-28,002
2016-12-28,001
package nj.zb.kb09.sql
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
/**
 * Demonstrates Spark SQL built-in aggregate functions on a small access log.
 *
 * Each line of `in/accessLog.txt` is expected to look like `date,userID`
 * (for example `2016-12-27,001`). The program prints the schema and contents
 * of the resulting DataFrame, then the per-day visit count (`pv`) and the
 * per-day distinct-user count (`uv`).
 */
object SparkSQL2 {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("sparksqlMysql")
      .master("local[*]")
      .getOrCreate()

    // Stop the session even when reading or parsing the file fails.
    try {
      val sc = spark.sparkContext

      // Explicit schema: the date stays a string, the user id is parsed to Int.
      val schema = StructType(List(
        StructField("date", StringType, true),
        StructField("userID", IntegerType, true)
      ))

      val rowRDD = sc.textFile("in/accessLog.txt")
        // Skip blank lines; they would otherwise fail on fields(1) below.
        .filter(_.trim.nonEmpty)
        .map(_.split(","))
        .map(fields => Row(fields(0), fields(1).toInt))

      val df = spark.createDataFrame(rowRDD, schema)
      df.printSchema()
      df.show()

      import org.apache.spark.sql.functions._

      // Page views: number of visits per day, counting every record.
      df.groupBy("date").agg(count("userID").as("pv")).show()

      // Unique visitors: number of distinct users per day.
      df.groupBy("date").agg(countDistinct("userID").as("uv")).show()
    } finally {
      spark.stop()
    }
  }
}
自定义函数
1、定义函数
2、注册函数
SparkSession.udf.register():只在sql()中有效
functions.udf():对DataFrame API均有效
3、函数调用
实例
需求:用户行为喜好个数统计
hobbies.txt内容如下
alice jogging,Coding,cooking
lina travel,dance
输出格式要求
alice jogging,Coding,cooking 3
lina travel,dance 2
代码
package nj.zb.kb09.sql
import org.apache.spark.sql.SparkSession
/**
 * Demonstrates registering and calling a user-defined function (UDF) from
 * Spark SQL.
 *
 * Each line of `in/hobbies.txt` is expected to look like
 * `name hobby1,hobby2,...` (name and hobby list separated by one space).
 * The program prints each person's name, hobby list and the number of hobbies.
 */
object SparkUDFDemo {

  /** One input record: a person's name and their comma-separated hobby list. */
  case class Hobbies(name: String, hobbies: String)

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("udf")
      .getOrCreate()

    // Stop the session even when reading or parsing the file fails.
    try {
      import spark.implicits._
      val sc = spark.sparkContext

      val rdd = sc.textFile("in/hobbies.txt")
      val df = rdd
        // Skip blank lines; they would otherwise fail on fields(1) below.
        .filter(_.trim.nonEmpty)
        .map(_.split(" "))
        .map(fields => Hobbies(fields(0), fields(1)))
        .toDF()

      df.printSchema()
      df.show()

      // registerTempTable has been deprecated since Spark 2.0;
      // createOrReplaceTempView is its replacement.
      df.createOrReplaceTempView("hobbies")

      // UDF registered through spark.udf.register is callable by name in SQL text.
      spark.udf.register("hobby_num",
        (v: String) => v.split(",").size)

      spark.sql("select name,hobbies,hobby_num(hobbies) as hobnum from hobbies").show()
    } finally {
      spark.stop()
    }
  }
}