import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._

def UDAF(): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("UDAF")
  val sc = new SparkContext(conf)
  // getOrCreate() reuses the SparkContext created above
  val spark = SparkSession
    .builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
  // build a name RDD, wrap each element in a Row, and attach a one-column schema
  val name = Array("leo", "marry", "jack", "tom", "tom", "tom", "jack", "jack", "jack", "marry", "marry", "marry", "marry")
  val nameRDD = sc.parallelize(name)
  val namesRowRDD = nameRDD.map(num => Row(num))
  val structType = StructType(Array(StructField("name", StringType, true)))
  val namesDF = spark.createDataFrame(namesRowRDD, structType)
  namesDF.createOrReplaceTempView("names")
  // group by name and count each group with the built-in count() aggregate
  spark.sql("select name,count(name) from names group by name").rdd.collect().foreach(num => println(num))
}
2 Classic daily sales statistics example
def DailySale(): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("Daily")
  val sc = new SparkContext(conf)
  val spark = SparkSession
    .builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
  import spark.implicits._
  // (sale date, sale amount, product ID)
  val userSaleLog = Array("2015-10-01,55.05,1122", "2015-10-01,15.20,1133", "2015-10-01,15.20,1144", "2015-10-02,56.05,", "2015-10-02,78.54,1155", "2015-10-02,113.02,1123")
  val userSaleLogRDD = sc.parallelize(userSaleLog)
  // drop malformed records that are missing a field, e.g. "2015-10-02,56.05,"
  val filterUserSaleRDD = userSaleLogRDD.filter(log => log.split(",").length == 3)
  // build the Row RDD and its schema (metadata)
  val userSaleLogRowRDD = filterUserSaleRDD.map(log => Row(log.split(",")(0), log.split(",")(1).toDouble, log.split(",")(2).toInt))
  val structType = StructType(Array(StructField("date", StringType, true), StructField("saleAmount", DoubleType, true), StructField("UserID", IntegerType, true)))
  val userSaleLogDF = spark.createDataFrame(userSaleLogRowRDD, structType)
  // run the daily statistics: group by date and sum saleAmount with the agg operator
  userSaleLogDF.groupBy("date").agg('date, sum('saleAmount)).rdd.map(row => Row(row(1), row(2))).collect().foreach(num => println(num))
}
3 Classic daily user-visit (UV) example
def DailyUV(): Unit = {
  val conf = new SparkConf().setMaster("local").setAppName("Daily")
  val sc = new SparkContext(conf)
  val spark = SparkSession
    .builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
  import spark.implicits._
  // Simulated user-access log, comma separated: the first column is the date, the second is the user id
  val userAccessLog = Array("2015-10-01,1122", "2015-10-01,1122", "2015-10-01,1123", "2015-10-01,1124", "2015-10-01,1124", "2015-10-02,1122", "2015-10-02,1121", "2015-10-02,1123", "2015-10-02,1123")
  val userAccessRDD = sc.parallelize(userAccessLog, 5)
  // convert each simulated log line into a Row, e.g. [2015-10-01,1122]
  val userAccessLogRDD = userAccessRDD.map(num => Row(num.split(",")(0), num.split(",")(1).toInt))
  val structType = StructType(Array(StructField("date", StringType, true), StructField("id", IntegerType, true)))
  val userAccessLogRowDF = spark.createDataFrame(userAccessLogRDD, structType)
  // Use the built-in countDistinct function: many users visit each day and each user may visit
  // several times, so UV is the number of visits after de-duplicating users per day
  userAccessLogRowDF.groupBy("date").agg('date, countDistinct('id)).rdd
    .map(num => Row(num(1), num(2))).collect().foreach(num => println(num))
}
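For comparison, the same daily UV can be computed in SQL; a minimal sketch, assuming the DataFrame above is registered as a temp view:

// count(distinct id) is the SQL counterpart of countDistinct('id)
userAccessLogRowDF.createOrReplaceTempView("user_access")
spark.sql("select date, count(distinct id) as uv from user_access group by date").show()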
4 Daily TOP3 keyword user-visit statistics
val conf =newSparkConf().setAppName("DailyTop3Keyword").setMaster("local")
val sc =newSparkContext(conf)
val spark = SparkSession
.builder().appName("DailyTop3Keyword").config("spark.some.config.option","some-value").getOrCreate()
val rowRDD = sc.textFile("file:///d://www/testgit/keyword.txt")// rowRDD.foreach(num => println(num))
val queryParaMap =Map("city"->List("beijing"),"platform"->List("android"),"version"->List("1.0","1.2","1.5","2.0"))
val queryParamMapBroadcast = sc.broadcast(queryParaMap)println(queryParamMapBroadcast.value("city"))
val filterRDD = rowRDD
.filter(row =>if( queryParamMapBroadcast.value("city").contains( row.split(" ")(3)))trueelsefalse).filter(row =>if( queryParamMapBroadcast.value("platform").contains( row.split(" ")(4)))trueelsefalse).filter(row =>if( queryParamMapBroadcast.value("version").contains( row.split(" ")(5)))trueelsefalse).map(row =>Tuple2(row.split(" ")(0)+"_"+row.split(" ")(2),row.split(" ")(1)))
val dateKeywordUserRDD = filterRDD.groupByKey().map(row =>Tuple2(row._1,row._2.size)).map(tuple =>Row(tuple._1.split("_")(0),tuple._1.split("_")(1),tuple._2.toInt))
val structType =StructType(Array(StructField("date",StringType,true),StructField("keyword",StringType,true),StructField("uv",IntegerType,true)))
val dateKeywordDF = spark.createDataFrame(dateKeywordUserRDD,structType)
val sql=""+"select date,keyword,uv "+"from"+" (select date,keyword,uv,row_number() over(partition by date order by uv desc) rank from daily_keyword_uv )tmp"+" where rank <=3";
dateKeywordDF.createOrReplaceTempView("daily_keyword_uv")
val dailyTop3KeywordDF = spark.sql(sql)
val dailyTop3KeywordRDD = dailyTop3KeywordDF.rdd
.map(a =>Tuple2(a.getAs[String](0),a.getAs[String](1)+"_"+ a.getAs[Integer](2))).groupByKey().foreach(a =>println(a))
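The same top-3 ranking can also be expressed with the DataFrame window API instead of an SQL string; a minimal sketch, assuming the dateKeywordDF built above:

import org.apache.spark.sql.expressions.Window

// rank keywords within each date partition by uv descending, then keep the top 3
val windowSpec = Window.partitionBy("date").orderBy(col("uv").desc)
dateKeywordDF
  .withColumn("rank", row_number().over(windowSpec))
  .filter(col("rank") <= 3)
  .show()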