import org.apache.spark.{SparkConf, SparkContext}

object Test1 {
  // Top 10 hot categories
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[4]").setAppName("test"))

    // 1. Read the raw data
    val rdd1 = sc.textFile("datas/user_visit_action.txt")
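    // Assumed record layout (inferred from the field indices and filters used below,
    // not verified against the original dataset): each line is an underscore-separated
    // user action where field 6 is the clicked category id ("-1" when the action is not
    // a click), field 8 holds the comma-separated category ids of an order ("null" when
    // the action is not an order), and field 10 holds the comma-separated category ids
    // of a payment ("null" when the action is not a payment).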
    // 2. Count clicks per category
    // 2.1 Keep only click records (non-click actions carry "-1" in the click category field)
    val clickRdd = rdd1.filter(line => {
      val arr = line.split("_")
      arr(6) != "-1"
    })
    // 2.2 Map each click to (categoryId, 1)
    val clickSplitRdd = clickRdd.map(line => {
      val arr = line.split("_")
      (arr(6), 1)
    })
    // 2.3 Aggregate the click count per category
    val clickNumRdd = clickSplitRdd.reduceByKey(_ + _)
    // e.g. RDD[(categoryId, clickCount)]: (1,10), (5,30)
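    // reduceByKey combines values locally on each partition before the shuffle,
    // so only partial sums per category travel across the network.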
    // 3. Count orders per category
    // 3.1 Keep only order records (the order field is the string "null" for non-order actions)
    val orderRDD = rdd1.filter(line => {
      val arr = line.split("_")
      arr(8) != "null"
    })
    // 3.2 One order row can list several category ids, so flatMap the comma-separated ids
    val orderSplitRdd = orderRDD.flatMap(line => {
      val arr = line.split("_")
      val ids = arr(8)
      ids.split(",").map(id => (id, 1))
    })
    // 3.3 Aggregate the order count per category
    val orderNumRdd = orderSplitRdd.reduceByKey(_ + _)
    // e.g. RDD[(categoryId, orderCount)]: (1,15), (5,5)
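    // Note: because flatMap emits one pair per category id, an order row that spans
    // several categories contributes one count to each of them.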
    // 4. Count payments per category
    // 4.1 Keep only payment records
    val payRdd = rdd1.filter(line => {
      val arr = line.split("_")
      arr(10) != "null"
    })
    // 4.2 A payment row can also list several category ids, so flatMap them as well
    val paySplitRdd = payRdd.flatMap(line => {
      val arr = line.split("_")
      val ids = arr(10)
      ids.split(",").map(id => (id, 1))
    })
    // 4.3 Aggregate the payment count per category
    val payNumRdd = paySplitRdd.reduceByKey(_ + _)
    // e.g. RDD[(categoryId, payCount)]: (1,2), (5,3)
    // 5. Join to get the click, order and payment count per category.
    // leftOuterJoin keeps every clicked category even if it was never ordered or paid for,
    // so the order/payment counts come back as Options and default to 0.
    val totalRdd = clickNumRdd.leftOuterJoin(orderNumRdd).leftOuterJoin(payNumRdd)
    val totalNumRdd = totalRdd.map {
      case (id, ((clickNum, orderNum), payNum)) =>
        (id, clickNum, orderNum.getOrElse(0), payNum.getOrElse(0))
    }
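    // Shape after the two leftOuterJoins: (id, ((clickCount, Option[orderCount]), Option[payCount])),
    // e.g. (1, ((10, Some(15)), Some(2))); totalNumRdd flattens it to (1, 10, 15, 2).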
    // 6. Sort descending and take the top 10.
    // The sort key is the (clickCount, orderCount, payCount) tuple, so categories are
    // compared by clicks first, then orders, then payments.
    totalNumRdd.sortBy({
      case (id, clickNum, orderNum, payNum) => (clickNum, orderNum, payNum)
    }, ascending = false)
      // 7. Show the result
      .take(10)
      .foreach(println)
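    // A minimal single-pass alternative (my own sketch, not part of the original post):
    // instead of scanning the file three times and joining twice, each action can emit a
    // (categoryId, (click, order, pay)) triple and the triples are summed component-wise.
    val onePassRdd = rdd1.flatMap(line => {
      val arr = line.split("_")
      if (arr(6) != "-1") {
        List((arr(6), (1, 0, 0)))                              // click action
      } else if (arr(8) != "null") {
        arr(8).split(",").map(id => (id, (0, 1, 0))).toList    // order action
      } else if (arr(10) != "null") {
        arr(10).split(",").map(id => (id, (0, 0, 1))).toList   // payment action
      } else {
        Nil                                                    // anything else contributes nothing
      }
    })
    onePassRdd
      .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3))
      .sortBy(_._2, ascending = false)
      .take(10)
      .foreach(println)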
    sc.stop()
  }
}