需求说明:
品类是指产品的分类,大型电商网站品类分多级,咱们的项目中品类只有一级,不同的公司可能对热门的定义不一样。我们按照每个品类的点击、下单、支付的量来统计热门品类。
鞋 点击数 下单数 支付数
衣服 点击数 下单数 支付数
电脑 点击数 下单数 支付数
本项目需求优化为:先按照点击数排名,靠前的就排名高;如果点击数相同,再比较下单数;下单数再相同,就比较支付数。
分别统计每个品类点击的次数,下单的次数和支付的次数:
(品类,点击总数)(品类,下单总数)(品类,支付总数)
log数据字段说明如下:
编号 | 字段名称 | 字段类型 | 字段含义 |
1 | date | String | 用户点击行为的日期 |
2 | user_id | Long | 用户的ID |
3 | session_id | String | Session的ID |
4 | page_id | Long | 某个页面的ID |
5 | action_time | String | 动作的时间点 |
6 | search_keyword | String | 用户搜索的关键词 |
7 | click_category_id | Long | 某一个商品品类的ID |
8 | click_product_id | Long | 某一个商品的ID |
9 | order_category_ids | String | 一次订单中所有品类的ID集合 |
10 | order_product_ids | String | 一次订单中所有商品的ID集合 |
11 | pay_category_ids | String | 一次支付中所有品类的ID集合 |
12 | pay_product_ids | String | 一次支付中所有商品的ID集合 |
13 | city_id | Long | 城市 id |
log数据截图:(截图在此文档中省略,字段顺序以上表为准)
方式1:
import org.apache.spark.rdd.RDD
import org.apache.spark.util.AccumulatorV2
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object Spark01_ReqHotCategory_4 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("Test")
    val sc = new SparkContext(conf)
    val fileDatas: RDD[String] = sc.textFile("Spark/data/user_visit_action.txt", 4)

    // Single flatMap pass that normalizes every log line into
    // (categoryId, (clickDelta, orderDelta, payDelta)) tuples.
    val rdd: RDD[(String, (Int, Int, Int))] = fileDatas.flatMap(
      lines => {
        val datas: Array[String] = lines.split("_")
        if (datas(6) != "-1") {
          // Click record: field 7 (index 6) is the clicked category id.
          List((datas(6), (1, 0, 0)))
        } else if (datas(8) != "null") {
          // Order record: field 9 (index 8) is a comma-joined category id list.
          datas(8).split(",").map(id => (id, (0, 1, 0)))
        } else if (datas(10) != "null") {
          // Payment record: field 11 (index 10) is a comma-joined category id list.
          datas(10).split(",").map(id => (id, (0, 0, 1)))
        } else {
          Nil
        }
      }
    )

    val acc = new reqAcc()
    sc.register(acc, "req")
    rdd.foreach(data => acc.add(data))

    val datas: List[(String, (Int, Int, Int))] = acc.value.toList
    // Rank by clicks, then orders, then payments — all descending — and keep the top 10.
    val top10: List[(String, (Int, Int, Int))] =
      datas.sortBy(_._2)(Ordering.Tuple3(Ordering.Int.reverse, Ordering.Int.reverse, Ordering.Int.reverse)).take(10)
    top10.foreach(println)
    sc.stop()
  }

  /**
   * Custom accumulator for requirement 1: folds (categoryId, (clicks, orders, pays))
   * deltas into one mutable map of per-category running totals.
   */
  class reqAcc extends AccumulatorV2[(String, (Int, Int, Int)), mutable.Map[String, (Int, Int, Int)]] {
    private val resMap: mutable.Map[String, (Int, Int, Int)] = mutable.Map[String, (Int, Int, Int)]()

    override def isZero: Boolean = resMap.isEmpty

    override def reset(): Unit = resMap.clear()

    // FIX: the AccumulatorV2 contract requires copy() to return an accumulator
    // holding the SAME data as this one. Returning an empty instance (as before)
    // silently drops accumulated state whenever Spark copies a non-zero accumulator.
    override def copy(): AccumulatorV2[(String, (Int, Int, Int)), mutable.Map[String, (Int, Int, Int)]] = {
      val newAcc = new reqAcc()
      newAcc.resMap ++= resMap
      newAcc
    }

    // Add one (categoryId, (click, order, pay)) delta to the running totals.
    override def add(v: (String, (Int, Int, Int))): Unit = {
      val oldCnt: (Int, Int, Int) = resMap.getOrElse(v._1, (0, 0, 0))
      resMap.update(v._1, (oldCnt._1 + v._2._1, oldCnt._2 + v._2._2, oldCnt._3 + v._2._3))
    }

    // Merge another partition's totals into this map. A plain foreach replaces the
    // previous side-effecting foldLeft, which misused a fold purely for its effects.
    override def merge(other: AccumulatorV2[(String, (Int, Int, Int)), mutable.Map[String, (Int, Int, Int)]]): Unit = {
      other.value.foreach {
        case (k, v) =>
          val oldCnt: (Int, Int, Int) = resMap.getOrElse(k, (0, 0, 0))
          resMap.update(k, (oldCnt._1 + v._1, oldCnt._2 + v._2, oldCnt._3 + v._3))
      }
    }

    override def value: mutable.Map[String, (Int, Int, Int)] = resMap
  }
}
方式2:
import org.apache.spark.rdd.RDD
import org.apache.spark.util.AccumulatorV2
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object Spark01_ReqHotCategory_5 {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("Test")
    val sc = new SparkContext(conf)
    val fileDatas: RDD[String] = sc.textFile("Spark/data/user_visit_action.txt", 4)

    val acc = new reqAcc()
    sc.register(acc, "WordCount")

    // Feed the accumulator directly from foreach — no intermediate tuple RDD.
    fileDatas.foreach(
      lines => {
        val datas: Array[String] = lines.split("_")
        if (datas(6) != "-1") {
          // Click record: field 7 (index 6) is the clicked category id.
          acc.add((datas(6), "click"))
        } else if (datas(8) != "null") {
          // Order record: field 9 (index 8) is a comma-joined category id list.
          datas(8).split(",").foreach(id => acc.add((id, "order")))
        } else if (datas(10) != "null") {
          // Payment record: field 11 (index 10) is a comma-joined category id list.
          datas(10).split(",").foreach(id => acc.add((id, "pay")))
        }
      }
    )

    // Rank by clicks, then orders, then payments — all descending — and keep the top 10.
    acc.value.toList
      .sortBy(_._2)(Ordering.Tuple3(Ordering.Int.reverse, Ordering.Int.reverse, Ordering.Int.reverse))
      .take(10)
      .foreach(println)
    sc.stop()
  }

  /**
   * Custom accumulator for requirement 1: input is (categoryId, actionKind) where
   * actionKind is "click" | "order" | "pay"; output is a map of per-category
   * (clicks, orders, pays) totals.
   */
  class reqAcc extends AccumulatorV2[(String, String), mutable.Map[String, (Int, Int, Int)]] {
    private val resMap = mutable.Map[String, (Int, Int, Int)]()

    override def isZero: Boolean = resMap.isEmpty

    // FIX: the AccumulatorV2 contract requires copy() to return an accumulator
    // holding the SAME data as this one. Returning an empty instance (as before)
    // silently drops accumulated state whenever Spark copies a non-zero accumulator.
    override def copy(): AccumulatorV2[(String, String), mutable.Map[String, (Int, Int, Int)]] = {
      val newAcc = new reqAcc()
      newAcc.resMap ++= resMap
      newAcc
    }

    override def reset(): Unit = resMap.clear()

    // Increment the counter slot selected by the action kind.
    // NOTE: an unknown action tag throws MatchError on the executor by design —
    // it would indicate a programming error in the driver code above.
    override def add(v: (String, String)): Unit = {
      val oldCnt = resMap.getOrElse(v._1, (0, 0, 0))
      v._2 match {
        case "click" => resMap.update(v._1, (oldCnt._1 + 1, oldCnt._2, oldCnt._3))
        case "order" => resMap.update(v._1, (oldCnt._1, oldCnt._2 + 1, oldCnt._3))
        case "pay"   => resMap.update(v._1, (oldCnt._1, oldCnt._2, oldCnt._3 + 1))
      }
    }

    // Merge another partition's totals into this map, slot by slot.
    override def merge(other: AccumulatorV2[(String, String), mutable.Map[String, (Int, Int, Int)]]): Unit = {
      other.value.foreach {
        case (k, v) =>
          val oldCnt: (Int, Int, Int) = resMap.getOrElse(k, (0, 0, 0))
          resMap.update(k, (oldCnt._1 + v._1, oldCnt._2 + v._2, oldCnt._3 + v._3))
      }
    }

    override def value: mutable.Map[String, (Int, Int, Int)] = resMap
  }
}