需求1:分别统计每个品类的点击次数,下单次数和支付次数
(品类,点击总数)(品类,下单总数)(品类,支付总数)
排名顺序如:点击总数>下单总数>支付总数
方案一
def main(args: Array[String]): Unit = {
//TODO : TOP10热门品类
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("HotCategoryTop10Analysis")
val sc = new SparkContext(sparkConf)
// 1.读取原始日志数据
val actionRDD = sc.textFile("datas/user_visit_action.txt")
// 2.统计品类的点击数量:(品类ID,点击数量)
val clickActionRDD = actionRDD.filter(
action => {
val datas = action.split("_")
datas(6) != "-1"
}
)
val clickCountRDD: RDD[(String, Int)] = clickActionRDD.map(
action => {
val datas = action.split("_")
(datas(6), 1)
}
).reduceByKey(_ + _)
// 3.统计品类的下单数量:(品类ID,下单数量)
val orderActionRDD = actionRDD.filter(
action => {
val datas = action.split("_")
datas(8) != "null"
}
)
//orderid => 1,2,3
//[(1,1),(2,1),(3,1)]
val orderCountRDD = orderActionRDD.flatMap(
action => {
val datas = action.split("_")
val cid = datas(8)
val cids = cid.split(",")
cids.map(id => (id, 1))
}
).reduceByKey(_ + _)
// 4.统计品类的支付数量:(品类ID,支付数量)
val payActionRDD = actionRDD.filter(
action => {
val datas = action.split("_")
datas(10) != "null"
}
)
//orderid => 1,2,3
//[(1,1),(2,1),(3,1)]
val payCountRDD = orderActionRDD.flatMap(
action => {
val datas = action.split("_")
val cid = datas(10)
val cids = cid.split(",")
cids.map(id => (id, 1))
}
).reduceByKey(_ + _)
// 5.将品类进行排序,并且取前10名
// 点击数量排序,下单数量排序,支付数量排序
// 元组排序:先比较第一个,再比较第二个,再比较第三个,依次类推
// (品类ID,(点击数量,下单数量,支付数量))
// cogroup = connect + group
val cogroupRDD: RDD[(String, (Iterable[Int], Iterable[Int], Iterable[Int]))] =
clickCountRDD.cogroup(orderCountRDD, payCountRDD)
val analysisRDD = cogroupRDD.mapValues {
case (clickIter, orderIter, payIter) => {
var clickCnt = 0
var iter1 = clickIter.iterator
if (iter1.hasNext) {
clickCnt = iter1.next()
}
var orderCnt = 0
var iter2 = orderIter.iterator
if (iter2.hasNext) {
orderCnt = iter2.next()
}
var payCnt = 0
var iter3 = payIter.iterator
if (iter3.hasNext){
payCnt = iter3.next()
}
(clickCnt, orderCnt, payCnt)
}
}
val resultRDD = analysisRDD.sortBy(_._2, false).take(10)
// 6.将结果采集到控制台打印出来
resultRDD.foreach(println)
sc.stop()
}
方案二
def main(args: Array[String]): Unit = {
//TODO : TOP10热门品类
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("HotCategoryTop10Analysis")
val sc = new SparkContext(sparkConf)
// Q : actionRDD重复使用
// Q : cogroup性能可能较低
// 1.读取原始日志数据
val actionRDD = sc.textFile("datas/user_visit_action.txt")
actionRDD.cache()
// 2.统计品类的点击数量:(品类ID,点击数量)
val clickActionRDD = actionRDD.filter(
action => {
val datas = action.split("_")
datas(6) != "-1"
}
)
val clickCountRDD: RDD[(String, Int)] = clickActionRDD.map(
action => {
val datas = action.split("_")
(datas(6), 1)
}
).reduceByKey(_ + _)
// 3.统计品类的下单数量:(品类ID,下单数量)
val orderActionRDD = actionRDD.filter(
action => {
val datas = action.split("_")
datas(8) != "null"
}
)
val orderCountRDD = orderActionRDD.flatMap(
action => {
val datas = action.split("_")
val cid = datas(8)
val cids = cid.split(",")
cids.map(id => (id, 1))
}
).reduceByKey(_ + _)
// 4.统计品类的支付数量:(品类ID,支付数量)
val payActionRDD = actionRDD.filter(
action => {
val datas = action.split("_")
datas(10) != "null"
}
)
val payCountRDD = orderActionRDD.flatMap(
action => {
val datas = action.split("_")
val cid = datas(10)
val cids = cid.split(",")
cids.map(id => (id, 1))
}
).reduceByKey(_ + _)
// (品类ID,点击数量) => (品类ID,(点击数量,0,0))
// (品类ID,下单数量) => (品类ID,(0,下单数量,0))
// (品类ID,支付数量) => (品类ID,(0,0,支付数量))
// (品类ID,(点击数量,下单数量,支付数量))
// 5.将品类进行排序,并且取前10名
// 点击数量排序,下单数量排序,支付数量排序
// 元组排序:先比较第一个,再比较第二个,再比较第三个,依次类推
// (品类ID,(点击数量,下单数量,支付数量))
val rdd1 = clickCountRDD.map {
case (cid, cnt) => {
(cid, (cnt, 0, 0))
}
}
val rdd2 = orderCountRDD.map {
case (cid, cnt) => {
(cid, (0, cnt, 0))
}
}
val rdd3 = payCountRDD.map {
case (cid, cnt) => {
(cid, (0, 0, cnt))
}
}
// 将三个数据源合并在一起,统一进行聚合计算
val soruceRDD: RDD[(String, (Int, Int, Int))] = rdd1.union(rdd2).union(rdd3)
val analysisRDD = soruceRDD.reduceByKey(
(t1, t2) => {
(t1._1 + t2._1, t1._2 + t2._2, t1._3 + t2._3)
}
)
val resultRDD = analysisRDD.sortBy(_._2, false).take(10)
// 6.将结果采集到控制台打印出来
resultRDD.foreach(println)
sc.stop()
}
方案三
def main(args: Array[String]): Unit = {
//TODO : TOP10热门品类
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("HotCategoryTop10Analysis")
val sc = new SparkContext(sparkConf)
// Q : 存在大量的shuffle操作(reduceByKey)
// reduceByKey 聚合算子;spark会提供优化,缓存
// 1.读取原始日志数据
val actionRDD = sc.textFile("datas/user_visit_action.txt")
// 2.将数据转换结构
// 点击的场合:(品类ID,(1,0,0))
// 下单的场合:(品类ID,(0,1,0))
// 支付的场合:(品类ID,(0,0,1))
val flatRDD: RDD[(String, (Int, Int, Int))] = actionRDD.flatMap(
action => {
val datas = action.split("_")
if (datas(6) != "-1") {
//点击的场合
List((datas(6), (1, 0, 0)))
} else if (datas(8) != "null") {
//下单的场合
val ids = datas(8).split(",")
ids.map(id => (id, (0, 1, 0)))
} else if (datas(10) != "null") {
//支付的场合
val ids = datas(10).split(",")
ids.map(id => (id, (0, 0, 1)))
} else {
Nil
}
}
)
// 3.将相同的品类ID的数据进行分组聚合
// (品类ID,(点击数量,下单数量,支付数量))
val analysisRDD = flatRDD.reduceByKey(
(t1, t2) => {
(t1._1 + t2._1, t1._2 + t2._2, t1._3 + t2._3)
}
)
// 4.将统计结构根据数量进行降序处理,取前10名
val resultRDD = analysisRDD.sortBy(_._2, false).take(10)
// 5.将结果采集到控制台打印出来
resultRDD.foreach(println)
sc.stop()
}
方案四
def main(args: Array[String]): Unit = {
//TODO : TOP10热门品类
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("HotCategoryTop10Analysis")
val sc = new SparkContext(sparkConf)
// Q : 存在大量的shuffle操作(reduceByKey)
// reduceByKey 聚合算子;spark会提供优化,缓存
// 1.读取原始日志数据
val actionRDD = sc.textFile("datas/user_visit_action.txt")
val acc = new HotCategoryAccmulator
sc.register(acc, "hotCategory")
// 2.将数据转换结构
actionRDD.foreach(
action => {
val datas = action.split("_")
if (datas(6) != "-1") {
//点击的场合
acc.add((datas(6), "click"))
} else if (datas(8) != "null") {
//下单的场合
val ids = datas(8).split(",")
ids.foreach(
id => {
acc.add((id, "order"))
}
)
} else if (datas(10) != "null") {
//支付的场合
val ids = datas(10).split(",")
ids.foreach(
id => {
acc.add((id, "pay"))
}
)
} else {
Nil
}
}
)
val accVal: mutable.Map[String, HotCategory] = acc.value
val categories: mutable.Iterable[HotCategory] = accVal.map(_._2)
val sort = categories.toList.sortWith(
(left, right) => {
if (left.clickCnt > right.clickCnt) {
true
} else if (left.clickCnt == right.clickCnt) {
if (left.orderCnt > right.orderCnt) {
true
} else if (left.orderCnt == right.orderCnt) {
left.payCnt > right.payCnt
} else {
false
}
}
else {
false
}
}
)
// 5.将结果采集到控制台打印出来
sort.take(10).foreach(println)
sc.stop()
}
case class HotCategory(cid: String, var clickCnt: Int, var orderCnt: Int, var payCnt: Int) {
}
/**
* 自定义累加器
* 1.继承AccumulatorV2,定义泛型
* IN:(品类ID,行为类型)
* OUT: mutable.Map[String,HotCategory]
* 2.重写方法(6)
*/
class HotCategoryAccmulator extends AccumulatorV2[(String, String), mutable.Map[String, HotCategory]] {
private val hcMap = mutable.Map[String, HotCategory]()
override def isZero: Boolean = {
hcMap.isEmpty
}
override def copy(): AccumulatorV2[(String, String), mutable.Map[String, HotCategory]] = {
new HotCategoryAccmulator()
}
override def reset(): Unit = {
hcMap.clear()
}
override def add(v: (String, String)): Unit = {
val cid = v._1
val actionType = v._2
val category: HotCategory = hcMap.getOrElse(cid, HotCategory(cid, 0, 0, 0))
if (actionType == "click") {
category.clickCnt += 1
} else if (actionType == "order") {
category.orderCnt += 1
} else if (actionType == "pay") {
category.payCnt += 1
}
hcMap.update(cid, category)
}
override def merge(other: AccumulatorV2[(String, String), mutable.Map[String, HotCategory]]): Unit = {
val map1 = this.hcMap
val map2 = other.value
map2.foreach {
case (cid, hc) => {
val category: HotCategory = map1.getOrElse(cid, HotCategory(cid, 0, 0, 0))
category.clickCnt += hc.clickCnt
category.orderCnt += hc.orderCnt
category.payCnt += hc.payCnt
map1.update(cid, category)
}
}
}
override def value: mutable.Map[String, HotCategory] = hcMap
}
需求2:在需求一的基础上,增加每个品类用户 session 的点击统计
def main(args: Array[String]): Unit = {
//TODO : TOP10热门品类
val sparkConf = new SparkConf().setMaster("local[*]").setAppName("HotCategoryTop10Analysis")
val sc = new SparkContext(sparkConf)
val actionRDD = sc.textFile("datas/user_visit_action.txt")
actionRDD.cache()
val top10Ids = top10Category(actionRDD)
// 1.过滤原始数据,保留点击和前10品类ID
val filterActionRDD = actionRDD.filter(
action => {
val datas = action.split("_")
if (datas(6) != "-1") {
top10Ids.contains(datas(6))
} else {
false
}
}
)
//2.根据品类ID和sessionid进行点击量的统计
val reduceRDD: RDD[((String, String), Int)] = filterActionRDD.map(
action => {
val datas = action.split("_")
((datas(6), datas(2)), 1)
}
).reduceByKey(_ + _)
//3.将统计的结果进行结构的转换
//((品类ID,sessionId),sum) => (品类ID,(sessionId,sum))
val mapRDD = reduceRDD.map {
case ((cid, sid), sum) => {
(cid, (sid, sum))
}
}
//4.相同的品类进行分组
val groupRDD: RDD[(String,Iterable[(String,Int)])] = mapRDD.groupByKey()
//5.将分组后的数据进行点击量的排序,取前10名
val resultRDD = groupRDD.mapValues(
iter => {
iter.toList.sortBy(_._2)(Ordering.Int).take(10)
}
)
resultRDD.collect().foreach(println)
sc.stop()
}
def top10Category(actionRDD: RDD[String]) = {
val flatRDD: RDD[(String, (Int, Int, Int))] = actionRDD.flatMap(
action => {
val datas = action.split("_")
if (datas(6) != "-1") {
//点击的场合
List((datas(6), (1, 0, 0)))
} else if (datas(8) != "null") {
//下单的场合
val ids = datas(8).split(",")
ids.map(id => (id, (0, 1, 0)))
} else if (datas(10) != "null") {
//支付的场合
val ids = datas(10).split(",")
ids.map(id => (id, (0, 0, 1)))
} else {
Nil
}
}
)
val analysisRDD = flatRDD.reduceByKey(
(t1, t2) => {
(t1._1 + t2._1, t1._2 + t2._2, t1._3 + t2._3)
}
)
analysisRDD.sortBy(_._2, false).take(10).map(_._1)
}