package com.rdd.topn
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
// Page-flow analysis: single-hop conversion rate between consecutive pages
/**
 * Computes the single-hop page conversion rate from a user-visit-action log:
 * for each allowed hop (page A -> page B), rate = (#sessions hopping A->B) / (#visits of A).
 *
 * NOTE(review): the original source was paste-damaged (curly quotes, all `_`
 * placeholders dropped, a println exploded across lines); restored to valid Scala.
 */
object PageflowAnalysis2 {

  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("HotCategoryTop10Analysis")
    val sc = new SparkContext(conf)

    val actionRDD: RDD[String] = sc.textFile("in/user_visit_action.txt")

    // Parse each underscore-delimited line into a UserVisitAction record.
    // (The original `split("")` split on every single character and could never
    // have produced valid fields; the sample dataset is `_`-separated.)
    val actionDataRDD: RDD[UserVisitAction] = actionRDD.map(
      actions => {
        val datas = actions.split("_")
        UserVisitAction(
          datas(0),
          datas(1).toLong,
          datas(2),
          datas(3).toLong,
          datas(4),
          datas(5),
          datas(6).toLong,
          datas(7).toLong,
          datas(8),
          datas(9),
          datas(10),
          datas(11),
          datas(12).toLong
        )
      }
    )

    // TODO compute the denominator: visit count per source page.
    val ids = List(1L, 2L, 3L, 4L, 5L, 6L, 7L)
    // Allowed single hops: (1,2), (2,3), ... (6,7) — a list zipped with its own tail.
    val okflowIds: List[(Long, Long)] = ids.zip(ids.tail)

    val pageidToCountMap: Map[Long, Long] = actionDataRDD
      // The last page id is never the source of a hop, hence `ids.init`.
      .filter(action => ids.init.contains(action.page_id))
      .map(action => (action.page_id, 1L))
      .reduceByKey(_ + _)
      .collect()
      .toMap

    // Numerator: group actions by session ...
    val sessionRDD: RDD[(String, Iterable[UserVisitAction])] =
      actionDataRDD.groupBy(_.session_id)

    // ... then sort each session's actions by time, pair consecutive page ids
    // (zip-with-tail acts as a sliding window of size 2) and keep allowed hops only.
    val mvRDD = sessionRDD.mapValues(
      iter => {
        val sortList: List[UserVisitAction] = iter.toList.sortBy(_.action_time)
        val flowIds: List[Long] = sortList.map(_.page_id)
        val pageflowIds: List[(Long, Long)] = flowIds.zip(flowIds.tail)
        pageflowIds
          .filter(t => okflowIds.contains(t))
          .map(t => (t, 1))
      }
    )

    // ((page1, page2), 1) pairs across all sessions, summed per hop.
    val flatRDD: RDD[((Long, Long), Int)] = mvRDD.map(_._2).flatMap(list => list)
    val dataRDD: RDD[((Long, Long), Int)] = flatRDD.reduceByKey(_ + _)

    // Conversion rate of each hop = hop count / visits of the source page.
    dataRDD.foreach {
      case ((pageid1, pageid2), sum) => {
        val lon: Long = pageidToCountMap.getOrElse(pageid1, 0L)
        // Guard against a missing denominator (the original printed Infinity on 0).
        val rate = if (lon == 0L) 0.0 else sum.toDouble / lon
        // "Single-hop conversion rate from page X to page Y is ..."
        println(s"页面${pageid1}跳转到页面${pageid2}的单跳转换率为" + rate)
      }
    }

    // Sliding-window illustration:
    // [1,2,3,4,5,6]  zipped with its tail [2,3,4,5,6]
    //   -> pairs [1-2, 2-3, 3-4, 4-5, 5-6]
    sc.stop()
  }
}