数据格式
样例类
/**
 * One record of the user-visit log.
 *
 * The analysis code in this file builds an instance from a single text line split on "_",
 * assigning the 13 resulting fields positionally in the order declared here.
 *
 * @param date               field 0 of the record (presumably the action date)
 * @param user_id            field 1, parsed as Long
 * @param session_id         field 2; the page-jump analysis groups records by this value
 * @param page_id            field 3, parsed as Long
 * @param action_time        field 4; compared as a string to order actions within a session
 * @param search_keyword     field 5 (presumably the searched keyword, if any)
 * @param click_category_id  field 6; the value -1 is treated as "not a click" by the analysis
 * @param click_product_id   field 7, parsed as Long
 * @param order_category_ids field 8; comma-separated IDs, or the literal string "null" when absent
 * @param order_product_ids  field 9 (presumably the same encoding as order_category_ids)
 * @param pay_category_ids   field 10; comma-separated IDs, or the literal string "null" when absent
 * @param pay_product_ids    field 11 (presumably the same encoding as pay_category_ids)
 * @param city_id            field 12, parsed as Long
 */
case class UserVisitAction(
  date: String,
  user_id: Long,
  session_id: String,
  page_id: Long,
  action_time: String,
  search_keyword: String,
  click_category_id: Long,
  click_product_id: Long,
  order_category_ids: String,
  order_product_ids: String,
  pay_category_ids: String,
  pay_product_ids: String,
  city_id: Long
)
/**
 * Click / order / pay counters for a single category.
 *
 * The counters are declared as `var` so that they can be updated in place.
 *
 * @param categoryId category identifier, kept as a string
 * @param clickCount number of click actions counted for the category
 * @param orderCount number of order actions counted for the category
 * @param payCount   number of pay actions counted for the category
 */
case class CategoryCountInfo(
  var categoryId: String,
  var clickCount: Long,
  var orderCount: Long,
  var payCount: Long
)
/**
 * Holds the Spark configuration and the SparkContext built from it.
 *
 * Both values are created when this object is first initialised.
 */
object Project_demand2 {
  // "local[*]" is Spark's local master URL using as many worker threads as logical cores.
  val conf: SparkConf =
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("SparkCoreTest")

  val sc: SparkContext = new SparkContext(conf)
}
需求1:Top10热门品类
需求分析:
1.先求热门品类Top10,就是统计品类下面的:点击数,下单数,支付数后,根据以上信息排名
排序优先级:点击数 > 下单数 > 支付数(先比较点击数,点击数相同再比较下单数,最后比较支付数)
1.1 将数据封装到样例类 UserVisitAction
1.2 结构转换,提取与需求相关的信息 封装到CategoryCountInfo对象中
1.3 聚合点击数,下单数,支付数 => RDD[CategoryCountInfo]
1.4 排序取Top10
class Project_demand2 {
/**
 * Stops the SparkContext `sc`.
 *
 * Annotated with `@After` (presumably JUnit's), so it is expected to run after each test method.
 */
@After
def close(): Unit = sc.stop()
/**
 * Reads `input/user_visit_action.txt` and prints the result of three analyses:
 *
 *  1. The top-10 hot categories, ranked by click count, then order count, then pay count.
 *  2. For each of those categories, the 10 cities with the most clicks.
 *  3. The single-jump conversion rate for the consecutive page pairs 1->2, 2->3, ..., 6->7.
 *
 * Every input line is split on "_" and must yield at least 13 fields; the numeric fields are
 * parsed with `toLong`, so a malformed line fails the job with an exception.
 */
@Test
def test: Unit = {
  val rdd01: RDD[String] = sc.textFile("input/user_visit_action.txt")

  // Parse each raw line into a UserVisitAction (fields are positional).
  val actionRdd: RDD[UserVisitAction] = rdd01.map { line =>
    val infos: Array[String] = line.split("_")
    UserVisitAction(
      date = infos(0),
      user_id = infos(1).toLong,
      session_id = infos(2),
      page_id = infos(3).toLong,
      action_time = infos(4),
      search_keyword = infos(5),
      click_category_id = infos(6).toLong,
      click_product_id = infos(7).toLong,
      order_category_ids = infos(8),
      order_product_ids = infos(9),
      pay_category_ids = infos(10),
      pay_product_ids = infos(11),
      city_id = infos(12).toLong
    )
  }

  // Requirement 1: top-10 hot categories.
  println("----------------------需求1 : Top10热门品类------------------------------")

  // Emit one (categoryId, counters) pair per category touched by the action.
  // A record is treated as a click first, then as an order, then as a payment;
  // -1 / "null" mark an absent click / order / payment respectively.
  val countInfoRdd: RDD[(String, CategoryCountInfo)] = actionRdd.flatMap { action =>
    if (action.click_category_id != -1) {
      val id = action.click_category_id.toString
      List((id, CategoryCountInfo(id, 1L, 0L, 0L)))
    } else if (action.order_category_ids != "null") {
      action.order_category_ids.split(",").toList.map(id => (id, CategoryCountInfo(id, 0L, 1L, 0L)))
    } else if (action.pay_category_ids != "null") {
      action.pay_category_ids.split(",").toList.map(id => (id, CategoryCountInfo(id, 0L, 0L, 1L)))
    } else {
      Nil
    }
  }

  // Sum the counters per category. A fresh object is returned instead of mutating an input,
  // so the records handed to the reducer are never modified.
  val countReduceInfoRdd: RDD[(String, CategoryCountInfo)] = countInfoRdd.reduceByKey { (a, b) =>
    a.copy(
      clickCount = a.clickCount + b.clickCount,
      orderCount = a.orderCount + b.orderCount,
      payCount = a.payCount + b.payCount
    )
  }

  // Tuple ordering gives the priority click > order > pay.
  val categoryTop10: Array[CategoryCountInfo] = countReduceInfoRdd
    .map(_._2)
    .sortBy(info => (info.clickCount, info.orderCount, info.payCount), ascending = false)
    .take(10)
  categoryTop10.foreach(println)

  // Requirement 2: top-10 active cities for each of the top-10 hot categories.
  println("----------------------需求2 : Top10热门品类中每个品类的Top10活跃城市统计------------------------------")

  // Broadcast the ids as a Set so the per-record membership test is a hash lookup.
  val topCategoryIds: Broadcast[Set[String]] = sc.broadcast(categoryTop10.map(_.categoryId).toSet)
  val cateTop10Rdd: RDD[UserVisitAction] =
    actionRdd.filter(action => topCategoryIds.value.contains(action.click_category_id.toString))

  // Count clicks per (category, city). A tuple key avoids encoding the pair as "a-b" and
  // splitting it again, which would break on any id containing '-'.
  val cateAndCitySumRdd: RDD[((String, String), Int)] = cateTop10Rdd
    .map(action => ((action.click_category_id.toString, action.city_id.toString), 1))
    .reduceByKey(_ + _)

  val cateWithCityAndSum: RDD[(String, (String, Int))] = cateAndCitySumRdd.map {
    case ((cateId, cityId), sum) => (cateId, (cityId, sum))
  }

  // Per category: order cities by descending click count and keep the first 10.
  val cateUnderCityTop10: RDD[(String, List[(String, Int)])] =
    cateWithCityAndSum.groupByKey().mapValues { cityCounts =>
      cityCounts.toList.sortBy(_._2)(Ordering[Int].reverse).take(10)
    }
  cateUnderCityTop10.collect().foreach(println)

  // Requirement 3: single-jump page conversion rate.
  println("----------------------需求3 : 页面单跳转化率------------------------------")

  val pageIds = List(1L, 2L, 3L, 4L, 5L, 6L, 7L)

  // Denominators: visit count of every page that can be the source of a jump.
  // The last page is never a source, hence `init`.
  val sourcePageIds: Broadcast[List[Long]] = sc.broadcast(pageIds.init)
  val pageIdAndSumMap: collection.Map[Long, Long] = actionRdd
    .filter(action => sourcePageIds.value.contains(action.page_id))
    .map(action => (action.page_id, 1L))
    .reduceByKey(_ + _)
    .collectAsMap()

  // The jumps of interest (1->2, 2->3, ...), built once here rather than once per session.
  val demandPageIds: Set[(Long, Long)] = pageIds.zip(pageIds.tail).toSet

  // Numerators: per session, order the actions by time, pair each page with its successor,
  // and count the pairs that are jumps of interest.
  val pageAndPageSumRdd: RDD[((Long, Long), Long)] = actionRdd
    .groupBy(_.session_id)
    .flatMap { case (_, actions) =>
      // action_time is compared as a string; assumes a lexicographically sortable
      // timestamp format -- TODO confirm against the input data.
      val pageIdList: List[Long] = actions.toList.sortBy(_.action_time).map(_.page_id)
      pageIdList.zip(pageIdList.tail).filter(demandPageIds.contains).map(jump => (jump, 1L))
    }
    .reduceByKey(_ + _)

  // Collect before printing so the output is produced on the driver; the result holds at most
  // one entry per jump of interest.
  pageAndPageSumRdd.collect().foreach { case (jump @ (fromPage, _), sum) =>
    println(s"$jump:${sum.toDouble / pageIdAndSumMap.getOrElse(fromPage, 1L)}")
  }
}