Scala_Spark-电商平台离线分析项目-需求三top10热门品类
样例类
/**
 * Output record for requirement 3 (top-10 popular categories).
 * This is the row format that is finally written to MySQL.
 *
 * @param taskid     identifier of the task run that produced the row
 * @param categoryid category the three counts belong to
 * @param clickCount click count of the category
 * @param orderCount order count of the category
 * @param payCount   payment count of the category
 */
case class Top10Category(
  taskid: String,
  categoryid: Long,
  clickCount: Long,
  orderCount: Long,
  payCount: Long
)
自定义二次排序key
/**
 * Secondary-sort key for requirement 3.
 * Orders by click count first, then order count, then pay count.
 *
 * The field name `clictCount` (sic) is kept as-is because it is part of the
 * public interface of this case class.
 *
 * @param clictCount click count (primary sort field)
 * @param orderCount order count (used when click counts are equal)
 * @param payCount   pay count (used when click and order counts are equal)
 */
case class SortKey(clictCount: Long, orderCount: Long, payCount: Long) extends Ordered[SortKey] {

  /**
   * Compares two keys field by field.
   *
   * The comparison never subtracts the fields: narrowing a Long difference to
   * Int can truncate it to 0 or flip its sign, which would break the ordering
   * for large counts.
   *
   * @param that the key to compare against
   * @return a negative value if this < that, 0 if equal, a positive value if this > that
   */
  override def compare(that: SortKey): Int = {
    val byClick = java.lang.Long.compare(this.clictCount, that.clictCount)
    if (byClick != 0) {
      byClick
    } else {
      val byOrder = java.lang.Long.compare(this.orderCount, that.orderCount)
      if (byOrder != 0) byOrder
      else java.lang.Long.compare(this.payCount, that.payCount)
    }
  }
}
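/*
 Note: after the results are stored in the database, reading the table back does not
 show clickCount in strictly descending order, which made the secondary sort look
 confusing at first. This is most likely not a problem with the sort itself: a SQL
 table has no guaranteed row order, and Spark's JDBC writer inserts the partitions of
 a multi-partition dataset in parallel. To see the ranking, query the table with
 ORDER BY clickCount DESC, orderCount DESC, payCount DESC.
 */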
实现方法
/**
 * (Step 8, requirement 3, helper 1)
 * Counts click actions per category.
 *
 * Actions whose click_category_id is -1 are dropped; every remaining action
 * contributes (categoryId, 1), and the ones are summed per category.
 *
 * @param sessionId2FilterActionRDD (session id, action) pairs to count over
 * @return (category id, click count) pairs
 */
def getClickCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) =
  sessionId2FilterActionRDD
    .filter { case (_, action) => action.click_category_id != -1L }
    .map { case (_, action) => (action.click_category_id, 1L) }
    .reduceByKey(_ + _)
/**
 * (Step 8, requirement 3, helper 2)
 * Counts order actions per category.
 *
 * Actions with a null order_category_ids are dropped. The field is a
 * comma-separated list of category ids, so one action contributes
 * (categoryId, 1) for every id in the list.
 *
 * @param sessionId2FilterActionRDD (session id, action) pairs to count over
 * @return (category id, order count) pairs
 */
def getOrderCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
  val orderedCategoryOnes = sessionId2FilterActionRDD
    .filter { case (_, action) => action.order_category_ids != null }
    .flatMap { case (_, action) =>
      for (categoryId <- action.order_category_ids.split(",")) yield (categoryId.toLong, 1L)
    }
  orderedCategoryOnes.reduceByKey(_ + _)
}
/**
 * (Step 8, requirement 3, helper 3)
 * Counts payment actions per category.
 *
 * Actions with a null pay_category_ids are dropped. The field is a
 * comma-separated list of category ids, so one action contributes
 * (categoryId, 1) for every id in the list.
 *
 * @param sessionId2FilterActionRDD (session id, action) pairs to count over
 * @return (category id, pay count) pairs
 */
def getPayCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
  val paidActions = sessionId2FilterActionRDD.filter {
    case (_, action) => action.pay_category_ids != null
  }
  val paidCategoryOnes = paidActions.flatMap {
    case (_, action) => action.pay_category_ids.split(",").map(categoryId => (categoryId.toLong, 1L))
  }
  paidCategoryOnes.reduceByKey(_ + _)
}
/**
 * (Step 8, requirement 3, helper 4)
 * Merges the click, order and pay counts of every category into one
 * '|'-separated "key=value" string, in the order
 * category id | click count | order count | pay count.
 *
 * Each count RDD is attached with a left outer join, so a category that is
 * missing from a count RDD gets 0 for that count.
 *
 * @param distinctCid2CidRDD de-duplicated (category id, category id) pairs
 * @param cid2ClickCountRDD  (category id, click count) pairs
 * @param cid2OrderCountRDD  (category id, order count) pairs
 * @param cid2PayCountRDD    (category id, pay count) pairs
 * @return (category id, concatenated count info) pairs
 */
def getFullCount(distinctCid2CidRDD: RDD[(Long, Long)],
                 cid2ClickCountRDD: RDD[(Long, Long)],
                 cid2OrderCountRDD: RDD[(Long, Long)],
                 cid2PayCountRDD: RDD[(Long, Long)]) = {

  // Appends "|<fieldName>=<count>" to the info string of every category;
  // the left outer join yields None for absent categories, which becomes 0.
  def appendCount(cid2InfoRDD: RDD[(Long, String)],
                  cid2CountRDD: RDD[(Long, Long)],
                  fieldName: String): RDD[(Long, String)] =
    cid2InfoRDD.leftOuterJoin(cid2CountRDD).map {
      case (cid, (info, countOpt)) =>
        (cid, s"$info|$fieldName=${countOpt.getOrElse(0L)}")
    }

  // Seed the info string with the category id field.
  val cid2CategoryInfoRDD = distinctCid2CidRDD.map {
    case (cid, categoryId) => (cid, s"${Constants.FIELD_CATEGORY_ID}=$categoryId")
  }

  val cid2ClickInfoRDD = appendCount(cid2CategoryInfoRDD, cid2ClickCountRDD, Constants.FIELD_CLICK_COUNT)
  val cid2OrderInfoRDD = appendCount(cid2ClickInfoRDD, cid2OrderCountRDD, Constants.FIELD_ORDER_COUNT)
  // After the pay count is appended the info string is complete.
  appendCount(cid2OrderInfoRDD, cid2PayCountRDD, Constants.FIELD_PAY_COUNT)
}
/**
 * (Step 8, requirement 3)
 * Computes the top-10 popular categories and appends them to the
 * "top10_category" table over JDBC.
 *
 * Categories are ranked by click count, then order count, then pay count
 * (see SortKey), in descending order.
 *
 * @param sparkSession              session used to build the result RDD and write it out
 * @param taskUUID                  identifier of this task run, stored in every output row
 * @param sessionId2FilterActionRDD (session id, action) pairs of the sessions that passed filtering
 */
def top10PopularCategories(sparkSession: SparkSession, taskUUID: String, sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]): Unit = {
  // Step 1: collect every category that was clicked, ordered or paid for.
  // Each field is checked independently (not as an else-if chain) so that an
  // action carrying more than one kind of category id contributes all of them;
  // this matches getClickCount / getOrderCount / getPayCount, which also test
  // each field on its own.
  val cid2CidRDD = sessionId2FilterActionRDD.flatMap {
    case (_, action) =>
      val categoryIds = new ArrayBuffer[Long]()
      if (action.click_category_id != -1L) { // click action
        categoryIds += action.click_category_id
      }
      if (action.order_category_ids != null) { // order action
        categoryIds ++= action.order_category_ids.split(",").map(_.toLong)
      }
      if (action.pay_category_ids != null) { // pay action
        categoryIds ++= action.pay_category_ids.split(",").map(_.toLong)
      }
      categoryIds.map(cid => (cid, cid))
  }
  // Remove duplicate category ids.
  val distinctCid2CidRDD = cid2CidRDD.distinct()

  // Step 2: count clicks, orders and payments per category.
  // The counts are computed from the full (not de-duplicated) action RDD.
  val cid2ClickCountRDD = getClickCount(sessionId2FilterActionRDD)
  val cid2OrderCountRDD = getOrderCount(sessionId2FilterActionRDD)
  val cid2PayCountRDD = getPayCount(sessionId2FilterActionRDD)
  // Debug: cid2ClickCountRDD.foreach(println(_))
  //   prints (cid, count) pairs such as (93,75), (37,67), ...

  // Step 3: merge the three counts per category into one info string.
  val cid2FullCountRDD = getFullCount(distinctCid2CidRDD, cid2ClickCountRDD, cid2OrderCountRDD, cid2PayCountRDD)
  // Debug: cid2FullCountRDD.foreach(println(_))
  //   prints e.g. (80,categoryid=80|clickCount=79|orderCount=80|payCount=87)

  // Step 4: key every category by its SortKey so it can be ranked.
  val sortkey2FullcountRDD = cid2FullCountRDD.map {
    case (_, countInfo) =>
      val clickCount = StringUtils.getFieldFromConcatString(countInfo, "\\|", Constants.FIELD_CLICK_COUNT).toLong
      val orderCount = StringUtils.getFieldFromConcatString(countInfo, "\\|", Constants.FIELD_ORDER_COUNT).toLong
      val payCount = StringUtils.getFieldFromConcatString(countInfo, "\\|", Constants.FIELD_PAY_COUNT).toLong
      (SortKey(clickCount, orderCount, payCount), countInfo)
  }
  // sortByKey(false) sorts in descending order; take returns a local Array.
  val top10CategoryArray: Array[(SortKey, String)] = sortkey2FullcountRDD.sortByKey(false).take(10)

  // Array -> RDD. A single partition is enough for ten rows and keeps them in
  // sorted order while they are inserted through one JDBC connection.
  // NOTE: the table itself still guarantees no row order; query with ORDER BY.
  val top10CategoryRDD = sparkSession.sparkContext.makeRDD(top10CategoryArray, numSlices = 1).map {
    case (sortKey, countInfo) =>
      val cid = StringUtils.getFieldFromConcatString(countInfo, "\\|", Constants.FIELD_CATEGORY_ID).toLong
      Top10Category(taskUUID, cid, sortKey.clictCount, sortKey.orderCount, sortKey.payCount)
  }

  // Step 5: append the result rows to the database.
  import sparkSession.implicits._
  top10CategoryRDD.toDF().write
    .format("jdbc")
    .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
    .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
    .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
    .option("dbtable", "top10_category")
    .mode(SaveMode.Append)
    .save()
}
主方法(程序入口)
/**
 * Program entry point. Runs the session analysis steps in order:
 * load and group the actions, aggregate per session, filter sessions
 * (updating the accumulator), compute the session ratios, randomly extract
 * sessions (requirement 2) and compute the top-10 popular categories
 * (requirement 3).
 *
 * The SparkSession is stopped when the method finishes, whether it
 * completes normally or throws.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  // Filter conditions for this task, stored as a JSON string in the configuration.
  val jsonStr = ConfigurationManager.config.getString(Constants.TASK_PARAMS)
  val taskParam = JSONObject.fromObject(jsonStr)
  // Globally unique key identifying this run.
  val taskUUID = UUID.randomUUID().toString
  val sparkConf = new SparkConf().setMaster("local[*]").setAppName("session")
  // The SparkSession also provides the SparkContext.
  val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()

  try {
    // Raw action table. actionRDD: RDD[UserVisitAction]
    val actionRDD = getOriActionRDD(sparkSession, taskParam)
    // Debug 1: actionRDD.foreach(println(_))

    // sessionID2ActionRDD: RDD[(sessionId, UserVisitAction)]
    val sessionID2ActionRDD = actionRDD.map(item => (item.session_id, item))
    // session2GroupActionRDD: RDD[(sessionId, Iterable[UserVisitAction])]
    val session2GroupActionRDD = sessionID2ActionRDD.groupByKey()
    session2GroupActionRDD.cache()
    // Debug 2: session2GroupActionRDD.foreach(println(_))

    // Aggregate the actions of every session into its full-info record.
    // Debug 3:
    //   val userId2AggrInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
    //   userId2AggrInfoRDD.foreach(println(_))
    val sessionId2FullInfoRDD = getSessionFullInfo(sparkSession, session2GroupActionRDD)
    sessionId2FullInfoRDD.foreach(println(_))

    // Aggregation is done; filtering starts here.
    // Register the custom accumulator; it is updated while sessions are filtered.
    val sessionAccumulator = new SessionAccumulator
    sparkSession.sparkContext.register(sessionAccumulator)
    // sessionId2FilteredRDD: RDD[(sessionId, fullInfo)] -- the sessions matching the
    // filter conditions. It feeds three actions below, so it is cached: without the
    // cache every action would re-run the filter and re-apply the accumulator updates.
    val sessionId2FilteredRDD = getSessionFilteredRDD(taskParam, sessionId2FullInfoRDD, sessionAccumulator)
    sessionId2FilteredRDD.cache()
    // An action is required here so the accumulator is populated before it is read.
    sessionId2FilteredRDD.foreach(println(_))

    // Compute the ratios from the accumulator value and store them in MySQL.
    getSessionRatio(sparkSession, taskUUID, sessionAccumulator.value)

    // Requirement 2 (step 7): random session extraction.
    sessionRandomExtract(sparkSession, taskUUID, sessionId2FilteredRDD)

    // Requirement 3: top-10 popular categories.
    // Joining the raw actions with the filtered sessions keeps only the actions
    // that belong to sessions matching the filter conditions.
    val sessionId2FilterActionRDD = sessionID2ActionRDD.join(sessionId2FilteredRDD).map {
      case (sessionId, (action, _)) => (sessionId, action)
    }
    // Step 8: top-10 categories by click, order and pay counts.
    top10PopularCategories(sparkSession, taskUUID, sessionId2FilterActionRDD)
  } finally {
    // Release the Spark resources even if one of the steps fails.
    sparkSession.stop()
  }
}