Scala_Spark-电商平台离线分析项目-需求五页面转化率统计
模块二:页面单跳转化率统计业务模块
PageConverStat.scala 方法
import java.util.UUID
import commons.conf.ConfigurationManager
import commons.constant.Constants
import commons.utils.{DateUtils, ParamUtils}
import net.sf.json.JSONObject
import org.apache.spark.SparkConf
import org.apache.spark.sql.{SaveMode, SparkSession}
import scala.collection.mutable
/**
 * PageConverStat.scala
 *
 * Requirement 5: single-hop page conversion rate statistics.
 *
 * The job reads the task parameters from the configuration, loads the user
 * visit actions, derives the page-to-page transitions ("page splits") of every
 * session, counts the transitions that belong to the configured target page
 * flow, converts the counts into conversion rates and appends one result row
 * to the `page_split_conver_rate` table over JDBC.
 */
object PageConverStat {

  /**
   * Job entry point.
   *
   * @param args command-line arguments (not used)
   * @throws IllegalArgumentException if the configured target page flow contains no page id
   */
  def main(args: Array[String]): Unit = {
    // Task filter conditions, stored in the configuration as a JSON string.
    val jsonStr = ConfigurationManager.config.getString(Constants.TASK_PARAMS)
    val taskParam = JSONObject.fromObject(jsonStr)

    // Unique id that identifies this run in the result table.
    val taskUUID = UUID.randomUUID().toString

    val sparkConf = new SparkConf().setAppName("PageConver").setMaster("local[*]")
    val sparkSession = SparkSession.builder().config(sparkConf).enableHiveSupport().getOrCreate()

    // try/finally so the session is released even when a step below fails.
    try {
      // Step 1: user visit actions keyed by session id.
      // Sample element:
      // (88a3cae9b4ac431ebcb97102de07486b,UserVisitAction(0000-00-00,38,88a3cae9b4ac431ebcb97102de07486b,8,0000-00-00 6:20:02,null,40,23,null,null,null,null,4))
      val sessionId2ActionRDD = getUserVisitAction(sparkSession, taskParam)

      // Target page flow from the task parameters, e.g. "1,2,3,4,5,6,7".
      // Blank entries are dropped so that input such as "," or "1, 2" does not
      // break the slicing below or produce splits that can never match.
      val pageFlowStr = ParamUtils.getParam(taskParam, Constants.PARAM_TARGET_PAGE_FLOW)
      val pageFlowArray = Option(pageFlowStr).getOrElse("").split(",").map(_.trim).filter(_.nonEmpty)
      require(pageFlowArray.nonEmpty, "target page flow must contain at least one page id")

      // [1,2,3,...] -> ["1_2","2_3",...]; the Set gives constant-time membership
      // tests inside the RDD closure.
      val targetPageSplit = toPageSplits(pageFlowArray.toSeq).toArray
      val targetPageSplitSet = targetPageSplit.toSet

      // Per session: order the actions by action time, build the consecutive
      // page transitions, keep the ones in the target flow and emit (split, 1).
      val pageSplitNumRDD = sessionId2ActionRDD.groupByKey().flatMap {
        case (_, actions) =>
          val pagesInVisitOrder = actions.toList
            .sortBy(action => DateUtils.parseTime(action.action_time).getTime)
            .map(_.page_id)
          toPageSplits(pagesInVisitOrder)
            .filter(targetPageSplitSet.contains)
            .map(split => (split, 1L))
      }

      // (1_2,55),(2_3,100),... collected on the driver.
      val pageSplitCountMap = pageSplitNumRDD.countByKey()

      // Number of visits to the first page of the flow: the denominator of the
      // first conversion rate.
      val startPage = pageFlowArray.head.toLong
      val startPageCount = sessionId2ActionRDD.filter {
        case (_, action) => action.page_id == startPage
      }.count()

      // Step 2: compute the conversion rates and persist them.
      getPageConvert(sparkSession, taskUUID, targetPageSplit, startPageCount, pageSplitCountMap)
    } finally {
      sparkSession.stop()
    }
  }

  /**
   * Joins every pair of consecutive elements as "a_b".
   * Returns an empty sequence when fewer than two elements are given.
   */
  private def toPageSplits[A](pages: Seq[A]): Seq[String] =
    pages.zip(pages.drop(1)).map { case (from, to) => s"${from}_$to" }

  /**
   * Computes the conversion rate of every target page split, in flow order.
   *
   * The first rate is count(first split) / startPageCount; every later rate is
   * count(split) / count(previous split). A split missing from
   * `pageSplitCountMap` counts as 0, and a rate whose denominator is 0 is
   * reported as 0.0.
   */
  private def computeConversionRates(targetPageSplit: Seq[String],
                                     startPageCount: Long,
                                     pageSplitCountMap: collection.Map[String, Long]): Seq[(String, Double)] = {
    val counts = targetPageSplit.map(split => pageSplitCountMap.getOrElse(split, 0L).toDouble)
    // Denominator of split i: the start page count for i == 0, otherwise the
    // count of split i - 1. zip truncates the extra trailing element.
    val denominators = startPageCount.toDouble +: counts
    targetPageSplit.zip(counts).zip(denominators).map {
      case ((split, count), previous) =>
        val ratio = if (previous == 0.0) 0.0 else count / previous
        (split, ratio)
    }
  }

  /**
   * Step 2: computes the page conversion rates and appends them, as one row,
   * to the `page_split_conver_rate` table.
   *
   * The rates are serialized as "split=ratio" entries joined by "|", one entry
   * per element of `targetPageSplit`, in the same order.
   *
   * @param sparkSession      session used to build and write the result row
   * @param taskUUID          id stored with the result row
   * @param targetPageSplit   target page splits in flow order, e.g. ["1_2","2_3"]
   * @param startPageCount    number of visits to the first page of the flow
   * @param pageSplitCountMap occurrences per page split; missing splits count as 0
   */
  def getPageConvert(sparkSession: SparkSession,
                     taskUUID: String,
                     targetPageSplit: Array[String],
                     startPageCount: Long,
                     pageSplitCountMap: collection.Map[String, Long]): Unit = {
    val convertStr = computeConversionRates(targetPageSplit.toSeq, startPageCount, pageSplitCountMap)
      .map { case (pageSplit, ratio) => pageSplit + "=" + ratio }
      .mkString("|")

    val pageSplit = PageSplitConverRate(taskUUID, convertStr)
    val pageSplitRatioRDD = sparkSession.sparkContext.makeRDD(Array(pageSplit))

    // Append the single result row to the database.
    import sparkSession.implicits._
    pageSplitRatioRDD.toDF().write
      .format("jdbc")
      .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
      .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
      .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
      .option("dbtable", "page_split_conver_rate")
      .mode(SaveMode.Append)
      .save()
  }

  /**
   * Step 1: loads the user visit actions of the configured date range from the
   * `user_visit_action` table.
   *
   * @param sparkSession session used to run the query
   * @param taskParam    task filter conditions
   * @return the matching actions keyed by their session id
   */
  def getUserVisitAction(sparkSession: SparkSession,
                         taskParam: JSONObject): org.apache.spark.rdd.RDD[(String, UserVisitAction)] = {
    val startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE)
    // NOTE(review): the end of the date range is read through PARAM_END_AGE,
    // which by its name looks like an age parameter rather than an end date --
    // confirm against Constants whether an end-date constant was intended.
    val endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_AGE)
    val sql = s"select * from user_visit_action where date>= '$startDate'and date<='$endDate'"

    import sparkSession.implicits._
    sparkSession.sql(sql).as[UserVisitAction].rdd.map(item => (item.session_id, item))
  }
}
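// Troubleshooting note: the error below appears to come from a misspelled UserVisitAction field (`clcik_category_id` instead of `click_category_id`); the case-class field names must match the table's column names for `.as[UserVisitAction]` to resolve.
// Exception in thread "main" org.apache.spark.sql.AnalysisException: cannot resolve '`clcik_category_id`' given input columns: [pay_product_ids, action_time, order_category_ids, pay_category_ids, order_product_ids, click_product_id, search_keyword, user_id, session_id, page_id, date, click_category_id, city_id];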
样例类
/**
 * Result row of requirement 5 (page conversion rate); one instance is written
 * to the `page_split_conver_rate` database table per job run.
 *
 * @param taskid     id of the task run that produced the row
 * @param converRate conversion rates serialized into a single string of
 *                   "split=ratio" entries joined by "|"
 */
case class PageSplitConverRate(
  taskid: String,
  converRate: String
)
/**
 * One row of the `user_visit_action` table, as used by requirement 5.
 *
 * The field names mirror the table's column names; they are what
 * `.as[UserVisitAction]` resolves against, so they must not be renamed.
 *
 * @param date               value compared against the task's date range when loading
 * @param user_id            presumably the id of the acting user -- confirm against the table schema
 * @param session_id         session id; actions are grouped by it
 * @param page_id            id of the visited page; consecutive values form the page splits
 * @param action_time        time of the action as a string; parsed to order a session's actions
 * @param search_keyword     presumably the searched keyword, if any -- confirm against the table schema
 * @param click_category_id  presumably the clicked category id -- confirm against the table schema
 * @param click_product_id   presumably the clicked product id -- confirm against the table schema
 * @param order_category_ids presumably the ordered category ids -- confirm against the table schema
 * @param order_product_ids  presumably the ordered product ids -- confirm against the table schema
 * @param pay_category_ids   presumably the paid category ids -- confirm against the table schema
 * @param pay_product_ids    presumably the paid product ids -- confirm against the table schema
 * @param city_id            presumably the id of the city -- confirm against the table schema
 */
case class UserVisitAction(
  date: String,
  user_id: Long,
  session_id: String,
  page_id: Long,
  action_time: String,
  search_keyword: String,
  click_category_id: Long,
  click_product_id: Long,
  order_category_ids: String,
  order_product_ids: String,
  pay_category_ids: String,
  pay_product_ids: String,
  city_id: Long
)