做什么?
在符合条件的用户行为数据中,获取点击、下单和支付数量排名前10的品类。在Top10的排序中,按照点击数量、下单数量、支付数量的次序进行排序,即优先考虑点击数量。
需求分析
首先,我们想要在符合条件的action中统计出排名前十的热门品类,并且排名依次以点击数量、下单数量、支付数量为依据进行排序。所以通过逆推:
top10商品–>(id,(clickCount=83|orderCount=67|payCount=63))------>分别统计(id,clickCount=…),(id,orderCount=…)--------------->需要得到符合条件的原始数据
步骤分析
- 得到符合需求一中过滤条件的原始数据—join算子
// Load the raw actions for this task and key every action by its session id.
val actionRdd = serverOne.getOriActionRDD(session, task)
val sessionId2ActionRDD = actionRdd.map(action => (action.session_id, action))
// Inner join against FilterInfo (presumably the sessions that passed the earlier
// filter step -- confirm against the caller): only actions whose session id also
// appears in FilterInfo survive, and the joined info value is discarded.
val sessionId2FilterActionRDD = sessionId2ActionRDD
  .join(FilterInfo)
  .map { case (sessionId, (action, _)) => (sessionId, action) }
- 获取所有发生过点击、下单、支付行为的categoryId
// Build the de-duplicated set of category ids that were clicked, ordered or paid
// for, as (cid, cid) pairs so it can serve as the left side of later joins.
//
// The three checks are independent `if`s rather than an `else if` chain: the
// click/order/pay counters each inspect their own field independently, so an
// action carrying more than one kind of category id must contribute all of them
// here too, otherwise its counts would be dropped by the left joins.
val cid2CidRdd = sessionId2FilterActionRDD.flatMap {
  case (_, action: UserVisitAction) =>
    val categoryBuffer = new ArrayBuffer[(Long, Long)]()
    // Click: a click_category_id of -1 is treated as "no click on this action".
    if (action.click_category_id != -1) {
      categoryBuffer += ((action.click_category_id, action.click_category_id))
    }
    // Order: comma-separated category ids; null means the action is not an order.
    if (action.order_category_ids != null) {
      for (orderCid <- action.order_category_ids.split(","))
        categoryBuffer += ((orderCid.toLong, orderCid.toLong))
    }
    // Payment: comma-separated category ids; null means the action is not a payment.
    if (action.pay_category_ids != null) {
      for (payCid <- action.pay_category_ids.split(","))
        categoryBuffer += ((payCid.toLong, payCid.toLong))
    }
    categoryBuffer
}.distinct()
- 分别统计点击、下单、支付行为的数量:
// Step 2: per-category click, order and payment counts, each as (categoryId, count).
val cid2ClickCountRDD: RDD[(Long, Long)] =
  getClickCount(sessionId2FilterActionRDD)
val cid2OrderCountRDD: RDD[(Long, Long)] =
  getOrderCount(sessionId2FilterActionRDD)
val cid2PayCountRDD: RDD[(Long, Long)] =
  getPayCount(sessionId2FilterActionRDD)
/**
 * Counts click actions per category.
 *
 * @param sessionId2FilterActionRDD (sessionId, action) pairs
 * @return (categoryId, clickCount) pairs, one per clicked category
 */
def getClickCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
  // Actions whose click_category_id is -1 are skipped.
  val clicked = sessionId2FilterActionRDD.filter {
    case (_, action: UserVisitAction) => action.click_category_id != -1L
  }
  // One (categoryId, 1) per remaining action, summed per category.
  clicked
    .map { case (_, action) => (action.click_category_id, 1L) }
    .reduceByKey((left, right) => left + right)
}
/**
 * Counts order actions per category.
 *
 * An action's `order_category_ids` is a comma-separated list of category ids;
 * every id in the list counts as one order for that category. Actions whose
 * `order_category_ids` is null are ignored.
 *
 * @param sessionId2FilterActionRDD (sessionId, action) pairs
 * @return (categoryId, orderCount) pairs, one per ordered category
 */
def getOrderCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]): RDD[(Long, Long)] = {
  sessionId2FilterActionRDD
    .filter { case (_, action) => action.order_category_ids != null }
    .flatMap { case (_, action) =>
      // Split once and emit (categoryId, 1) per id; the previous empty `for`
      // loop split the same string a second time and did nothing with it.
      action.order_category_ids.split(",").map(cid => (cid.toLong, 1L))
    }
    .reduceByKey(_ + _)
}
/**
 * Counts payment actions per category.
 *
 * @param sessionId2FilterActionRDD (sessionId, action) pairs
 * @return (categoryId, payCount) pairs, one per paid category
 */
def getPayCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]) = {
  // Keep only actions that carry a (comma-separated) list of paid category ids.
  val paidActions = sessionId2FilterActionRDD.filter { pair =>
    pair._2.pay_category_ids != null
  }
  // Emit (categoryId, 1) for every id in the list.
  val onePerPaidCategory = paidActions.flatMap { pair =>
    val paidIds = pair._2.pay_category_ids.split(",")
    paidIds.map(cid => (cid.toLong, 1L))
  }
  onePerPaidCategory.reduceByKey((left, right) => left + right)
}
- 用左连接(leftOuterJoin)算子,把各项统计数据依次汇总到一起,最终格式为(categoryId, str),其中str代表汇总后的数据,形如:clickCount=83|orderCount=67|payCount=63
/**
 * Left-joins the full category id set with the click, order and payment counts
 * and folds them into one '|'-separated "field=value" string per category.
 *
 * A category missing from any of the count RDDs gets a count of 0 for that
 * field, so every output string carries all three counts.
 *
 * @param cid2CidRDD        (categoryId, categoryId) pairs for every category of interest
 * @param cid2ClickCountRDD (categoryId, clickCount)
 * @param cid2OrderCountRDD (categoryId, orderCount)
 * @param cid2PayCountRDD   (categoryId, payCount)
 * @return (categoryId, aggregated info string)
 */
def getFullCount(cid2CidRDD: RDD[(Long, Long)], cid2ClickCountRDD: RDD[(Long, Long)], cid2OrderCountRDD: RDD[(Long, Long)], cid2PayCountRDD: RDD[(Long, Long)]) = {
  val cid2ClickInfoRDD = cid2CidRDD.leftOuterJoin(cid2ClickCountRDD).map {
    case (cId, (_, clickOpt)) =>
      // Default to 0 when the category was never clicked. The previous
      // `if (isDefined) getOrElse(0)` had no else branch, so a missing count
      // became the Unit value and was rendered as "()" in the info string,
      // which then failed to parse as a number downstream.
      val clickCount = clickOpt.getOrElse(0L)
      val aggrCount = Constants.FIELD_CATEGORY_ID + "=" + cId + "|" +
        Constants.FIELD_CLICK_COUNT + "=" + clickCount
      (cId, aggrCount)
  }
  val cid2OrderInfoRDD = cid2ClickInfoRDD.leftOuterJoin(cid2OrderCountRDD).map {
    case (cid, (clickInfo, orderOpt)) =>
      val orderCount = orderOpt.getOrElse(0L)
      val aggrInfo = clickInfo + "|" +
        Constants.FIELD_ORDER_COUNT + "=" + orderCount
      (cid, aggrInfo)
  }
  val cid2PayInfoRDD = cid2OrderInfoRDD.leftOuterJoin(cid2PayCountRDD).map {
    case (cid, (orderInfo, payOpt)) =>
      val payCount = payOpt.getOrElse(0L)
      val aggrInfo = orderInfo + "|" +
        Constants.FIELD_PAY_COUNT + "=" + payCount
      (cid, aggrInfo)
  }
  cid2PayInfoRDD
}
- 自定义排序器,将数据转化为(sortKey,info)格式,接着用sortByKey进行降序排序
自定义排序:
package server
/**
 * Composite sort key for ranking categories: click count first, then order
 * count, then payment count.
 *
 * @param clickCount number of clicks
 * @param orderCount number of orders
 * @param payCount   number of payments
 */
case class SortKey(clickCount:Long, orderCount:Long, payCount:Long) extends Ordered[SortKey]{
  /**
   * Compares this key with `that`.
   *
   * @return a negative value if this < that, zero if they are equal,
   *         a positive value if this > that
   */
  override def compare(that: SortKey): Int = {
    // Compare the Long fields directly. Subtracting and truncating with
    // `.toInt` gives the wrong sign (or zero) once the difference no longer
    // fits in an Int, e.g. a difference of 2^32 would compare as "equal".
    val byClick = java.lang.Long.compare(this.clickCount, that.clickCount)
    if (byClick != 0) byClick
    else {
      val byOrder = java.lang.Long.compare(this.orderCount, that.orderCount)
      if (byOrder != 0) byOrder
      else java.lang.Long.compare(this.payCount, that.payCount)
    }
  }
}
// Step 4: re-key every category by a SortKey parsed out of its aggregated info
// string, keeping the info string as the value.
val sortRDD = cid2FullCountRDD.map { case (_, info) =>
  val key = SortKey(
    clickCount = StringUtil.getFieldFromConcatString(info, "\\|", Constants.FIELD_CLICK_COUNT).toLong,
    orderCount = StringUtil.getFieldFromConcatString(info, "\\|", Constants.FIELD_ORDER_COUNT).toLong,
    payCount = StringUtil.getFieldFromConcatString(info, "\\|", Constants.FIELD_PAY_COUNT).toLong
  )
  key -> info
}
// Step 5: sort in descending key order and bring the first ten to the driver.
val top10 = sortRDD.sortByKey(ascending = false).take(10)
- 数据封装,写入数据库
// Step 6: wrap each top-10 entry in a Top10Category record, ready to be written
// to the database.
val top10CategoryRDD = sparkSession.sparkContext.makeRDD(top10).map { entry =>
  val (sortKey, countInfo) = entry
  // The category id is read back out of the aggregated info string; the three
  // counts come straight from the sort key.
  val cid = StringUtil.getFieldFromConcatString(countInfo, "\\|", Constants.FIELD_CATEGORY_ID).toLong
  Top10Category(taskUUID, cid, sortKey.clickCount, sortKey.orderCount, sortKey.payCount)
}
//保存到数据库
/* import sparkSession.implicits._
top10CategoryRDD.toDF().write
.format("jdbc")
.option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
.option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
.option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
.option("dbtable", "top10_category_0308")
.mode(SaveMode.Append)
.save*/
完整代码:
package server
import commons.constant.Constants
import commons.model.{Top10Category, UserVisitAction}
import commons.utils.StringUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.ArrayBuffer
/**
 * Computes the ten most popular categories from (sessionId, action) pairs,
 * ranked by click count, then order count, then payment count.
 */
class serverThree extends Serializable {

  /**
   * Finds the top-10 categories by (clicks, orders, payments), prints them, and
   * returns them to the driver.
   *
   * @param sparkSession              session used to build the result RDD
   * @param taskUUID                  task identifier stored on every Top10Category record
   * @param sessionId2FilterActionRDD (sessionId, action) pairs to analyse
   * @return up to ten (SortKey, aggregated info string) pairs in descending key order
   */
  def top10PopularCategories(sparkSession: SparkSession,
                             taskUUID: String,
                             sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]): Array[(SortKey, String)] = {
    // 1. Build the de-duplicated set of category ids that were clicked, ordered
    //    or paid for, as (cid, cid) pairs. The three checks are independent
    //    `if`s (not an `else if` chain) so that an action carrying more than one
    //    kind of category id contributes all of them, matching how the three
    //    counters below each inspect their own field.
    val cid2CidRdd = sessionId2FilterActionRDD.flatMap {
      case (_, action: UserVisitAction) =>
        val categoryBuffer = new ArrayBuffer[(Long, Long)]()
        // Click: a click_category_id of -1 is treated as "no click".
        if (action.click_category_id != -1) {
          categoryBuffer += ((action.click_category_id, action.click_category_id))
        }
        // Order: comma-separated category ids, null when not an order.
        if (action.order_category_ids != null) {
          for (orderCid <- action.order_category_ids.split(","))
            categoryBuffer += ((orderCid.toLong, orderCid.toLong))
        }
        // Payment: comma-separated category ids, null when not a payment.
        if (action.pay_category_ids != null) {
          for (payCid <- action.pay_category_ids.split(","))
            categoryBuffer += ((payCid.toLong, payCid.toLong))
        }
        categoryBuffer
    }.distinct()

    // 2. Per-category click, order and payment counts.
    val cid2ClickCountRDD = getClickCount(sessionId2FilterActionRDD)
    val cid2OrderCountRDD = getOrderCount(sessionId2FilterActionRDD)
    val cid2PayCountRDD = getPayCount(sessionId2FilterActionRDD)

    // 3. Left-join the id set with each count RDD in turn, producing
    //    (cid, "field=value|field=value|...") with all three counts.
    val cid2FullCountRDD = getFullCount(cid2CidRdd, cid2ClickCountRDD, cid2OrderCountRDD, cid2PayCountRDD)

    // 4. Re-key by a SortKey parsed from the aggregated info string.
    val sortRDD = cid2FullCountRDD.map {
      case (_, info) =>
        val clickCount = StringUtil.getFieldFromConcatString(info, "\\|", Constants.FIELD_CLICK_COUNT).toLong
        val orderCount = StringUtil.getFieldFromConcatString(info, "\\|", Constants.FIELD_ORDER_COUNT).toLong
        val payCount = StringUtil.getFieldFromConcatString(info, "\\|", Constants.FIELD_PAY_COUNT).toLong
        (SortKey(clickCount, orderCount, payCount), info)
    }

    // 5. Sort descending and take the first ten.
    val top10 = sortRDD.sortByKey(false).take(10)
    top10.foreach(println)

    // 6. Wrap the result in Top10Category records for persistence. The RDD is
    //    only consumed by the (currently disabled) JDBC write below.
    val top10CategoryRDD = sparkSession.sparkContext.makeRDD(top10).map {
      case (sortKey, countInfo) =>
        val cid = StringUtil.getFieldFromConcatString(countInfo, "\\|", Constants.FIELD_CATEGORY_ID).toLong
        val clickCount = sortKey.clickCount
        val orderCount = sortKey.orderCount
        val payCount = sortKey.payCount
        Top10Category(taskUUID, cid, clickCount, orderCount, payCount)
    }

    // Save to the database (disabled).
    /* import sparkSession.implicits._
    top10CategoryRDD.toDF().write
    .format("jdbc")
    .option("url", ConfigurationManager.config.getString(Constants.JDBC_URL))
    .option("user", ConfigurationManager.config.getString(Constants.JDBC_USER))
    .option("password", ConfigurationManager.config.getString(Constants.JDBC_PASSWORD))
    .option("dbtable", "top10_category_0308")
    .mode(SaveMode.Append)
    .save*/
    top10
  }

  /**
   * Left-joins the full category id set with the click, order and payment
   * counts and folds them into one '|'-separated "field=value" string per
   * category. A category missing from a count RDD gets 0 for that field.
   *
   * @param cid2CidRDD        (categoryId, categoryId) pairs for every category of interest
   * @param cid2ClickCountRDD (categoryId, clickCount)
   * @param cid2OrderCountRDD (categoryId, orderCount)
   * @param cid2PayCountRDD   (categoryId, payCount)
   * @return (categoryId, aggregated info string)
   */
  def getFullCount(cid2CidRDD: RDD[(Long, Long)], cid2ClickCountRDD: RDD[(Long, Long)], cid2OrderCountRDD: RDD[(Long, Long)], cid2PayCountRDD: RDD[(Long, Long)]): RDD[(Long, String)] = {
    val cid2ClickInfoRDD = cid2CidRDD.leftOuterJoin(cid2ClickCountRDD).map {
      case (cId, (_, clickOpt)) =>
        // Default to 0 when the category was never clicked. The previous
        // `if (isDefined) getOrElse(0)` had no else branch, so a missing count
        // was rendered as "()" and broke the numeric parse in step 4.
        val clickCount = clickOpt.getOrElse(0L)
        val aggrCount = Constants.FIELD_CATEGORY_ID + "=" + cId + "|" +
          Constants.FIELD_CLICK_COUNT + "=" + clickCount
        (cId, aggrCount)
    }
    val cid2OrderInfoRDD = cid2ClickInfoRDD.leftOuterJoin(cid2OrderCountRDD).map {
      case (cid, (clickInfo, orderOpt)) =>
        val orderCount = orderOpt.getOrElse(0L)
        val aggrInfo = clickInfo + "|" +
          Constants.FIELD_ORDER_COUNT + "=" + orderCount
        (cid, aggrInfo)
    }
    val cid2PayInfoRDD = cid2OrderInfoRDD.leftOuterJoin(cid2PayCountRDD).map {
      case (cid, (orderInfo, payOpt)) =>
        val payCount = payOpt.getOrElse(0L)
        val aggrInfo = orderInfo + "|" +
          Constants.FIELD_PAY_COUNT + "=" + payCount
        (cid, aggrInfo)
    }
    cid2PayInfoRDD
  }

  /**
   * Counts click actions per category; actions whose click_category_id is -1
   * are skipped.
   *
   * @param sessionId2FilterActionRDD (sessionId, action) pairs
   * @return (categoryId, clickCount)
   */
  def getClickCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]): RDD[(Long, Long)] = {
    val clickFilterRDD = sessionId2FilterActionRDD.filter {
      case (_, action: UserVisitAction) => action.click_category_id != -1L
    }
    val clickNumRDD = clickFilterRDD.map {
      case (_, action) => (action.click_category_id, 1L)
    }
    clickNumRDD.reduceByKey(_ + _)
  }

  /**
   * Counts order actions per category. Every id in an action's comma-separated
   * `order_category_ids` counts once; actions with a null list are ignored.
   *
   * @param sessionId2FilterActionRDD (sessionId, action) pairs
   * @return (categoryId, orderCount)
   */
  def getOrderCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]): RDD[(Long, Long)] = {
    val orderFilterRDD = sessionId2FilterActionRDD.filter(item => item._2.order_category_ids != null)
    val orderNumRDD = orderFilterRDD.flatMap {
      case (_, action) =>
        // Split once; the former empty `for` loop over the same split was dead code.
        action.order_category_ids.split(",").map(item => (item.toLong, 1L))
    }
    orderNumRDD.reduceByKey(_ + _)
  }

  /**
   * Counts payment actions per category. Every id in an action's
   * comma-separated `pay_category_ids` counts once; actions with a null list
   * are ignored.
   *
   * @param sessionId2FilterActionRDD (sessionId, action) pairs
   * @return (categoryId, payCount)
   */
  def getPayCount(sessionId2FilterActionRDD: RDD[(String, UserVisitAction)]): RDD[(Long, Long)] = {
    val payFilterRDD = sessionId2FilterActionRDD.filter(item => item._2.pay_category_ids != null)
    val payNumRDD = payFilterRDD.flatMap {
      case (_, action) =>
        action.pay_category_ids.split(",").map(item => (item.toLong, 1L))
    }
    payNumRDD.reduceByKey(_ + _)
  }
}
排序器:
package server
/**
 * Composite sort key for ranking categories: click count first, then order
 * count, then payment count.
 *
 * @param clickCount number of clicks
 * @param orderCount number of orders
 * @param payCount   number of payments
 */
case class SortKey(clickCount:Long, orderCount:Long, payCount:Long) extends Ordered[SortKey]{
  /**
   * Compares this key with `that`.
   *
   * @return a negative value if this < that, zero if they are equal,
   *         a positive value if this > that
   */
  override def compare(that: SortKey): Int = {
    // Compare the Long fields directly. Subtracting and truncating with
    // `.toInt` gives the wrong sign (or zero) once the difference no longer
    // fits in an Int, e.g. a difference of 2^32 would compare as "equal".
    val byClick = java.lang.Long.compare(this.clickCount, that.clickCount)
    if (byClick != 0) byClick
    else {
      val byOrder = java.lang.Long.compare(this.orderCount, that.orderCount)
      if (byOrder != 0) byOrder
      else java.lang.Long.compare(this.payCount, that.payCount)
    }
  }
}