11.SparkCore练习

SparkCore练习

2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_47_2019-07-17 00:00:54_null_14_79_null_null_null_null_2
2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_27_2019-07-17 00:00:59_null_3_50_null_null_null_null_26
2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_27_2019-07-17 00:01:05_i7_-1_-1_null_null_null_null_17
2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_24_2019-07-17 00:01:07_null_5_39_null_null_null_null_10
2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_25_2019-07-17 00:01:13_i7_-1_-1_null_null_null_null_24
2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_22_2019-07-17 00:01:21_null_19_62_null_null_null_null_20
2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_41_2019-07-17 00:01:27_null_4_58_null_null_null_null_9
2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_2_2019-07-17 00:01:33_苹果_-1_-1_null_null_null_null_21
2019-07-17_38_6502cdc9-cf95-4b08-8854-f03a25baa917_8_2019-07-17 00:01:39_笔记本_-1_-1_null_null_null_null_21

按下划线从左到右切割依次为

时间戳,用户,session_id,页面id,搜索关键字,点击品Id,点击类Id,下单类id,下单品id,支付类id,支付品id,城市id

再次基础上制作相应得bean对象

package cn.lpc.spark.bean

case class UserVisitAction(date: String,
                           user_id: Long,
                           session_id: String,
                           page_id: Long,
                           action_time: String,
                           search_keyword: String,
                           click_category_id: Long,
                           click_product_id: Long,
                           order_category_ids: String,
                           order_product_ids: String,
                           pay_category_ids: String,
                           pay_product_ids: String,
                           city_id: Long)

case class CategoryCountInfo(categoryId: String,
                             clickCount: Long,
                             orderCount: Long,
                             payCount: Long)

取top10得热门商品

// 处理逻辑得主程序
object CategoryTopApp {
    def statCategoryTo10(sc: SparkContext,UserVisitActionAdd: RDD[UserVisitAction]): Array[CategoryCountInfo] ={
        // 自定义累加器
        val acc = new accAction
        // 注册累加器
        sc.register(acc,"accAction")
        // 类加
        UserVisitActionAdd.foreach(
            UserVisitAction=>{
                acc.add(UserVisitAction)
            }
        )
        // 对累加得值进行分过滤分组等操作就能完成
        val cidToCateInfo: Map[String, Map[(String, String), Long]] = acc.value.filter(x => x._1 != ("wrongData", "wrongData")).groupBy(_._1._1)
        val categoryCountInfos: Array[CategoryCountInfo] = cidToCateInfo.map {
            case (cid, map) => {
                CategoryCountInfo(cid,
                    map.getOrElse((cid, "click"), 0),
                    map.getOrElse((cid, "order"), 0),
                    map.getOrElse((cid, "pay"), 0))
            }
        }.toArray
         categoryCountInfos.sortBy(info => (info.clickCount, info.orderCount, info.clickCount))(Ordering.Tuple3(Ordering.Long.reverse, Ordering.Long.reverse, Ordering.Long.reverse))
            .take(10)


    }
}
// 主要代码在累加器之中
class accAction extends AccumulatorV2[UserVisitAction,Map[(String,String),Long]]{
    private var map: Map[(String, String), Long] = Map[(String, String), Long]()
    override def isZero: Boolean = map.isEmpty

    override def copy(): AccumulatorV2[UserVisitAction, Map[(String, String), Long]] = {
        val acc = new accAction
        acc.map = map
        acc
    }

    override def reset(): Unit =  map =Map[(String, String), Long]()

    // 根据不同得cid和action组成一个key进行类加
    override def add(v: UserVisitAction): Unit = {
        // 有些action有多个cid
        val cidAndActions: (String, String) = ParseUtil.parserAction(v)
        val cids: Array[String] = cidAndActions._1.split(",")
        cids.foreach{
            cid =>{
                val key: (String, String) = (cid,cidAndActions._2)
                map += (key->(map.getOrElse(key,0L)+1L))
            }
        }
    }
    // 多个分区之间的元素进行合并
    override def merge(other: AccumulatorV2[UserVisitAction, Map[(String, String), Long]]): Unit = other match {
        case o:accAction =>{
            o.map.foreach {
                case (cidAction, count) =>
                    this.map += cidAction -> (this.map.getOrElse(cidAction, 0L) + count)
            }
        }
    }

    override def value: Map[(String, String), Long] = map
}
// 自定义的解析器
package cn.lpc.spark.Util

import cn.lpc.spark.bean.{CategoryCountInfo, UserVisitAction}

case class ParseUtil(){

}
object ParseUtil{
    def parseLine(line:String): UserVisitAction ={
        val splits: Array[String] = line.split("_")
        UserVisitAction(
            splits(0),
            splits(1).toLong,
            splits(2),
            splits(3).toLong,
            splits(4),
            splits(5),
            splits(6).toLong,
            splits(7).toLong,
            splits(8),
            splits(9),
            splits(10),
            splits(11),
            splits(12).toLong
        )
    }
    def parserAction(action:UserVisitAction):(String,String)={
        if(action.click_category_id != -1){
            (action.click_category_id.toString,"click")
        }else if (action.order_category_ids != "null"){
            (action.order_category_ids,"order")
        }else if (action.pay_category_ids !="null"){
            (action.pay_category_ids,"pay")
        }else{
            ("wrongData","wrongData")
        }
    }
    def getCidFormCategoryCountInfo(CategoryInfos: Array[CategoryCountInfo])={
        CategoryInfos.map(x=>x.categoryId)
    }

}

Top10热门品类中每个品类的 Top10 活跃 Session 统计

活跃session

 def statCategoryTo10(sc: SparkContext,UserVisitActionAdd: RDD[UserVisitAction]): Array[CategoryCountInfo] ={
        val acc = new accAction
        sc.register(acc,"accAction")
        UserVisitActionAdd.foreach(
            UserVisitAction=>{
                acc.add(UserVisitAction)
            }
        )
        val cidToCateInfo: Map[String, Map[(String, String), Long]] = acc.value.filter(x => x._1 != ("wrongData", "wrongData")).groupBy(_._1._1)
        val categoryCountInfos: Array[CategoryCountInfo] = cidToCateInfo.map {
            case (cid, map) => {
                CategoryCountInfo(cid,
                    map.getOrElse((cid, "click"), 0),
                    map.getOrElse((cid, "order"), 0),
                    map.getOrElse((cid, "pay"), 0))
            }
        }.toArray

         categoryCountInfos.sortBy(info => (info.clickCount, info.orderCount, info.clickCount))(Ordering.Tuple3(Ordering.Long.reverse, Ordering.Long.reverse, Ordering.Long.reverse))
            .take(10)
    }
  1. 过滤出来 category Top10的日志
  2. 需要用到需求1的结果, 然后只需要得到categoryId就可以了
  3. 转换结果为 RDD[(categoryId, sessionId), 1] 然后统计数量 => RDD[(categoryId, sessionId), count]
  4. 统计每个品类 top10. => RDD[categoryId, (sessionId, count)] => RDD[categoryId, Iterable[(sessionId, count)]]
  5. 对每个 Iterable[(sessionId, count)]进行排序, 并取每个Iterable的前10
  6. 把数据封装到 CategorySession 中

页面单跳转化率统计

// 代码总体上不难
// 做漏斗分析需要注意的就是单Session在连续时间下相邻的转换才是合法的跳转
// 需要注意的是灵活的利用zip,构造转换
object PageConversion {
    def statPageConversion(sc: SparkContext, UserVisitActionAdd: RDD[UserVisitAction],pages:String)={
        // pages ->1,2,3,4,5,6,7
        val splits: Array[String] = pages.split(",")
        // 将页面的跳转行为进行表示
        val fromPages: Array[String] = splits.slice(0, splits.length - 1)
        val toPages: Array[String] = splits.slice(1, splits.length)
        val fromToPages: Array[String] = fromPages.zip(toPages).map {
            case (fromPage, toPage) => fromPage + "->" + toPage
        }
        // 计算分母
        val UserVisitActionFiltered: RDD[UserVisitAction] = UserVisitActionAdd.filter(action => splits.contains(action.page_id.toString))
        // 将包含所需pageId的页面过滤出来
        val pageToInt: collection.Map[Long, Long] =UserVisitActionFiltered.map(
            action => (action.page_id, 1L)
        ).countByKey()
        println(pageToInt)

        // 计算分子
        // 一个session一个action集合
        val sessionWithAction: RDD[(String, Iterable[UserVisitAction])] = UserVisitActionFiltered.groupBy(_.session_id)

        val SessionToTrains: RDD[(String, List[Long])] = sessionWithAction.mapValues(t => t.toList.sortBy(_.action_time).map(sessionWithAction =>
            sessionWithAction.page_id))

        val AllConversion: RDD[String] = SessionToTrains.flatMap(
            x => {
                val fromPages: List[Long] = x._2.slice(0, x._2.size - 1)
                val toPages: List[Long] = x._2.slice(1, x._2.size)
                fromPages.zip(toPages).map {
                    case (fromPage, toPage) => fromPage + "->" + toPage
                }
            }
        )
        val conversionAndCount: collection.Map[String, Long] = AllConversion.filter(x=>fromToPages.contains(x)).map(x => (x, 1)).countByKey()
        val format = new DecimalFormat(".00%")

        val changAndRate: collection.Map[String, String] = conversionAndCount.map {
            k => (k._1,format.format( k._2.toDouble / pageToInt(k._1.split("->")(0).toLong)))
        }
        changAndRate
    }
}
  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值