目录
本篇文章将介绍用户访问session分析-top10活跃session之开发准备以及top10品类RDD生成。
需求回顾
top10活跃session
top10热门品类,获取每个品类点击次数最多的10个session,以及其对应的访问明细
实现思路分析
1、拿到符合筛选条件的session的明细数据;
2、按照session粒度进行聚合,获取到session对每个品类的点击次数,用flatMap,算子函数返回的是<categoryid,(sessionid,clickCount)>;
3、按照品类id进行groupByKey分组,自己编写算法,取出每个品类下点击次数最多的前10个session(即top10活跃session),直接写入MySQL表;算子返回的是sessionid;
4、获取各品类top10活跃session的访问明细数据,写入MySQL;
5、本地测试
本篇文章内容
1、重构一下之前的代码,将通过筛选条件的session的访问明细数据RDD,提取成公共的RDD;这样就不用重复计算同样的RDD;
2、将之前计算出来的top10热门品类的id,生成一个PairRDD,方便后面进行join。
重构的代码
/**
 * Builds the access-detail RDD for the sessions that passed the filter.
 *
 * <p>The two inputs are inner-joined on session id; from each joined record only the
 * session id and the action row are kept, the aggregated-info string is discarded.
 *
 * @param sessionid2aggrInfoRDD (session id, aggregated info) pairs; only its keys matter here
 * @param sessionid2action (session id, action row) pairs
 * @return (session id, action row) pairs for every session id present in both inputs
 */
private static JavaPairRDD<String,Row> getSessionid2detailRDD(
        JavaPairRDD<String,String> sessionid2aggrInfoRDD,
        JavaPairRDD<String,Row> sessionid2action) {

    // Inner join: session ids missing from either side are dropped.
    JavaPairRDD<String, Tuple2<String, Row>> joinedRDD =
            sessionid2aggrInfoRDD.join(sessionid2action);

    PairFunction<Tuple2<String, Tuple2<String, Row>>, String, Row> keepDetailRow =
            new PairFunction<Tuple2<String, Tuple2<String, Row>>, String, Row>() {

                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<String, Row> call(Tuple2<String, Tuple2<String, Row>> joined)
                        throws Exception {
                    String sessionid = joined._1;
                    // joined._2 is (aggregated info, action row); keep the row only.
                    Row detailRow = joined._2._2;
                    return new Tuple2<String,Row>(sessionid, detailRow);
                }
            };

    return joinedRDD.mapToPair(keepDetailRow);
}
/** * 获取Top10的品类 * @param taskid * @param sessionid2detailRDD */ private static List<Tuple2<CategorySortKey,String>> getTop10Category(long taskid, JavaPairRDD<String, Row> sessionid2detailRDD) { // 获取session访问过的所有品类id // 访问过:指的是,点击过、下单过、支付过的品类 JavaPairRDD<Long,Long> categoryidRDD = sessionid2detailRDD.flatMapToPair( new PairFlatMapFunction<Tuple2<String, Row>, Long, Long>() { private static final long serialVersionUID = 1L; @Override public Iterator<Tuple2<Long, Long>> call(Tuple2<String, Row> tuple) throws Exception { Row row = tuple._2; List<Tuple2<Long,Long>> list = new ArrayList<>(); Long clickCategoryId = Long.valueOf(row.getLong(6)); long maxid = 10240L; if (clickCategoryId != maxid){ list.add(new Tuple2<Long,Long>(clickCategoryId,clickCategoryId)); } String orderCategoryIds = row.getString(8); if (orderCategoryIds != null){ String[] orderCategoryIdsSplited = orderCategoryIds.split(","); for (String orderCategory: orderCategoryIdsSplited){ list.add(new Tuple2<Long,Long>(Long.valueOf(orderCategory),Long.valueOf(orderCategory))); } } String payCategoryIds = row.getString(10); if (payCategoryIds != null){ String[] payCategoryIdsSplited = payCategoryIds.split(","); for (String payCategoryId : payCategoryIdsSplited){ list.add(new Tuple2<Long,Long>(Long.valueOf(payCategoryId),Long.valueOf(payCategoryId))); } } return list.iterator(); } }); /** * 必须进行去重 * 如果不去重,会出现重复的categoryid,排序会对重复的Categoryid的categoryInfo进行排序 * 最后很可能拿到重复的数据 */ categoryidRDD = categoryidRDD.distinct(); /** * 第二步:计算各品类的点击、下单和支付的次数 */ // 访问明细中,其中三种访问行为是:点击、下单和支付 // 分别来计算各品类点击、下单和支付的次数,可以先对访问明细数据进行过滤 // 分别过滤出点击、下单和支付行为,然后通过map、reduceByKey等算子来进行计算 // 计算各个品类的点击次数 JavaPairRDD<Long, Long> clickCategoryId2CountRDD = getClickCategoryId2CountRDD(sessionid2detailRDD); // 计算各个品类的下单次数 JavaPairRDD<Long, Long> orderCategoryId2CountRDD = getOrderCategoryId2CountRDD(sessionid2detailRDD); // 计算各个品类的支付次数 JavaPairRDD<Long, Long> payCategoryId2CountRDD = getPayCategoryId2CountRDD(sessionid2detailRDD); /** * 
第三步:join各品类与它的点击、下单和支付的次数 * * categoryidRDD中,是包含了所有的符合条件的session,访问过的品类id * * 上面分别计算出来的三份,各品类的点击、下单和支付的次数,可能不是包含所有品类的 * 比如,有的品类,就只是被点击过,但是没有人下单和支付 * * 所以,这里,就不能使用join操作,要使用leftOuterJoin操作,就是说,如果categoryidRDD不能 * join到自己的某个数据,比如点击、或下单、或支付次数,那么该categoryidRDD还是要保留下来的 * 只不过,没有join到的那个数据,就是0了 * */ JavaPairRDD<Long, String> categoryid2countRDD = joinCategoryAndData( categoryidRDD, clickCategoryId2CountRDD, orderCategoryId2CountRDD, payCategoryId2CountRDD); /** * 第四步:自定义二次排序key */ /** * 第五步:将数据映射成<CategorySortKey,info>格式的RDD,然后进行二次排序(降序) */ JavaPairRDD<CategorySortKey,String> sortKey2countRDD = categoryid2countRDD.mapToPair( new PairFunction<Tuple2<Long, String>, CategorySortKey, String>() { private static final long serialVersionUID = 1L; @Override public Tuple2<CategorySortKey, String> call(Tuple2<Long, String> tuple) throws Exception { String countInfo = tuple._2; String click = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_CLICK_COUNT); if (click == null){ click = "0"; } long clickCount = Long.valueOf(click); String order = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_ORDER_COUNT); if (order == null){ order = "0"; } long orderCount = Long.valueOf(order); String pay = StringUtils.getFieldFromConcatString(countInfo,"\\|",Constants.FIELD_PAY_COUNT); if (pay == null){ pay = "0"; } long payCount = Long.valueOf(pay); CategorySortKey categorySortKey = new CategorySortKey(clickCount,orderCount,payCount); return new Tuple2<CategorySortKey,String>(categorySortKey,countInfo); } }); JavaPairRDD<CategorySortKey,String> sortedCategoryCountRDD = sortKey2countRDD.sortByKey(false); /** * 第六步:用take(10)取出top10热门品类,并写入MySQL */ ITop10CategoryDAO top10CategoryDAO = DAOFactory.getTop10CategoryDAO(); List<Tuple2<CategorySortKey,String>> top10CategoryList = sortedCategoryCountRDD.take(10); for (Tuple2<CategorySortKey,String> tuple : top10CategoryList){ String countInfo = tuple._2; long categoryid = 
Long.valueOf(StringUtils.getFieldFromConcatString(countInfo,"\\|", Constants.FIELD_CATEGORY_ID)); long clickCount = Long.valueOf(StringUtils.getFieldFromConcatString(countInfo,"\\|", Constants.FIELD_CLICK_COUNT)); long orderCount = Long.valueOf(StringUtils.getFieldFromConcatString(countInfo,"\\|", Constants.FIELD_ORDER_COUNT)); long payCount = Long.valueOf(StringUtils.getFieldFromConcatString(countInfo,"\\|", Constants.FIELD_PAY_COUNT)); Top10Category top10Category = new Top10Category(); top10Category.setTaskid(taskid); top10Category.setCategoryid(categoryid); top10Category.setClickCount(clickCount); top10Category.setOrderCount(orderCount); top10Category.setPayCount(payCount); top10CategoryDAO.insert(top10Category); } return top10CategoryList; }
/**
 * Computes the top-10 active sessions.
 *
 * <p>Only the first step is implemented in this method so far: the ids of the top-10
 * categories are turned into a pair RDD of (category id, category id).
 *
 * @param sc Spark context used to parallelize the category ids
 * @param taskid id of the current task (not used by the step implemented here)
 * @param top10CategoryList (sort key, count info) tuples of the top-10 categories
 * @param sessionid2detailRDD (session id, action row) pairs (not used by the step
 *        implemented here)
 */
private static void getTop10Session(
        JavaSparkContext sc,
        final long taskid,
        List<Tuple2<CategorySortKey, String>> top10CategoryList,
        JavaPairRDD<String, Row> sessionid2detailRDD) {

    // Step 1: build an RDD from the ids of the top-10 categories.
    List<Tuple2<Long, Long>> top10CategoryIdList = new ArrayList<>();

    for (Tuple2<CategorySortKey, String> entry : top10CategoryList) {
        String countInfo = entry._2;
        String idField = StringUtils.getFieldFromConcatString(
                countInfo, "\\|", Constants.FIELD_CATEGORY_ID);
        Long id = Long.valueOf(idField);
        // Key and value are both the category id.
        top10CategoryIdList.add(new Tuple2<Long, Long>(id, id));
    }

    JavaPairRDD<Long, Long> top10CategoryIdRDD =
            sc.parallelizePairs(top10CategoryIdList);
}