目录
UserVisitSessionAnalyzeSpark.java
本篇文章记录用户访问session分析-top10活跃session之计算top10品类被各session点击的次数。
代码
UserVisitSessionAnalyzeSpark.java
/** * 获取top10活跃session * @param sc * @param taskId * @param top10CategoryList * @param session2detailRDD */ private static void getTop10Session(JavaSparkContext sc, long taskId, List<Tuple2<CategorySortKey, String>> top10CategoryList, JavaPairRDD<String, Row> session2detailRDD) { List<Tuple2<Long,Long>> top10CategoryIdList = new ArrayList<Tuple2<Long, Long>>(); for (Tuple2<CategorySortKey,String> category: top10CategoryList){ long categoryId = Long.valueOf(StringUtils.getFieldFromConcatString(category._2,"\\|",Constants.FIELD_CATEGORY_ID)); top10CategoryIdList.add(new Tuple2<Long,Long>(categoryId,categoryId)); } JavaPairRDD<Long,Long> top10CategoryIdRDD = sc.parallelizePairs(top10CategoryIdList); /** * 第二步:计算top10热门品类被各session点击的次数 */ JavaPairRDD<String,Iterable<Row>> sessionid2detailsRDD = session2detailRDD.groupByKey(); JavaPairRDD<Long,String> categoryid2sessionCountRDD = sessionid2detailsRDD.flatMapToPair( new PairFlatMapFunction<Tuple2<String, Iterable<Row>>, Long, String>() { private static final long serialVersionUID = 1L; @Override public Iterator<Tuple2<Long, String>> call(Tuple2<String, Iterable<Row>> tuple) throws Exception { String sessionid = tuple._1; Iterator<Row> iterator = tuple._2.iterator(); Map<Long,Long> categoryCountMap = new HashMap<Long,Long>(); while (iterator.hasNext()){ Row row = iterator.next(); if (row.getLong(6) != Long.MAX_VALUE){ long categoryid = row.getLong(6); Long count = categoryCountMap.get(categoryid); if (count == null){ count = 0L; } count++; categoryCountMap.put(categoryid,count); } } //返回结果,<categoryid,sessionid,count>格式 List<Tuple2<Long,String>> list = new ArrayList<Tuple2<Long, String>>(); for (Map.Entry<Long,Long> categoryCountEntry: categoryCountMap.entrySet()){ long categoryid = categoryCountEntry.getKey(); long count = categoryCountEntry.getValue(); String value = sessionid +"," + count; list.add(new Tuple2<Long,String>(categoryid,value)); } return list.iterator(); } } ); //获取到top10热门品类,被各个session点击的次数 
JavaPairRDD<Long,String> top10CategorySessionCountRDD = top10CategoryIdRDD .join(categoryid2sessionCountRDD) .mapToPair(new PairFunction<Tuple2<Long, Tuple2<Long, String>>, Long, String>() { private static final long serialVersionUID = 1L; @Override public Tuple2<Long, String> call(Tuple2<Long, Tuple2<Long, String>> tuple) throws Exception { return new Tuple2<Long,String>(tuple._1,tuple._2._2); } }); }