目录
本篇文章记录“用户访问session分析”模块中Top10热门品类的一个步骤:将各品类与其点击、下单、支付次数进行join。
代码
UserVisitSessionSpark.java
/** * 获取Top10的品类 * @param filteredSessionid2AggrInfoRDD * @param session2actionRDD */ private static void getTop10Category(JavaPairRDD<String, String> filteredSessionid2AggrInfoRDD, JavaPairRDD<String, Row> session2actionRDD) { JavaPairRDD<String,Row> sessionid2detailRDD = filteredSessionid2AggrInfoRDD .join(session2actionRDD) .mapToPair( new PairFunction<Tuple2<String, Tuple2<String, Row>>, String, Row>() { private static final long serialVersionUID = 1L; @Override public Tuple2<String, Row> call(Tuple2<String, Tuple2<String, Row>> tuple) throws Exception { return new Tuple2<String,Row>(tuple._1,tuple._2._2); } }); // 获取session访问过的所有品类id // 访问过:指的是,点击过、下单过、支付过的品类 JavaPairRDD<Long,Long> categoryidRDD = sessionid2detailRDD.flatMapToPair( new PairFlatMapFunction<Tuple2<String, Row>, Long, Long>() { private static final long serialVersionUID = 1L; @Override public Iterator<Tuple2<Long, Long>> call(Tuple2<String, Row> tuple) throws Exception { Row row = tuple._2; List<Tuple2<Long,Long>> list = new ArrayList<>(); Long clickCategoryId = Long.valueOf(row.getLong(6)); long maxid = 10240L; if (clickCategoryId != maxid){ list.add(new Tuple2<Long,Long>(clickCategoryId,clickCategoryId)); } String orderCategoryIds = row.getString(8); if (orderCategoryIds != null){ String[] orderCategoryIdsSplited = orderCategoryIds.split(","); for (String orderCategory: orderCategoryIdsSplited){ list.add(new Tuple2<Long,Long>(Long.valueOf(orderCategory),Long.valueOf(orderCategory))); } } String payCategoryIds = row.getString(10); if (payCategoryIds != null){ String[] payCategoryIdsSplited = payCategoryIds.split(","); for (String payCategoryId : payCategoryIdsSplited){ list.add(new Tuple2<Long,Long>(Long.valueOf(payCategoryId),Long.valueOf(payCategoryId))); } } return list.iterator(); } } ); /** * 第二步:计算各品类的点击、下单和支付的次数 */ // 访问明细中,其中三种访问行为是:点击、下单和支付 // 分别来计算各品类点击、下单和支付的次数,可以先对访问明细数据进行过滤 // 分别过滤出点击、下单和支付行为,然后通过map、reduceByKey等算子来进行计算 // 计算各个品类的点击次数 JavaPairRDD<Long, Long> clickCategoryId2CountRDD 
= getClickCategoryId2CountRDD(sessionid2detailRDD); // 计算各个品类的下单次数 JavaPairRDD<Long, Long> orderCategoryId2CountRDD = getOrderCategoryId2CountRDD(sessionid2detailRDD); // 计算各个品类的支付次数 JavaPairRDD<Long, Long> payCategoryId2CountRDD = getPayCategoryId2CountRDD(sessionid2detailRDD); /** * 第三步:join各品类与它的点击、下单和支付的次数 * * categoryidRDD中,是包含了所有的符合条件的session,访问过的品类id * * 上面分别计算出来的三份,各品类的点击、下单和支付的次数,可能不是包含所有品类的 * 比如,有的品类,就只是被点击过,但是没有人下单和支付 * * 所以,这里,就不能使用join操作,要使用leftOuterJoin操作,就是说,如果categoryidRDD不能 * join到自己的某个数据,比如点击、或下单、或支付次数,那么该categoryidRDD还是要保留下来的 * 只不过,没有join到的那个数据,就是0了 * */ JavaPairRDD<Long, String> categoryid2countRDD = joinCategoryAndData( categoryidRDD, clickCategoryId2CountRDD, orderCategoryId2CountRDD, payCategoryId2CountRDD); }
/**
 * Joins every category id with its click, order and pay counts.
 *
 * <p>Each count RDD is attached with a left outer join, so a category that is
 * missing from a count RDD is kept and its count is reported as 0. If
 * {@code categoryidRDD} contains the same id more than once, the result
 * contains one entry per occurrence.
 *
 * @param categoryidRDD category id -> category id, one entry per category to report
 * @param clickCategoryId2CountRDD category id -> click count
 * @param orderCategoryId2CountRDD category id -> order count
 * @param payCategoryId2CountRDD category id -> pay count
 * @return category id -> a '|'-separated string of {@code name=value} pairs, in the order
 *         {@code Constants.FIELD_CATEGORY_ID}, {@code Constants.FIELD_CLICK_COUNT},
 *         {@code Constants.FIELD_ORDER_COUNT}, {@code Constants.FIELD_PAY_COUNT}
 */
private static JavaPairRDD<Long, String> joinCategoryAndData(
        JavaPairRDD<Long, Long> categoryidRDD,
        JavaPairRDD<Long, Long> clickCategoryId2CountRDD,
        JavaPairRDD<Long, Long> orderCategoryId2CountRDD,
        JavaPairRDD<Long, Long> payCategoryId2CountRDD) {

    // With leftOuterJoin the right side may have no value for a key, which is why
    // the second tuple element is an Optional<Long>: it may or may not be present.
    JavaPairRDD<Long, Tuple2<Long, Optional<Long>>> clickJoinedRDD =
            categoryidRDD.leftOuterJoin(clickCategoryId2CountRDD);

    // First stage: start the record with the category id and its click count.
    JavaPairRDD<Long, String> category2InfoRDD = clickJoinedRDD.mapToPair(
            new PairFunction<Tuple2<Long, Tuple2<Long, Optional<Long>>>, Long, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<Long, String> call(
                        Tuple2<Long, Tuple2<Long, Optional<Long>>> tuple) throws Exception {
                    Long categoryId = tuple._1;
                    Optional<Long> clickOptional = tuple._2._2;
                    long clickCount = clickOptional.isPresent() ? clickOptional.get() : 0L;

                    String value = Constants.FIELD_CATEGORY_ID + "=" + categoryId + "|"
                            + Constants.FIELD_CLICK_COUNT + "=" + clickCount;
                    return new Tuple2<Long, String>(categoryId, value);
                }
            });

    // Second and third stage: append the order count, then the pay count.
    category2InfoRDD =
            appendCount(category2InfoRDD, orderCategoryId2CountRDD, Constants.FIELD_ORDER_COUNT);
    category2InfoRDD =
            appendCount(category2InfoRDD, payCategoryId2CountRDD, Constants.FIELD_PAY_COUNT);

    return category2InfoRDD;
}

/**
 * Left-outer-joins the records with one count RDD and appends
 * {@code |countField=count} to each record, using 0 when the category has no count.
 *
 * @param category2InfoRDD category id -> record string built so far
 * @param category2CountRDD category id -> count to append
 * @param countField field name written in front of the count
 * @return category id -> record string extended by the new field
 */
private static JavaPairRDD<Long, String> appendCount(
        JavaPairRDD<Long, String> category2InfoRDD,
        JavaPairRDD<Long, Long> category2CountRDD,
        final String countField) {
    return category2InfoRDD.leftOuterJoin(category2CountRDD).mapToPair(
            new PairFunction<Tuple2<Long, Tuple2<String, Optional<Long>>>, Long, String>() {
                private static final long serialVersionUID = 1L;

                @Override
                public Tuple2<Long, String> call(
                        Tuple2<Long, Tuple2<String, Optional<Long>>> tuple) throws Exception {
                    Long categoryId = tuple._1;
                    String value = tuple._2._1;
                    Optional<Long> countOptional = tuple._2._2;
                    long count = countOptional.isPresent() ? countOptional.get() : 0L;

                    return new Tuple2<Long, String>(
                            categoryId, value + "|" + countField + "=" + count);
                }
            });
}