目录
SessionRandomExtractDAOImpl.java
UserVisitSessionAnlyzeSpark.java
本篇文章记录“用户访问session分析”模块中session随机抽取的最后一步:根据预先计算出的随机索引,抽取对应的session。
随机抽取session插入表代码
dao
ISessionRandomExtractDAO.java
package graduation.java.dao;

import graduation.java.domain.SessionRandomExtract;

/**
 * DAO contract for storing randomly extracted sessions.
 *
 * <p>FileName: ISessionRandomExtractDAO<br>
 * Author: hadoop<br>
 * Email: 3165845957@qq.com<br>
 * Date: 19-3-21 19:59
 */
public interface ISessionRandomExtractDAO {

    /**
     * Inserts one randomly extracted session.
     *
     * @param sessionRandomExtract the extracted session record to insert
     */
    void insert(SessionRandomExtract sessionRandomExtract);
}
domain
SessionRandomExtract.java
package graduation.java.domain; /** * FileName: SessionRandomExtract * Author: hadoop * Email: 3165845957@qq.com * Date: 19-3-21 下午7:55 * Description: * 随机抽取session表实体类 */ public class SessionRandomExtract { private long taskid; private String sessionid; private String startTime; private String serachKeyWords; private String clickCategoryIds; public long getTaskid() { return taskid; } public void setTaskid(long taskid) { this.taskid = taskid; } public String getSessionid() { return sessionid; } public void setSessionid(String sessionid) { this.sessionid = sessionid; } public String getStartTime() { return startTime; } public void setStartTime(String startTime) { this.startTime = startTime; } public String getSerachKeyWords() { return serachKeyWords; } public void setSerachKeyWords(String serachKeyWords) { this.serachKeyWords = serachKeyWords; } public String getClickCategoryIds() { return clickCategoryIds; } public void setClickCategoryIds(String clickCategoryIds) { this.clickCategoryIds = clickCategoryIds; } @Override public String toString() { return "SessionRandomExtract{" + "taskid=" + taskid + ", sessionid='" + sessionid + '\'' + ", startTime='" + startTime + '\'' + ", serachKeyWords='" + serachKeyWords + '\'' + ", clickCategoryIds='" + clickCategoryIds + '\'' + '}'; } }
impl
SessionRandomExtractDAOImpl.java
package graduation.java.impl; import graduation.java.dao.ISessionAggrStatDAO; import graduation.java.dao.ISessionRandomExtractDAO; import graduation.java.domain.SessionRandomExtract; import graduation.java.jdbc.JDBCHelper; /** * FileName: SessionRandomExtractImpl * Author: hadoop * Email: 3165845957@qq.com * Date: 19-3-21 下午8:02 * Description: *随机抽取session出入mysql */ public class SessionRandomExtractDAOImpl implements ISessionRandomExtractDAO { /** * 随机抽取session插入mysql数据库方法实现 * @param sessionRandomExtract */ public void insert(SessionRandomExtract sessionRandomExtract){ String sql = "insert into session_random_extract values(?,?,?,?,?)"; Object[] param = new Object[]{ sessionRandomExtract.getTaskid(), sessionRandomExtract.getSessionid(), sessionRandomExtract.getStartTime(), sessionRandomExtract.getSerachKeyWords(), sessionRandomExtract.getClickCategoryIds() }; JDBCHelper jdbcHelper = JDBCHelper.getInstance(); jdbcHelper.executeUpdate(sql,param); } }
test
SessionRandomExtractTest.java
package graduation.java.test;

import graduation.java.domain.SessionRandomExtract;
import graduation.java.jdbc.JDBCHelper;

/**
 * Manual test that inserts one row into the random-extract session table.
 *
 * <p>FileName: SessionRandomExtractTest<br>
 * Author: hadoop<br>
 * Email: 3165845957@qq.com<br>
 * Date: 19-3-21 20:45
 */
public class SessionRandomExtractTest {

    /**
     * Builds a sample {@link SessionRandomExtract}, then inserts its five
     * fields into {@code session_random_extract} through {@link JDBCHelper}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String sql ="insert into session_random_extract values(?,?,?,?,?)";

        // Populate the entity so that the inserted values are read back
        // through its getters (previously the entity was created but unused).
        SessionRandomExtract sessionRandomExtract = new SessionRandomExtract();
        sessionRandomExtract.setTaskid(1);
        sessionRandomExtract.setSessionid("123");
        sessionRandomExtract.setStartTime("2019-03-05");
        sessionRandomExtract.setSerachKeyWords("milk");
        sessionRandomExtract.setClickCategoryIds("1,34,5");

        // Positional bind parameters, in the same order as the "?" marks.
        Object[] param = new Object[]{
                sessionRandomExtract.getTaskid(),
                sessionRandomExtract.getSessionid(),
                sessionRandomExtract.getStartTime(),
                sessionRandomExtract.getSerachKeyWords(),
                sessionRandomExtract.getClickCategoryIds()
        };

        JDBCHelper jdbcHelper = JDBCHelper.getInstance();
        jdbcHelper.executeUpdate(sql, param);
    }
}
测试结果
根据随机索引进行抽取
UserVisitSessionAnlyzeSpark.java
/** * 随机抽取session * @param sessionid2AggrInfoRDD */ private static void randomExtractSession( final long taskid, JavaPairRDD<String, String> sessionid2AggrInfoRDD) { // 第一步,计算出每天每小时的session数量,获取<yyyy-MM-dd_HH,sessionid>格式的RDD JavaPairRDD<String, String> time2sessionidRDD = sessionid2AggrInfoRDD.mapToPair( new PairFunction<Tuple2<String,String>, String, String>() { private static final long serialVersionUID = 1L; @Override public Tuple2<String, String> call( Tuple2<String, String> tuple) throws Exception { String aggrInfo = tuple._2; String startTime = StringUtils.getFieldFromConcatString( aggrInfo, "\\|", Constants.FIELD_START_TIME); String dateHour = DateUtils.getDateHour(startTime); return new Tuple2<String, String>(dateHour, aggrInfo); } }); /** * 思考一下:这里我们不要着急写大量的代码,做项目的时候,一定要用脑子多思考 * * 每天每小时的session数量,然后计算出每天每小时的session抽取索引,遍历每天每小时session * 首先抽取出的session的聚合数据,写入session_random_extract表 * 所以第一个RDD的value,应该是session聚合数据 * */ // 得到每天每小时的session数量 Map<String, Long> countMap = time2sessionidRDD.countByKey(); //第二步,使用按时间比例随机抽取算法,计算出每天每小时需要抽取session的索引 //将<yyyy-MM-dd_HH,count>格式的map,转换为<yyyy-MM-dd,<HH,count>> Map<String,Map<String,Long>> dateHourCountMap = new HashMap<String,Map<String,Long>>(); for (Map.Entry<String, Long> countEntry : countMap.entrySet()){ String dateHour = countEntry.getKey(); String date = dateHour.split("_")[0]; String hour = dateHour.split("_")[1]; long count = countEntry.getValue(); Map<String,Long> hourCountMap = dateHourCountMap.get(date); if (hourCountMap ==null){ hourCountMap = new HashMap<String,Long>(); dateHourCountMap.put(date,hourCountMap); } dateHourCountMap.put(date,hourCountMap); } //开始实现按时间比例随机抽取算法 //总共要抽取100个session,按照天数,进行平分 int extractNumberPerDay = 100 /dateHourCountMap.size(); //<date,<hour,(1,3,4,2103)>> Map<String,Map<String, List<Integer>>> dateHourExtractMap = new HashMap<String,Map<String,List<Integer>>>(); Random random = new Random(); for (Map.Entry<String,Map<String,Long>> dateHourCountEntry : 
dateHourCountMap.entrySet()){ String date = dateHourCountEntry.getKey(); Map<String,Long> hourCountMap = dateHourCountEntry.getValue(); //计算出每天的session总数 long sessionCount = 0L; for (long hourCount : hourCountMap.values()){ sessionCount += hourCount; } Map<String,List<Integer>> hourExtractMap = dateHourExtractMap.get(date); if (hourExtractMap == null){ hourExtractMap = new HashMap<String,List<Integer>>(); dateHourExtractMap.put(date,hourExtractMap); } //遍历每一个小时 for (Map.Entry<String,Long> hourCountEntry : hourCountMap.entrySet()){ String hour = hourCountEntry.getKey(); long count = hourCountEntry.getValue(); // 计算每个小时的session数量,占据当天总session数量的比例,直接乘以每天要抽取的数量 // 就可以计算出,当前小时需要抽取的session数量 int hourExtractNumber = (int)((double)count/(double) sessionCount)*extractNumberPerDay; if (hourExtractNumber > count){ hourExtractNumber = (int)count; } //先获取当前小时的存放随机数的list List<Integer> extractIndexList = hourExtractMap.get(hour); if (extractIndexList == null){ extractIndexList = new ArrayList<Integer>(); hourExtractMap.put(hour,extractIndexList); } //生成上面计算出来的数量的随机数 for (int i = 0; i < hourExtractNumber;i++){ int extractIndex = random.nextInt((int)count); while (extractIndexList.contains(extractIndex)){ extractIndex = random.nextInt((int)count); } extractIndexList.add(extractIndex); } } } /** * 第三步:遍历每天每小时的session,然后根据随机索引进行抽取 */ // 执行groupByKey算子,得到<dateHour,(session aggrInfo)> JavaPairRDD<String,Iterable<String>> time2sessionsRDD = time2sessionidRDD.groupByKey(); // 我们用flatMap算子,遍历所有的<dateHour,(session aggrInfo)>格式的数据 // 然后呢,会遍历每天每小时的session // 如果发现某个session恰巧在我们指定的这天这小时的随机抽取索引上 // 那么抽取该session,直接写入MySQL的random_extract_session表 // 将抽取出来的session id返回回来,形成一个新的JavaRDD<String> // 然后最后一步,是用抽取出来的sessionid,去join它们的访问行为明细数据,写入session表 JavaPairRDD<String, String> extractSessionidsRDD = time2sessionsRDD.flatMapToPair( new PairFlatMapFunction<Tuple2<String,Iterable<String>>, String, String>() { private static final long serialVersionUID = 1L; @Override public Iterator<Tuple2<String, 
String>> call( Tuple2<String, Iterable<String>> tuple) throws Exception { List<Tuple2<String, String>> extractSessionids = new ArrayList<Tuple2<String, String>>(); String dateHour = tuple._1; String date = dateHour.split("_")[0]; String hour = dateHour.split("_")[1]; Iterator<String> iterator = tuple._2.iterator(); List<Integer> extractIndexList = dateHourExtractMap.get(date).get(hour); ISessionRandomExtractDAO sessionRandomExtractDAO = DAOFactory.getSessionRandomExtractDAO(); int index = 0; while(iterator.hasNext()) { String sessionAggrInfo = iterator.next(); if(extractIndexList.contains(index)) { String sessionid = StringUtils.getFieldFromConcatString( sessionAggrInfo, "\\|", Constants.FIELD_SESSION_ID); // 将数据写入MySQL SessionRandomExtract sessionRandomExtract = new SessionRandomExtract(); sessionRandomExtract.setTaskid(taskid); sessionRandomExtract.setSessionid(sessionid); sessionRandomExtract.setStartTime(StringUtils.getFieldFromConcatString( sessionAggrInfo, "\\|", Constants.FIELD_START_TIME)); sessionRandomExtract.setSerachKeyWords(StringUtils.getFieldFromConcatString( sessionAggrInfo, "\\|", Constants.FIELD_SEARCH_KEYWORDS)); sessionRandomExtract.setClickCategoryIds(StringUtils.getFieldFromConcatString( sessionAggrInfo, "\\|", Constants.FIELD_CLICK_CATEGORY_IDS)); sessionRandomExtractDAO.insert(sessionRandomExtract); // 将sessionid加入list extractSessionids.add(new Tuple2<String, String>(sessionid, sessionid)); } index++; } return (Iterator<Tuple2<String, String>>) extractSessionids; } }); }