目录
本文为《Spark大型电商项目实战》 系列文章之一,主要介绍计算各session范围占比,并写入MySQL的实现步骤。
创建MySQL表
使用SQLyog工具在MySQL中创建数据库sparkproject
,在这个数据库中创建session_aggr_stat
表,表结构如下
将idx_task_id
设置为索引
编码实现
计算各session的访问时长和访问步长所占比例
在UserVisitSessionAnalyzeSpark.java
中实现计算各session范围占比,并写入MySQL的功能,首先实现计算各个访问时长和访问步长所占比例功能
/*
* 计算各session范围占比,并写入MySQL
*/
private static void calculateAndPersistAggrStat(String value, long taskid) {
//从Accumulator统计串中获取值
long session_count = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.SESSION_COUNT));
long visit_length_1s_3s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_1s_3s));
long visit_length_4s_6s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_4s_6s));
long visit_length_7s_9s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_7s_9s));
long visit_length_10s_30s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_10s_30s));
long visit_length_30s_60s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_30s_60s));
long visit_length_1m_3m = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_1m_3m));
long visit_length_3m_10m = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_3m_10m));
long visit_length_10m_30m = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_10m_30m));
long visit_length_30m = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_30m));
long step_length_1_3 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_1_3));
long step_length_4_6 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_4_6));
long step_length_7_9 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_7_9));
long step_length_10_30 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_10_30));
long step_length_30_60 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_30_60));
long step_length_60 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_60));
//计算各个访问时长和访问步长的范围
double visit_length_1s_3s_ratio = NumberUtils.formatDouble(
(double)visit_length_1s_3s / (double)session_count, 2);
double visit_length_4s_6s_ratio = NumberUtils.formatDouble(
(double)visit_length_4s_6s / (double)session_count, 2);
double visit_length_7s_9s_ratio = NumberUtils.formatDouble(
(double)visit_length_7s_9s / (double)session_count, 2);
double visit_length_10s_30s_ratio = NumberUtils.formatDouble(
(double)visit_length_10s_30s / (double)session_count, 2);
double visit_length_30s_60s_ratio = NumberUtils.formatDouble(
(double)visit_length_30s_60s / (double)session_count, 2);
double visit_length_1m_3m_ratio = NumberUtils.formatDouble(
(double)visit_length_1m_3m / (double)session_count, 2);
double visit_length_3m_10m_ratio = NumberUtils.formatDouble(
(double)visit_length_3m_10m / (double)session_count, 2);
double visit_length_10m_30m_ratio = NumberUtils.formatDouble(
(double)visit_length_10m_30m / (double)session_count, 2);
double visit_length_30m_ratio = NumberUtils.formatDouble(
(double)visit_length_30m / (double)session_count, 2);
double step_length_1_3_ratio = NumberUtils.formatDouble(
(double)step_length_1_3 / (double)session_count, 2);
double step_length_4_6_ratio = NumberUtils.formatDouble(
(double)step_length_4_6 / (double)session_count, 2);
double step_length_7_9_ratio = NumberUtils.formatDouble(
(double)step_length_7_9 / (double)session_count, 2);
double step_length_10_30_ratio = NumberUtils.formatDouble(
(double)step_length_10_30 / (double)session_count, 2);
double step_length_30_60_ratio = NumberUtils.formatDouble(
(double)step_length_30_60 / (double)session_count, 2);
double step_length_60_ratio = NumberUtils.formatDouble(
(double)step_length_60 / (double)session_count, 2);
封装Domain
在包com.erik.sparkproject.domain
中新建类SessionAggrStat.java
package com.erik.sparkproject.domain;
/**
* session聚合统计
* @author Erik
*
*/
public class SessionAggrStat {
private long taskid;
private long session_count;
private double visit_length_1s_3s_ratio;
private double visit_length_4s_6s_ratio;
private double visit_length_7s_9s_ratio;
private double visit_length_10s_30s_ratio;
private double visit_length_30s_60s_ratio;
private double visit_length_1m_3m_ratio;
private double visit_length_3m_10m_ratio;
private double visit_length_10m_30m_ratio;
private double visit_length_30m_ratio;
private double step_length_1_3_ratio;
private double step_length_4_6_ratio;
private double step_length_7_9_ratio;
private double step_length_10_30_ratio;
private double step_length_30_60_ratio;
private double step_length_60_ratio;
public long getTaskid() {
return taskid;
}
public void setTaskid(long taskid) {
this.taskid = taskid;
}
public long getSession_count() {
return session_count;
}
public void setSession_count(long session_count) {
this.session_count = session_count;
}
public double getVisit_length_1s_3s_ratio() {
return visit_length_1s_3s_ratio;
}
public void setVisit_length_1s_3s_ratio(double visit_length_1s_3s_ratio) {
this.visit_length_1s_3s_ratio = visit_length_1s_3s_ratio;
}
public double getVisit_length_4s_6s_ratio() {
return visit_length_4s_6s_ratio;
}
public void setVisit_length_4s_6s_ratio(double visit_length_4s_6s_ratio) {
this.visit_length_4s_6s_ratio = visit_length_4s_6s_ratio;
}
public double getVisit_length_7s_9s_ratio() {
return visit_length_7s_9s_ratio;
}
public void setVisit_length_7s_9s_ratio(double visit_length_7s_9s_ratio) {
this.visit_length_7s_9s_ratio = visit_length_7s_9s_ratio;
}
public double getVisit_length_10s_30s_ratio() {
return visit_length_10s_30s_ratio;
}
public void setVisit_length_10s_30s_ratio(double visit_length_10s_30s_ratio) {
this.visit_length_10s_30s_ratio = visit_length_10s_30s_ratio;
}
public double getVisit_length_30s_60s_ratio() {
return visit_length_30s_60s_ratio;
}
public void setVisit_length_30s_60s_ratio(double visit_length_30s_60s_ratio) {
this.visit_length_30s_60s_ratio = visit_length_30s_60s_ratio;
}
public double getVisit_length_1m_3m_ratio() {
return visit_length_1m_3m_ratio;
}
public void setVisit_length_1m_3m_ratio(double visit_length_1m_3m_ratio) {
this.visit_length_1m_3m_ratio = visit_length_1m_3m_ratio;
}
public double getVisit_length_3m_10m_ratio() {
return visit_length_3m_10m_ratio;
}
public void setVisit_length_3m_10m_ratio(double visit_length_3m_10m_ratio) {
this.visit_length_3m_10m_ratio = visit_length_3m_10m_ratio;
}
public double getVisit_length_10m_30m_ratio() {
return visit_length_10m_30m_ratio;
}
public void setVisit_length_10m_30m_ratio(double visit_length_10m_30m_ratio) {
this.visit_length_10m_30m_ratio = visit_length_10m_30m_ratio;
}
public double getVisit_length_30m_ratio() {
return visit_length_30m_ratio;
}
public void setVisit_length_30m_ratio(double visit_length_60m_ratio) {
this.visit_length_30m_ratio = visit_length_60m_ratio;
}
public double getStep_length_1_3_ratio() {
return step_length_1_3_ratio;
}
public void setStep_length_1_3_ratio(double step_length_1_3_ratio) {
this.step_length_1_3_ratio = step_length_1_3_ratio;
}
public double getStep_length_4_6_ratio() {
return step_length_4_6_ratio;
}
public void setStep_length_4_6_ratio(double step_length_4_6_ratio) {
this.step_length_4_6_ratio = step_length_4_6_ratio;
}
public double getStep_length_7_9_ratio() {
return step_length_7_9_ratio;
}
public void setStep_length_7_9_ratio(double step_length_7_9_ratio) {
this.step_length_7_9_ratio = step_length_7_9_ratio;
}
public double getStep_length_10_30_ratio() {
return step_length_10_30_ratio;
}
public void setStep_length_10_30_ratio(double step_length_10_30_ratio) {
this.step_length_10_30_ratio = step_length_10_30_ratio;
}
public double getStep_length_30_60_ratio() {
return step_length_30_60_ratio;
}
public void setStep_length_30_60_ratio(double step_length_30_60_ratio) {
this.step_length_30_60_ratio = step_length_30_60_ratio;
}
public double getStep_length_60_ratio() {
return step_length_60_ratio;
}
public void setStep_length_60_ratio(double step_length_60_ratio) {
this.step_length_60_ratio = step_length_60_ratio;
}
}
封装DAO
在包com.erik.sparkproject.dao
里新建接口ISessionAggrStatDAO.java
package com.erik.sparkproject.dao;
import com.erik.sparkproject.domain.SessionAggrStat;
/**
* session聚合统计模块DAO接口
* @author Erik
*
*/
public interface ISessionAggrStatDAO {
void insert (SessionAggrStat sessionAggrStat);
}
添加DAOFactory
在包com.erik.sparkproject.dao.factory
中的DAOFactory.java
类中添加获取session聚合统计DAO方法
public static ISessionAggrStatDAO getSessionAggrStatDAO() {
return new SessionAggrStatDAOImpl();
}
创建Impl
在包com.erik.sparkproject.impl
中新建session聚合统计实现类SessionAggrStatDAOImpl.java
package com.erik.sparkproject.impl;
import com.erik.sparkproject.dao.ISessionAggrStatDAO;
import com.erik.sparkproject.domain.SessionAggrStat;
import com.erik.sparkproject.jdbc.JDBCHelper;
/**
* session聚合统计实现类
* @author Erik
*
*/
public class SessionAggrStatDAOImpl implements ISessionAggrStatDAO {
public void insert(SessionAggrStat sessionAggrStat) {
String sql = "insert into session_aggr_stat values(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)";
Object[] params = new Object[] {sessionAggrStat.getTaskid(),
sessionAggrStat.getSession_count(),
sessionAggrStat.getVisit_length_1s_3s_ratio(),
sessionAggrStat.getVisit_length_4s_6s_ratio(),
sessionAggrStat.getVisit_length_7s_9s_ratio(),
sessionAggrStat.getVisit_length_10s_30s_ratio(),
sessionAggrStat.getVisit_length_30s_60s_ratio(),
sessionAggrStat.getVisit_length_1m_3m_ratio(),
sessionAggrStat.getVisit_length_3m_10m_ratio(),
sessionAggrStat.getVisit_length_10m_30m_ratio(),
sessionAggrStat.getVisit_length_30m_ratio(),
sessionAggrStat.getStep_length_1_3_ratio(),
sessionAggrStat.getStep_length_4_6_ratio(),
sessionAggrStat.getStep_length_7_9_ratio(),
sessionAggrStat.getStep_length_10_30_ratio(),
sessionAggrStat.getStep_length_30_60_ratio(),
sessionAggrStat.getStep_length_60_ratio(),};
JDBCHelper jdbcHelper = JDBCHelper.getInstance();
jdbcHelper.executeUpdate(sql, params);
}
}
将统计数据插入MySQL
在UserVisitSessionAnalyzeSpark.java
中将统计结果封装为Domain对象,并调用对应的DAO插入MySQL统计结果
//将访问结果封装成Domain对象
SessionAggrStat sessionAggrStat = new SessionAggrStat();
sessionAggrStat.setTaskid(taskid);
sessionAggrStat.setSession_count(session_count);
sessionAggrStat.setVisit_length_1s_3s_ratio(visit_length_1s_3s_ratio);
sessionAggrStat.setVisit_length_4s_6s_ratio(visit_length_4s_6s_ratio);
sessionAggrStat.setVisit_length_7s_9s_ratio(visit_length_7s_9s_ratio);
sessionAggrStat.setVisit_length_10s_30s_ratio(visit_length_10s_30s_ratio);
sessionAggrStat.setVisit_length_30s_60s_ratio(visit_length_30s_60s_ratio);
sessionAggrStat.setVisit_length_1m_3m_ratio(visit_length_1m_3m_ratio);
sessionAggrStat.setVisit_length_3m_10m_ratio(visit_length_3m_10m_ratio);
sessionAggrStat.setVisit_length_10m_30m_ratio(visit_length_10m_30m_ratio);
sessionAggrStat.setVisit_length_30m_ratio(visit_length_30m_ratio);
sessionAggrStat.setStep_length_1_3_ratio(step_length_1_3_ratio);
sessionAggrStat.setStep_length_4_6_ratio(step_length_4_6_ratio);
sessionAggrStat.setStep_length_7_9_ratio(step_length_7_9_ratio);
sessionAggrStat.setStep_length_10_30_ratio(step_length_10_30_ratio);
sessionAggrStat.setStep_length_30_60_ratio(step_length_30_60_ratio);
sessionAggrStat.setStep_length_60_ratio(step_length_60_ratio);
//调用对应的DAO插入统计结果
ISessionAggrStatDAO sessionAggrStatDAO = DAOFactory.getSessionAggrStatDAO();
sessionAggrStatDAO.insert(sessionAggrStat);
完整代码
package com.erik.sparkproject.spark;
import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import com.alibaba.fastjson.JSONObject;
import com.erik.sparkproject.conf.ConfigurationManager;
import com.erik.sparkproject.constant.Constants;
import com.erik.sparkproject.dao.ISessionAggrStatDAO;
import com.erik.sparkproject.dao.ITaskDAO;
import com.erik.sparkproject.domain.SessionAggrStat;
import com.erik.sparkproject.domain.Task;
import com.erik.sparkproject.impl.DAOFactory;
import com.erik.sparkproject.test.MockData;
import com.erik.sparkproject.util.*;
import scala.Tuple2;
/**
* 用户访问session分析spark作业
* @author Erik
*
*/
public class UserVisitSessionAnalyzeSpark {
public static void main(String[] args) {
args = new String[]{"2"};
//构建spark上下文
//首先在Constants.java中设置spark作业相关的常量
//String SPARK_APP_NAME = "UserVisitSessionAnalyzeSpark";
//保存Constants.java配置
SparkConf conf = new SparkConf()
.setAppName(Constants.SPARK_APP_NAME)
.setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
SQLContext sqlContext = getSQLContext(sc.sc());
//生成模拟测试数据
mockData(sc, sqlContext);
//创建需要使用的DAO组件
ITaskDAO taskDAO = DAOFactory.getTaskDAO();
//那么就首先得查询出来指定的任务,并获取任务的查询参数
long taskid = ParamUtils.getTaskIdFromArgs(args);
Task task = taskDAO.findById(taskid);
JSONObject taskParam = JSONObject.parseObject(task.getTaskParam());
//如果要进行session粒度的数据聚合,
//首先要从user_visit_action表中,查询出来指定日期范围内的数据
JavaRDD<Row> actionRDD = getActionRDDByDateRange(sqlContext, taskParam);
JavaPairRDD<String, String> sessionid2AggrInfoRDD =
aggregateBySession(sqlContext, actionRDD);
//重构,同时进行过滤和统计
Accumulator<String> sessionAggrStatAccumulator = sc.accumulator(
"", new SesssionAggrStatAccumulator());
JavaPairRDD<String, String> filteredSessionid2AggrInfoRDD = filterSessionAndAggrStat(
sessionid2AggrInfoRDD, taskParam, sessionAggrStatAccumulator);
System.out.println(filteredSessionid2AggrInfoRDD.count());
//计算出各个范围的session占比,并写入MySQL
calculateAndPersistAggrStat(sessionAggrStatAccumulator.value(), task.getTaskid());
//关闭spark上下文
sc.close();
}
/**
* 获取SQLContext
* 如果在本地测试环境的话,那么久生成SQLC哦那text对象
*如果在生产环境运行的话,那么就生成HiveContext对象
* @param sc SparkContext
* @return SQLContext
*/
private static SQLContext getSQLContext(SparkContext sc) {
boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
if(local) {
return new SQLContext(sc);
}else {
return new HiveContext(sc);
}
}
/**
* 生成模拟数据
* 只有是本地模式,才会生成模拟数据
* @param sc
* @param sqlContext
*/
private static void mockData(JavaSparkContext sc, SQLContext sqlContext) {
boolean local = ConfigurationManager.getBoolean(Constants.SPARK_LOCAL);
if(local) {
MockData.mock(sc, sqlContext);
}
}
/**
* 获取指定日期范围内的用户访问行为数据
* @param sqlContext SQLContext
* @param taskParam 任务参数
* @return 行为数据RDD
*/
private static JavaRDD<Row> getActionRDDByDateRange(
SQLContext sqlContext, JSONObject taskParam) {
String startDate = ParamUtils.getParam(taskParam, Constants.PARAM_START_DATE);
String endDate = ParamUtils.getParam(taskParam, Constants.PARAM_END_DATE);
String sql = "select * "
+ "from user_visit_action"
+ "where date>='" + startDate + "'"
+ "and date<='" + endDate + "'";
DataFrame actionDF = sqlContext.sql(sql);
return actionDF.javaRDD();
}
/**
* 对行为数据按sesssion粒度进行聚合
* @param actionRDD 行为数据RDD
* @return session粒度聚合数据
*/
private static JavaPairRDD<String, String> aggregateBySession(
SQLContext sqlContext, JavaRDD<Row> actionRDD) {
//现在actionRDD中的元素是Row,一个Row就是一行用户访问行为记录,比如一次点击或者搜索
//现在需要将这个Row映射成<sessionid,Row>的格式
JavaPairRDD<String, Row> sessionid2ActionRDD = actionRDD.mapToPair(
/**
* PairFunction
* 第一个参数,相当于是函数的输入
* 第二个参数和第三个参数,相当于是函数的输出(Tuple),分别是Tuple第一个和第二个值
*/
new PairFunction<Row, String, Row>() {
private static final long serialVersionUID = 1L;
public Tuple2<String, Row> call(Row row) throws Exception {
//按照MockData.java中字段顺序获取
//此时需要拿到session_id,序号是2
return new Tuple2<String, Row>(row.getString(2), row);
}
});
//对行为数据按照session粒度进行分组
JavaPairRDD<String, Iterable<Row>> sessionid2ActionsRDD =
sessionid2ActionRDD.groupByKey();
//对每一个session分组进行聚合,将session中所有的搜索词和点击品类都聚合起来
//到此为止,获取的数据格式如下:<userid,partAggrInfo(sessionid,searchKeywords,clickCategoryIds)>
JavaPairRDD<Long, String> userid2PartAggrInfoRDD = sessionid2ActionsRDD.mapToPair(
new PairFunction<Tuple2<String, Iterable<Row>>, Long, String>() {
private static final long serialVersionUID = 1L;
public Tuple2<Long, String> call(Tuple2<String, Iterable<Row>> tuple)
throws Exception {
String sessionid = tuple._1;
Iterator<Row> iterator = tuple._2.iterator();
StringBuffer searchKeywordsBuffer = new StringBuffer("");
StringBuffer clickCategoryIdsBuffer = new StringBuffer("");
Long userid = null;
//session的起始和结束时间
Date startTime = null;
Date endTime = null;
//session的访问步长
int stepLength = 0;
//遍历session所有的访问行为
while(iterator.hasNext()) {
//提取每个 访问行为的搜索词字段和点击品类字段
Row row = iterator.next();
if(userid == null) {
userid = row.getLong(1);
}
String searchKeyword = row.getString(5);
Long clickCategoryId = row.getLong(6);
if(StringUtils.isNotEmpty(searchKeyword)) {
if(!searchKeywordsBuffer.toString().contains(searchKeyword)) {
searchKeywordsBuffer.append(searchKeyword + ",");
}
}
if(clickCategoryId != null) {
if(!clickCategoryIdsBuffer.toString().contains(
String.valueOf(clickCategoryId))) {
clickCategoryIdsBuffer.append(clickCategoryId + ",");
}
}
//计算session开始和结束时间
Date actionTime = DateUtils.parseTime(row.getString(4));
if(startTime == null) {
startTime = actionTime;
}
if(endTime == null) {
endTime = actionTime;
}
if(actionTime.before(startTime)) {
startTime = actionTime;
}
if(actionTime.after(endTime)) {
endTime = actionTime;
}
//计算session访问步长
stepLength ++;
}
//StringUtils引入的包是import com.erik.sparkproject.util.trimComma;
String searchKeywords = StringUtils.trimComma(searchKeywordsBuffer.toString());
String clickCategoryIds = StringUtils.trimComma(clickCategoryIdsBuffer.toString());
//计算session访问时长(秒)
long visitLength = (endTime.getTime() - startTime.getTime()) / 1000;
String partAggrInfo = Constants.FIELD_SESSION_ID + "=" + sessionid + "|"
+ Constants.FIELD_SEARCH_KEYWORDS + "=" + searchKeywords + "|"
+ Constants.FIELD_CLICK_CATEGORY_IDS + "=" + clickCategoryIds + "|"
+ Constants.FIELD_VISIT_LENGTH + "=" + visitLength + "|"
+ Constants.FIELD_STEP_LENGTH + "=" + stepLength;
return new Tuple2<Long, String>(userid, partAggrInfo);
}
});
//查询所有用户数据,并映射成<userid,Row>的格式
String sql = "select * from user_info";
JavaRDD<Row> userInfoRDD = sqlContext.sql(sql).javaRDD();
JavaPairRDD<Long, Row> userid2InfoRDD = userInfoRDD.mapToPair(
new PairFunction<Row, Long, Row>(){
private static final long serialVersionUID = 1L;
public Tuple2<Long, Row> call(Row row) throws Exception {
return new Tuple2<Long, Row>(row.getLong(0), row);
}
});
//将session粒度聚合数据,与用户信息进行join
JavaPairRDD<Long, Tuple2<String, Row>> userid2FullInfoRDD =
userid2PartAggrInfoRDD.join(userid2InfoRDD);
//对join起来的数据进行拼接,并且返回<sessionid,fullAggrInfo>格式的数据
JavaPairRDD<String, String> sessionid2FullAggrInfoRDD = userid2FullInfoRDD.mapToPair(
new PairFunction<Tuple2<Long, Tuple2<String, Row>>, String, String>() {
private static final long serialVersionUID = 1L;
public Tuple2<String, String> call(
Tuple2<Long, Tuple2<String, Row>> tuple) throws Exception {
String partAggrInfo = tuple._2._1;
Row userInfoRow = tuple._2._2;
String sessionid = StringUtils.getFieldFromConcatString(
partAggrInfo, "\\|", Constants.FIELD_SESSION_ID);
int age = userInfoRow.getInt(3);
String professional = userInfoRow.getString(4);
String city = userInfoRow.getString(5);
String sex = userInfoRow.getString(6);
String fullAggrInfo = partAggrInfo + "|"
+ Constants.FIELD_AGE + "=" + age + "|"
+ Constants.FIELD_PROFESSIONAL + "=" + professional + "|"
+ Constants.FIELD_CITY + "=" + city + "|"
+ Constants.FIELD_SEX + "=" + sex ;
return new Tuple2<String, String>(sessionid, fullAggrInfo);
}
});
return sessionid2FullAggrInfoRDD;
}
/**
* 过滤session数据,并进行聚合统计
* @param sessionid2AggrInfoRDD
* @return
*/
private static JavaPairRDD<String, String> filterSessionAndAggrStat(
JavaPairRDD<String, String> sessionid2AggrInfoRDD,
final JSONObject taskParam,
final Accumulator<String> sessionAggrAccumulator) {
//为了使用后面的ValieUtils,所以,首先将所有的筛选参数拼接成一个连接串
String startAge = ParamUtils.getParam(taskParam, Constants.PARAM_END_AGE);
String endAge = ParamUtils.getParam(taskParam, Constants.PARAM_END_AGE);
String professionals = ParamUtils.getParam(taskParam, Constants.PARAM_PROFESSIONALS);
String cities = ParamUtils.getParam(taskParam, Constants.PARAM_CITIES);
String sex = ParamUtils.getParam(taskParam, Constants.PARAM_SEX);
String keywords = ParamUtils.getParam(taskParam, Constants.PARAM_KEYWORDS);
String categoryIds = ParamUtils.getParam(taskParam, Constants.PARAM_CATEGORY_IDS);
String _parameter = (startAge != null ? Constants.PARAM_START_AGE + "=" + startAge + "|" : "")
+ (endAge != null ? Constants.PARAM_END_AGE + "=" + endAge + "|" : "")
+ (professionals != null ? Constants.PARAM_PROFESSIONALS + "=" + professionals + "|" : "")
+ (cities != null ? Constants.PARAM_CITIES + "=" + cities + "|" : "")
+ (sex != null ? Constants.PARAM_SEX + "=" + sex + "|" : "")
+ (keywords != null ? Constants.PARAM_KEYWORDS + "=" + keywords + "|" : "")
+ (categoryIds != null ? Constants.PARAM_CATEGORY_IDS + "=" + categoryIds : "");
if (_parameter.endsWith("\\|")) {
_parameter = _parameter.substring(0, _parameter.length() - 1);
}
final String parameter = _parameter;
//根据筛选参数进行过滤
JavaPairRDD<String, String> filteredSessionid2AggrInfoRDD = sessionid2AggrInfoRDD.filter(
new Function<Tuple2<String, String>, Boolean>() {
private static final long serialVersionUID = 1L;
public Boolean call(Tuple2<String, String> tuple) throws Exception {
//首先,从tuple中,获取聚合数据
String aggrInfo = tuple._2;
if(!ValidUtils.between(aggrInfo, Constants.FIELD_AGE,
parameter, Constants.PARAM_START_AGE, Constants.PARAM_END_AGE)) {
return false;
}
//按照职业范围进行过滤(professionals)
if(!ValidUtils.in(aggrInfo, Constants.FIELD_PROFESSIONAL,
parameter, Constants.PARAM_PROFESSIONALS)) {
return false;
}
//按照城市范围进行过滤(cities)
if(!ValidUtils.in(aggrInfo, Constants.FIELD_CITY,
parameter, Constants.PARAM_CATEGORY_IDS)) {
return false;
}
//按照性别过滤
if(!ValidUtils.equal(aggrInfo, Constants.FIELD_SEX,
parameter, Constants.PARAM_SEX)) {
return false;
}
//按照搜索词过滤
if(!ValidUtils.in(aggrInfo, Constants.FIELD_SEARCH_KEYWORDS,
parameter, Constants.PARAM_KEYWORDS)) {
return false;
}
//按照点击品类id进行搜索
if(!ValidUtils.in(aggrInfo, Constants.FIELD_CLICK_CATEGORY_IDS,
parameter, Constants.PARAM_CATEGORY_IDS)) {
return false;
}
sessionAggrAccumulator.add(Constants.SESSION_COUNT);
//计算出session的访问时长和访问步长的范围,并进行相应的累加
long visitLength = Long.valueOf(StringUtils.getFieldFromConcatString(
aggrInfo, "\\|", Constants.FIELD_VISIT_LENGTH));
long stepLength = Long.valueOf(StringUtils.getFieldFromConcatString(
aggrInfo, "\\|", Constants.FIELD_STEP_LENGTH));
calculateVisitLength(visitLength);
calculateStepLength(stepLength);
return true;
}
/**
* 计算访问时长范围
* @param visitLength
*/
private void calculateVisitLength(long visitLength) {
if(visitLength >= 1 && visitLength <= 3) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_1s_3s);
}else if(visitLength >= 4 && visitLength <= 6) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_4s_6s);
}else if(visitLength >= 7 && visitLength <= 9) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_7s_9s);
}else if(visitLength >= 10 && visitLength <= 30) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_10s_30s);
}else if(visitLength > 30 && visitLength <= 60) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_30s_60s);
}else if(visitLength > 60 && visitLength <= 180) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_1m_3m);
}else if(visitLength > 180 && visitLength <= 600) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_3m_10m);
}else if(visitLength > 600 && visitLength <= 1800) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_10m_30m);
}else if(visitLength > 1800) {
sessionAggrAccumulator.add(Constants.TIME_PERIOD_30m);
}
}
/**
* 计算访问步长范围
* @param stepLength
*/
private void calculateStepLength(long stepLength) {
if(stepLength >= 1 && stepLength <= 3) {
sessionAggrAccumulator.add(Constants.STEP_PERIOD_1_3);
}else if(stepLength >= 4 && stepLength <= 6) {
sessionAggrAccumulator.add(Constants.STEP_PERIOD_4_6);
}else if(stepLength >= 7 && stepLength <= 9) {
sessionAggrAccumulator.add(Constants.STEP_PERIOD_7_9);
}else if(stepLength >= 10 && stepLength <= 30) {
sessionAggrAccumulator.add(Constants.STEP_PERIOD_10_30);
}else if(stepLength > 30 && stepLength <= 60) {
sessionAggrAccumulator.add(Constants.STEP_PERIOD_30_60);
}else if(stepLength > 60) {
sessionAggrAccumulator.add(Constants.STEP_PERIOD_60);
}
}
});
return filteredSessionid2AggrInfoRDD;
}
/*
* 计算各session范围占比,并写入MySQL
*/
private static void calculateAndPersistAggrStat(String value, long taskid) {
//从Accumulator统计串中获取值
long session_count = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.SESSION_COUNT));
long visit_length_1s_3s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_1s_3s));
long visit_length_4s_6s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_4s_6s));
long visit_length_7s_9s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_7s_9s));
long visit_length_10s_30s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_10s_30s));
long visit_length_30s_60s = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_30s_60s));
long visit_length_1m_3m = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_1m_3m));
long visit_length_3m_10m = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_3m_10m));
long visit_length_10m_30m = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_10m_30m));
long visit_length_30m = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.TIME_PERIOD_30m));
long step_length_1_3 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_1_3));
long step_length_4_6 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_4_6));
long step_length_7_9 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_7_9));
long step_length_10_30 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_10_30));
long step_length_30_60 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_30_60));
long step_length_60 = Long.valueOf(StringUtils.getFieldFromConcatString(
value, "\\|", Constants.STEP_PERIOD_60));
//计算各个访问时长和访问步长的范围
double visit_length_1s_3s_ratio = NumberUtils.formatDouble(
(double)visit_length_1s_3s / (double)session_count, 2);
double visit_length_4s_6s_ratio = NumberUtils.formatDouble(
(double)visit_length_4s_6s / (double)session_count, 2);
double visit_length_7s_9s_ratio = NumberUtils.formatDouble(
(double)visit_length_7s_9s / (double)session_count, 2);
double visit_length_10s_30s_ratio = NumberUtils.formatDouble(
(double)visit_length_10s_30s / (double)session_count, 2);
double visit_length_30s_60s_ratio = NumberUtils.formatDouble(
(double)visit_length_30s_60s / (double)session_count, 2);
double visit_length_1m_3m_ratio = NumberUtils.formatDouble(
(double)visit_length_1m_3m / (double)session_count, 2);
double visit_length_3m_10m_ratio = NumberUtils.formatDouble(
(double)visit_length_3m_10m / (double)session_count, 2);
double visit_length_10m_30m_ratio = NumberUtils.formatDouble(
(double)visit_length_10m_30m / (double)session_count, 2);
double visit_length_30m_ratio = NumberUtils.formatDouble(
(double)visit_length_30m / (double)session_count, 2);
double step_length_1_3_ratio = NumberUtils.formatDouble(
(double)step_length_1_3 / (double)session_count, 2);
double step_length_4_6_ratio = NumberUtils.formatDouble(
(double)step_length_4_6 / (double)session_count, 2);
double step_length_7_9_ratio = NumberUtils.formatDouble(
(double)step_length_7_9 / (double)session_count, 2);
double step_length_10_30_ratio = NumberUtils.formatDouble(
(double)step_length_10_30 / (double)session_count, 2);
double step_length_30_60_ratio = NumberUtils.formatDouble(
(double)step_length_30_60 / (double)session_count, 2);
double step_length_60_ratio = NumberUtils.formatDouble(
(double)step_length_60 / (double)session_count, 2);
//将访问结果封装成Domain对象
SessionAggrStat sessionAggrStat = new SessionAggrStat();
sessionAggrStat.setTaskid(taskid);
sessionAggrStat.setSession_count(session_count);
sessionAggrStat.setVisit_length_1s_3s_ratio(visit_length_1s_3s_ratio);
sessionAggrStat.setVisit_length_4s_6s_ratio(visit_length_4s_6s_ratio);
sessionAggrStat.setVisit_length_7s_9s_ratio(visit_length_7s_9s_ratio);
sessionAggrStat.setVisit_length_10s_30s_ratio(visit_length_10s_30s_ratio);
sessionAggrStat.setVisit_length_30s_60s_ratio(visit_length_30s_60s_ratio);
sessionAggrStat.setVisit_length_1m_3m_ratio(visit_length_1m_3m_ratio);
sessionAggrStat.setVisit_length_3m_10m_ratio(visit_length_3m_10m_ratio);
sessionAggrStat.setVisit_length_10m_30m_ratio(visit_length_10m_30m_ratio);
sessionAggrStat.setVisit_length_30m_ratio(visit_length_30m_ratio);
sessionAggrStat.setStep_length_1_3_ratio(step_length_1_3_ratio);
sessionAggrStat.setStep_length_4_6_ratio(step_length_4_6_ratio);
sessionAggrStat.setStep_length_7_9_ratio(step_length_7_9_ratio);
sessionAggrStat.setStep_length_10_30_ratio(step_length_10_30_ratio);
sessionAggrStat.setStep_length_30_60_ratio(step_length_30_60_ratio);
sessionAggrStat.setStep_length_60_ratio(step_length_60_ratio);
//调用对用的DAO插入统计结果
ISessionAggrStatDAO sessionAggrStatDAO = DAOFactory.getSessionAggrStatDAO();
sessionAggrStatDAO.insert(sessionAggrStat);
}
}
总结:数据写入MySQL步骤
从上面的实现方式可以看出规范的数据写入MySQL步骤为:
1. 封装Domain:将MySQL数据库表中的对象生成setX(),getX()方法;
2. 封装DAO:创建DAO接口;
3. 添加DAOFactory:在DAOFactory中添加获取DAO的方法;
4. 创建Impl:创建sql语句,设置一系列getX()和写入MySQL的语句;
5. 写入MySQL:通过setX()语句赋值并将值写入MySQL.
《Spark 大型电商项目实战》源码:https://github.com/Erik-ly/SprakProject
本文为《Spark大型电商项目实战》系列文章之一。
更多文章:Spark大型电商项目实战:http://blog.csdn.net/u012318074/article/category/6744423