案例背景
如果你是在一个互联网公司,然后你的公司现在也在做移动互联网,做了一个手机app
那么你的手机app的用户,每次进行点击,或者是一些搜索操作的时候,都会跟你的远程的后端服务器做一次交互
也就是说,你的手机app,首先会往后端服务器发送一个请求,然后你的后端服务器会给你的手机app返回一个响应,响应的内容可能是图片、或者文字、或者json
此时,就完成了一次你的移动端app和后端服务器之间的交互过程
通常来说,在你的移动端app访问你的后端服务器的时候,你的后端服务器会记录一条日志
这个日志,也就是你的移动端app访问流量的相关日志,但是也可以根据你自己的需要,移动端发送一条日志过来,服务器端的web系统保存日志
我们这里做的就是最基本的,记录你的移动端app和服务器之间的上行数据包和下行数据包,上行流量和下行流量
我们要计算的是:每个移动端设备都有一个唯一标识,即deviceID
然后呢,每条日志,都会有这一次请求和响应的上行流量和下行流量的记录,这里呢,上行流量指的是手机app向服务器发送的请求数据的流量
下行流量,认为是服务器端给手机app返回的数据(比如说图片、文字、json)的流量
每个设备(deviceID),总上行流量和总下行流量,计算之后,要根据上行流量和下行流量进行排序,需要进行倒序排序
获取流量最大的前10个设备
日志模拟的工具类,及相关日志的生成
public class DataGenerator {

    /** Number of distinct device IDs to simulate (the original comment said 100 but the loop made 1000). */
    private static final int DEVICE_COUNT = 1000;
    /** Number of log lines to generate. */
    private static final int LOG_COUNT = 100000;
    /** Upper bound (exclusive) for random up/down traffic values. */
    private static final int MAX_TRAFFIC = 100000;
    /** Random timestamps fall within the last this-many minutes. */
    private static final int MAX_MINUTES_BACK = 600;
    /** Default output file when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT = "E:\\testdata\\applog\\access.log";

    /**
     * Generates a simulated access log where each line has the format:
     * timestamp \t deviceID \t upTraffic \t downTraffic.
     *
     * @param args optional; args[0] overrides the output file path
     * @throws Exception if the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        Random random = new Random();

        // Build the pool of distinct device IDs.
        List<String> deviceIDs = new ArrayList<String>();
        for (int i = 0; i < DEVICE_COUNT; i++) {
            deviceIDs.add(getRandomUUID());
        }

        // Assemble all log lines in memory, then write once at the end.
        // StringBuilder: single-threaded use, no need for StringBuffer's locking.
        StringBuilder buffer = new StringBuilder();
        long now = System.currentTimeMillis();
        for (int i = 0; i < LOG_COUNT; i++) {
            // Random timestamp within the last MAX_MINUTES_BACK minutes
            // (same effect as the original Calendar.add(MINUTE, -n) dance).
            long timestamp = now - random.nextInt(MAX_MINUTES_BACK) * 60000L;
            // Random device from the pool.
            String deviceID = deviceIDs.get(random.nextInt(DEVICE_COUNT));
            // Random up/down traffic for this request/response pair.
            long upTraffic = random.nextInt(MAX_TRAFFIC);
            long downTraffic = random.nextInt(MAX_TRAFFIC);

            buffer.append(timestamp).append("\t")
                    .append(deviceID).append("\t")
                    .append(upTraffic).append("\t")
                    .append(downTraffic).append("\n");
        }

        String outputPath = args.length > 0 ? args[0] : DEFAULT_OUTPUT;
        // try-with-resources closes the writer even on failure; the original
        // could throw NullPointerException in finally when the stream failed
        // to open, and silently swallowed write errors via printStackTrace.
        try (PrintWriter pw = new PrintWriter(new OutputStreamWriter(
                new FileOutputStream(outputPath)))) {
            pw.write(buffer.toString());
        }
    }

    /** @return a random UUID with the dashes stripped, used as a deviceID */
    private static String getRandomUUID() {
        return UUID.randomUUID().toString().replace("-", "");
    }
}
代码实现
读取日志文件并创建RDD
public class AppLogSpark {

    /**
     * Step 1 of the tutorial: create the Spark context and load the access
     * log into an initial RDD, one element per log line.
     */
    public static void main(String[] args) {
        // Spark configuration: run locally.
        SparkConf conf = new SparkConf()
                .setAppName("AppLogSpark")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // textFile() reads a local file or an HDFS path.
            // NOTE: path fixed to match the file DataGenerator writes
            // (the original pointed at C://Users//Administrator//Desktop).
            JavaRDD<String> accessLogRDD = sc.textFile(
                    "E:\\testdata\\applog\\access.log");
        } finally {
            // Always release the Spark context, even if reading fails.
            sc.close();
        }
    }
}
创建自定义的可序列化类
/**
 * Value object for one access-log record's numeric fields: the request
 * timestamp plus its up- and down-traffic counts.  Serializable so that
 * instances can be shipped between Spark nodes during shuffles.
 */
public class AccessLogInfo implements Serializable {

    private static final long serialVersionUID = 0L;

    private long timestamp;   // request time (epoch millis)
    private long upTraffic;   // traffic sent app -> server
    private long downTraffic; // traffic sent server -> app

    /** No-arg constructor, required by serialization frameworks. */
    public AccessLogInfo() {
    }

    /** Creates a record with all three fields populated. */
    public AccessLogInfo(long timestamp, long upTraffic, long downTraffic) {
        this.timestamp = timestamp;
        this.upTraffic = upTraffic;
        this.downTraffic = downTraffic;
    }

    public long getUpTraffic() {
        return upTraffic;
    }

    public void setUpTraffic(long upTraffic) {
        this.upTraffic = upTraffic;
    }

    public long getDownTraffic() {
        return downTraffic;
    }

    public void setDownTraffic(long downTraffic) {
        this.downTraffic = downTraffic;
    }

    public long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }
}
将RDD映射为key-value格式
public class AppLogSpark {

    /**
     * Step 2 of the tutorial: load the access log and map each line to a
     * (deviceID, AccessLogInfo) pair, ready for reduceByKey aggregation.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("AppLogSpark")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // One RDD element per log line.
            JavaRDD<String> accessLogRDD = sc.textFile(
                    "E:\\testdata\\applog\\access.log");
            // Key by deviceID so per-device aggregation can follow.
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD =
                    mapAccessLogRDD2Pair(accessLogRDD);
        } finally {
            // Release the Spark context even if a stage fails.
            sc.close();
        }
    }

    /**
     * Maps each raw log line to a (deviceID, AccessLogInfo) pair.
     * Line format: timestamp \t deviceID \t upTraffic \t downTraffic.
     *
     * @param accessLogRDD RDD of raw log lines
     * @return pair RDD keyed by deviceID
     */
    private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
            JavaRDD<String> accessLogRDD) {
        return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {
            @Override
            public Tuple2<String, AccessLogInfo> call(String line) throws Exception {
                // Fields: [0]=timestamp, [1]=deviceID, [2]=upTraffic, [3]=downTraffic
                String[] fields = line.split("\t");
                return new Tuple2<String, AccessLogInfo>(fields[1],
                        new AccessLogInfo(Long.parseLong(fields[0]),
                                Long.parseLong(fields[2]),
                                Long.parseLong(fields[3])));
            }
        });
    }
}
基于deviceID进行聚合操作
public class AppLogSpark {

    /**
     * Step 3 of the tutorial: aggregate the per-line records by deviceID,
     * producing each device's total up/down traffic and earliest timestamp.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("AppLogSpark")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // One RDD element per log line.
            JavaRDD<String> accessLogRDD = sc.textFile(
                    "E:\\testdata\\applog\\access.log");
            // Key by deviceID for the reduceByKey below.
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD =
                    mapAccessLogRDD2Pair(accessLogRDD);
            // Per-device traffic totals and earliest access timestamp.
            JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD =
                    aggregateByDeviceID(accessLogPairRDD);
        } finally {
            // Release the Spark context even if a stage fails.
            sc.close();
        }
    }

    /**
     * Maps each raw log line to a (deviceID, AccessLogInfo) pair.
     * Line format: timestamp \t deviceID \t upTraffic \t downTraffic.
     *
     * @param accessLogRDD RDD of raw log lines
     * @return pair RDD keyed by deviceID
     */
    private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
            JavaRDD<String> accessLogRDD) {
        return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {
            @Override
            public Tuple2<String, AccessLogInfo> call(String line) throws Exception {
                // Fields: [0]=timestamp, [1]=deviceID, [2]=upTraffic, [3]=downTraffic
                String[] fields = line.split("\t");
                return new Tuple2<String, AccessLogInfo>(fields[1],
                        new AccessLogInfo(Long.parseLong(fields[0]),
                                Long.parseLong(fields[2]),
                                Long.parseLong(fields[3])));
            }
        });
    }

    /**
     * Reduces all records of one deviceID into a single AccessLogInfo:
     * traffic totals are summed, the timestamp keeps the earliest value.
     *
     * @param accessLogPairRDD pair RDD keyed by deviceID
     * @return per-device aggregated RDD
     */
    private static JavaPairRDD<String, AccessLogInfo> aggregateByDeviceID(
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD) {
        return accessLogPairRDD.reduceByKey(
                new Function2<AccessLogInfo, AccessLogInfo, AccessLogInfo>() {
            @Override
            public AccessLogInfo call(AccessLogInfo a, AccessLogInfo b)
                    throws Exception {
                // Earliest access time wins; traffic accumulates.
                return new AccessLogInfo(
                        Math.min(a.getTimestamp(), b.getTimestamp()),
                        a.getUpTraffic() + b.getUpTraffic(),
                        a.getDownTraffic() + b.getDownTraffic());
            }
        });
    }
}
自定义二次排序key类
/**
 * Secondary-sort key for Spark's sortByKey(): orders by upTraffic, then
 * downTraffic, then timestamp.  Implements scala.math.Ordered so Spark can
 * compare keys, and Serializable so keys can be shuffled between nodes.
 */
public class AccessLogSortKey implements Ordered<AccessLogSortKey>, Serializable {

    private static final long serialVersionUID = 3702442700882342403L;

    private long upTraffic;
    private long downTraffic;
    private long timestamp;

    /** No-arg constructor, required by serialization frameworks. */
    public AccessLogSortKey() {
    }

    public AccessLogSortKey(long timestamp, long upTraffic, long downTraffic) {
        this.timestamp = timestamp;
        this.upTraffic = upTraffic;
        this.downTraffic = downTraffic;
    }

    /**
     * Compares by upTraffic, then downTraffic, then timestamp.
     * Uses Long.compare instead of the original {@code (int)(a - b)} cast,
     * which could overflow/truncate long differences and order keys wrongly.
     */
    @Override
    public int compareTo(AccessLogSortKey other) {
        int cmp = Long.compare(upTraffic, other.upTraffic);
        if (cmp == 0) {
            cmp = Long.compare(downTraffic, other.downTraffic);
        }
        if (cmp == 0) {
            cmp = Long.compare(timestamp, other.timestamp);
        }
        return cmp;
    }

    /** Same ordering as {@link #compareTo(AccessLogSortKey)}. */
    @Override
    public int compare(AccessLogSortKey other) {
        return compareTo(other);
    }

    // The four Ordered operators are derived from compareTo so the
    // relational methods can never disagree with the comparator.

    @Override
    public boolean $greater(AccessLogSortKey other) {
        return compareTo(other) > 0;
    }

    @Override
    public boolean $greater$eq(AccessLogSortKey other) {
        return compareTo(other) >= 0;
    }

    @Override
    public boolean $less(AccessLogSortKey other) {
        return compareTo(other) < 0;
    }

    @Override
    public boolean $less$eq(AccessLogSortKey other) {
        return compareTo(other) <= 0;
    }

    public long getUpTraffic() {
        return upTraffic;
    }

    public void setUpTraffic(long upTraffic) {
        this.upTraffic = upTraffic;
    }

    public long getDownTraffic() {
        return downTraffic;
    }

    public void setDownTraffic(long downTraffic) {
        this.downTraffic = downTraffic;
    }

    public long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    /** Hash over all three fields, consistent with equals(). */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + (int) (downTraffic ^ (downTraffic >>> 32));
        result = prime * result + (int) (timestamp ^ (timestamp >>> 32));
        result = prime * result + (int) (upTraffic ^ (upTraffic >>> 32));
        return result;
    }

    /** Keys are equal iff all three fields match (consistent with compareTo). */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        AccessLogSortKey other = (AccessLogSortKey) obj;
        return downTraffic == other.downTraffic
                && timestamp == other.timestamp
                && upTraffic == other.upTraffic;
    }

    @Override
    public String toString() {
        return "AccessLogSortKey{" +
                "upTraffic=" + upTraffic +
                ", downTraffic=" + downTraffic +
                ", timestamp=" + timestamp +
                '}';
    }
}
将二次排序key映射为RDD的key
public class AppLogSpark {

    /**
     * Step 4 of the tutorial: after per-device aggregation, re-key the RDD
     * with the secondary-sort key (AccessLogSortKey) and keep the deviceID
     * as the value, so a later sortByKey can order devices by traffic.
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("AppLogSpark")
                .setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            // One RDD element per log line.
            JavaRDD<String> accessLogRDD = sc.textFile(
                    "E:\\testdata\\applog\\access.log");
            // Key by deviceID for the reduceByKey below.
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD =
                    mapAccessLogRDD2Pair(accessLogRDD);
            // Per-device traffic totals and earliest access timestamp.
            JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD =
                    aggregateByDeviceID(accessLogPairRDD);
            // Swap to (sortKey, deviceID) pairs for secondary sorting.
            JavaPairRDD<AccessLogSortKey, String> accessLogSortRDD =
                    mapRDDKey2SortKey(aggrAccessLogPairRDD);
        } finally {
            // Release the Spark context even if a stage fails.
            sc.close();
        }
    }

    /**
     * Maps each raw log line to a (deviceID, AccessLogInfo) pair.
     * Line format: timestamp \t deviceID \t upTraffic \t downTraffic.
     *
     * @param accessLogRDD RDD of raw log lines
     * @return pair RDD keyed by deviceID
     */
    private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
            JavaRDD<String> accessLogRDD) {
        return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {
            @Override
            public Tuple2<String, AccessLogInfo> call(String line) throws Exception {
                // Fields: [0]=timestamp, [1]=deviceID, [2]=upTraffic, [3]=downTraffic
                String[] fields = line.split("\t");
                return new Tuple2<String, AccessLogInfo>(fields[1],
                        new AccessLogInfo(Long.parseLong(fields[0]),
                                Long.parseLong(fields[2]),
                                Long.parseLong(fields[3])));
            }
        });
    }

    /**
     * Reduces all records of one deviceID into a single AccessLogInfo:
     * traffic totals are summed, the timestamp keeps the earliest value.
     *
     * @param accessLogPairRDD pair RDD keyed by deviceID
     * @return per-device aggregated RDD
     */
    private static JavaPairRDD<String, AccessLogInfo> aggregateByDeviceID(
            JavaPairRDD<String, AccessLogInfo> accessLogPairRDD) {
        return accessLogPairRDD.reduceByKey(
                new Function2<AccessLogInfo, AccessLogInfo, AccessLogInfo>() {
            @Override
            public AccessLogInfo call(AccessLogInfo a, AccessLogInfo b)
                    throws Exception {
                // Earliest access time wins; traffic accumulates.
                return new AccessLogInfo(
                        Math.min(a.getTimestamp(), b.getTimestamp()),
                        a.getUpTraffic() + b.getUpTraffic(),
                        a.getDownTraffic() + b.getDownTraffic());
            }
        });
    }

    /**
     * Re-keys the aggregated RDD: the AccessLogInfo becomes a secondary-sort
     * key and the deviceID becomes the value.
     *
     * @param aggrAccessLogPairRDD per-device aggregated RDD
     * @return RDD keyed by AccessLogSortKey
     */
    private static JavaPairRDD<AccessLogSortKey, String> mapRDDKey2SortKey(
            JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD) {
        return aggrAccessLogPairRDD.mapToPair(
                new PairFunction<Tuple2<String, AccessLogInfo>, AccessLogSortKey, String>() {
            @Override
            public Tuple2<AccessLogSortKey, String> call(
                    Tuple2<String, AccessLogInfo> tuple) throws Exception {
                String deviceID = tuple._1;
                AccessLogInfo info = tuple._2;
                // Wrap the aggregated metrics in the secondary-sort key.
                AccessLogSortKey sortKey = new AccessLogSortKey(
                        info.getTimestamp(),
                        info.getUpTraffic(),
                        info.getDownTraffic());
                return new Tuple2<AccessLogSortKey, String>(sortKey, deviceID);
            }
        });
    }
}
执行二次排序以及获取top10数据
public class AppLogSpark {
public static void main(String[] args) {
// 创建Spark配置和上下文对象
SparkConf conf = new SparkConf()
.setAppName("AppLogSpark")
.setMaster("local");
JavaSparkContext sc = new JavaSparkContext(conf);
// 读取日志文件,并创建一个RDD
// 使用SparkContext的textFile()方法,即可读取本地磁盘文件,或者是HDFS上的文件
// 创建出来一个初始的RDD,其中包含了日志文件中的所有数据
JavaRDD<String> accessLogRDD = sc.textFile(
"E:\\testdata\\applog\\access.log");
// 将RDD映射为key-value格式,为后面的reduceByKey聚合做准备
JavaPairRDD<String, AccessLogInfo> accessLogPairRDD =
mapAccessLogRDD2Pair(accessLogRDD);
// 根据deviceID进行聚合操作
// 获取每个deviceID的总上行流量、总下行流量、最早访问时间戳
JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD =
aggregateByDeviceID(accessLogPairRDD);
// 将按deviceID聚合RDD的key映射为二次排序key,value映射为deviceID
JavaPairRDD<AccessLogSortKey, String> accessLogSortRDD =
mapRDDKey2SortKey(aggrAccessLogPairRDD);
// 执行二次排序操作,按照上行流量、下行流量以及时间戳进行倒序排序
JavaPairRDD<AccessLogSortKey ,String> sortedAccessLogRDD =
accessLogSortRDD.sortByKey(false);
// 获取top10数据
List<Tuple2<AccessLogSortKey, String>> top10DataList =
sortedAccessLogRDD.take(10);
for(Tuple2<AccessLogSortKey, String> data : top10DataList) {
System.out.println(data._2 + ": " + data._1);
}
// 关闭Spark上下文
sc.close();
}
/**
* 将日志RDD映射为key-value的格式
* @param accessLogRDD 日志RDD
* @return key-value格式RDD
*/
// Long.parseLong(strings[0]), Long.parseLong(strings[2]), Long.parseLong(strings[3])
private static JavaPairRDD<String, AccessLogInfo> mapAccessLogRDD2Pair(
JavaRDD<String> accessLogRDD) {
return accessLogRDD.mapToPair(new PairFunction<String, String, AccessLogInfo>() {
@Override
public Tuple2<String, AccessLogInfo> call(String s) throws Exception {
String[] strings = s.split("\t");
return new Tuple2<>(strings[1],
new AccessLogInfo(Long.parseLong(strings[0]),Long.parseLong(strings[2]), Long.parseLong(strings[3])));
}
});
}
/**
* 根据deviceID进行聚合操作
* 计算出每个deviceID的总上行流量、总下行流量以及最早访问时间
* @param accessLogPairRDD 日志key-value格式RDD
* @return 按deviceID聚合RDD
*/
private static JavaPairRDD<String, AccessLogInfo> aggregateByDeviceID(
JavaPairRDD<String, AccessLogInfo> accessLogPairRDD) {
return accessLogPairRDD.reduceByKey(new Function2<AccessLogInfo, AccessLogInfo, AccessLogInfo>() {
@Override
public AccessLogInfo call(AccessLogInfo accessLogInfo1, AccessLogInfo accessLogInfo2)
throws Exception {
long timestamp = accessLogInfo1.getTimestamp() < accessLogInfo2.getTimestamp() ?
accessLogInfo1.getTimestamp() : accessLogInfo2.getTimestamp();
long upTraffic = accessLogInfo1.getUpTraffic() + accessLogInfo2.getUpTraffic();
long downTraffic = accessLogInfo1.getDownTraffic() + accessLogInfo2.getDownTraffic();
AccessLogInfo accessLogInfo = new AccessLogInfo();
accessLogInfo.setTimestamp(timestamp);
accessLogInfo.setUpTraffic(upTraffic);
accessLogInfo.setDownTraffic(downTraffic);
return accessLogInfo;
}
});
}
/**
* 将RDD的key映射为二次排序key
* @param aggrAccessLogPairRDD 按deviceID聚合RDD
* @return 二次排序key RDD
*/
private static JavaPairRDD<AccessLogSortKey, String> mapRDDKey2SortKey(
JavaPairRDD<String, AccessLogInfo> aggrAccessLogPairRDD) {
return aggrAccessLogPairRDD.mapToPair(new PairFunction<Tuple2<String,AccessLogInfo>, AccessLogSortKey, String>() {
@Override
public Tuple2<AccessLogSortKey, String> call(
Tuple2<String, AccessLogInfo> tuple) throws Exception {
// 获取tuple数据
String deviceID = tuple._1;
AccessLogInfo accessLogInfo = tuple._2;
// 将日志信息封装为二次排序key
AccessLogSortKey accessLogSortKey = new AccessLogSortKey(
accessLogInfo.getTimestamp(),
accessLogInfo.getUpTraffic(),
accessLogInfo.getDownTraffic()
);
// 返回新的Tuple
return new Tuple2<AccessLogSortKey, String>(accessLogSortKey, deviceID);
}
});
}
}