之前用scala写过一些spark的小例子,最近在准备面试,因此在这里用java实现简单的Spark例子。
数据很简单,是一个被预先处理的日志文件,包括时间、电话号,上行流量和下行流量。一行为一条记录,不同数据之间用制表符隔开。
样本类
样本类是为了将日志文件的一条记录封装起来
package com.icesun.java.accessLog;
import java.io.Serializable;
/**
 * Value object for one access-log record: timestamp, phone number,
 * downstream traffic and upstream traffic. Serializable so Spark can
 * ship instances between driver and executors.
 */
public class LogInfo implements Serializable {
    private static final long serialVersionUID = 5749943279909593929L;

    private long timeStamp;   // epoch time of the record
    private String phoneNo;   // phone number, used as the aggregation key
    private long down;        // downstream traffic
    private long up;          // upstream traffic

    /** No-arg constructor required for bean-style reflection (e.g. createDataFrame). */
    LogInfo() {
    }

    /** Builds a record from its four parsed fields. */
    LogInfo(long timeStamp, String phoneNo, long down, long up) {
        this.timeStamp = timeStamp;
        this.phoneNo = phoneNo;
        this.down = down;
        this.up = up;
    }

    public long getTimeStamp() {
        return this.timeStamp;
    }

    public String getPhoneNo() {
        return this.phoneNo;
    }

    public long getDown() {
        return this.down;
    }

    public long getUp() {
        return this.up;
    }
}
复制代码
Spark Core API
package com.icesun.java.accessLog;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Spark Core example: parses a tab-separated access log
 * (timestamp \t phone \t down \t up) and aggregates per-phone traffic.
 */
public class LogSpark {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setMaster("local").setAppName("AccessLog");
        JavaSparkContext sc = new JavaSparkContext(conf);
        sc.setLogLevel("WARN");

        String path = "files/access.log";
        JavaRDD<String> lines = sc.textFile(path);

        JavaPairRDD<String, LogInfo> logPairRDD = RDD2RDDPair(lines);
        JavaPairRDD<String, LogInfo> reduceByKey = aggregateByDeviceID(logPairRDD);

        reduceByKey.foreach(x -> System.out.println(x._2.getDown()));
        System.out.println(reduceByKey.count());
        sc.close();
    }

    /**
     * Maps each raw log line to a <phone, LogInfo> pair.
     * Expected line format: timestamp \t phone \t down \t up.
     */
    private static JavaPairRDD<String, LogInfo> RDD2RDDPair(JavaRDD<String> accessLogRDD) {
        return accessLogRDD.mapToPair((PairFunction<String, String, LogInfo>) line -> {
            String[] logInfo = line.split("\t");
            long timeStamp = Long.parseLong(logInfo[0]);
            String phone = logInfo[1];
            long down = Long.parseLong(logInfo[2]);
            // BUG FIX: upstream traffic is column 3; the original read
            // logInfo[2] twice, so "up" silently duplicated "down".
            long up = Long.parseLong(logInfo[3]);
            LogInfo log = new LogInfo(timeStamp, phone, down, up);
            return new Tuple2<>(phone, log);
        });
    }

    /**
     * Reduces by phone number: keeps the earliest timestamp and sums the
     * upstream and downstream traffic of the two records.
     */
    private static JavaPairRDD<String, LogInfo> aggregateByDeviceID(JavaPairRDD<String, LogInfo> pairRDD) {
        return pairRDD.reduceByKey((Function2<LogInfo, LogInfo, LogInfo>) (v1, v2) -> {
            // Earliest of the two timestamps.
            long timeStamp = Math.min(v1.getTimeStamp(), v2.getTimeStamp());
            long up = v1.getUp() + v2.getUp();
            long down = v1.getDown() + v2.getDown();
            String phone = v1.getPhoneNo();
            // BUG FIX: the LogInfo constructor signature is
            // (timeStamp, phoneNo, down, up); the original passed (…, up, down),
            // swapping the aggregated up/down totals.
            return new LogInfo(timeStamp, phone, down, up);
        });
    }
}
复制代码
SparkSQL
package com.icesun.java.accessLog;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
/**
 * Spark SQL example: loads the same access log into a DataFrame and
 * queries it both through the DataFrame API and through SQL on a temp view.
 */
public class LogSparkSQL {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SparkSQL").setMaster("local");
        JavaSparkContext sc = new JavaSparkContext(conf);
        SQLContext sqlContext = new SQLContext(sc);

        JavaRDD<String> lines = sc.textFile("files/access.log");

        // Parse tab-separated lines into LogInfo beans.
        // Expected line format: timestamp \t phone \t down \t up.
        JavaRDD<LogInfo> logInfo = lines.map(line -> {
            String[] str = line.split("\t");
            long timeStamp = Long.parseLong(str[0]);
            String phone = str[1];
            long down = Long.parseLong(str[2]);
            long up = Long.parseLong(str[3]);
            return new LogInfo(timeStamp, phone, down, up);
        });

        // Build a DataFrame from the bean RDD (columns come from LogInfo getters).
        Dataset<Row> df = sqlContext.createDataFrame(logInfo, LogInfo.class);

        // BUG FIX: filter before projecting. In the original
        // select("phoneNo","down").where("up > 50000") the "up" column is
        // already projected away, so the filter cannot resolve it.
        df.where("up > 50000").select("phoneNo", "down").show();

        // Register a temp view so the same data can be queried with SQL.
        df.createOrReplaceTempView("log");
        Dataset<Row> seleRs = sqlContext.sql("select * from log where up > 50000 and down < 10000");
        seleRs.toJavaRDD().foreach(row -> System.out.println(row.get(1)));

        // FIX: release Spark resources; the original never closed the context
        // (the LogSpark example does, so this also restores consistency).
        sc.close();
    }
}
复制代码
在最后推广一下我自己的个人博客 zicesun.com,欢迎留言!!!